# Alignment statistics

Import required modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

#### Data preparation

Read the samtools stats output files and populate a dictionary

In [None]:
# Collect the statistics of all input files: each key is the label found in
# the first tab-separated column, each value is the list of numbers found in
# the second column (one entry per input file, in input order).
stats_dict = defaultdict(list)
for stats_path in snakemake.input.stats:
    # Keep the path and the open handle under separate names so that the
    # loop variable is not rebound by the ``with`` statement.
    with open(stats_path) as handle:
        for raw_line in handle:
            # A blank line holds no key/value pair and would raise an
            # IndexError when the second column is accessed.
            if not raw_line.strip():
                continue
            fields = raw_line.strip('\n').split('\t')
            stats_dict[fields[0]].append(float(fields[1]))

Convert the dictionary to a pandas dataframe and rename the rows

In [None]:
# Map each positional row index (0, 1, ...) to the dataset name at the same
# position, then build the dataframe and relabel its rows in one step.
rename = dict(enumerate(snakemake.params.datasetnames))
stats = pd.DataFrame.from_dict(stats_dict).rename(index=rename)

Take a look at the dataframe

In [None]:
# Bare expression as the last statement of the cell, so that the notebook
# shows the dataframe as the cell output.
stats

#### Plots

Set matplotlib settings

In [None]:
# Set the figure size, automatic layout and font size defaults for all
# following plots in a single call.
plt.rcParams.update({
    "figure.figsize": [10, 5],
    "figure.autolayout": True,
    "font.size": 15,
})

Plot read alignments

In [None]:
# Bar chart of the mapped and unmapped read counts of every sample, drawn on
# a logarithmic y axis and written to the path given by Snakemake.
read_columns = ['reads mapped:', 'reads unmapped:']
ax = stats[read_columns].plot.bar()
ax.set_yscale('log')
ax.set_xlabel('sample names')
ax.set_ylabel('Number of reads')
ax.set_title('Read alignment per sample')
ax.legend(loc='upper left')
# Rotate the sample names by 30 degrees and right-align them.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
plt.savefig(snakemake.output.align, dpi=200)

Plot mismatches

In [None]:
# Bar chart of the mismatch count of every sample, drawn on a logarithmic
# y axis and written to the path given by Snakemake.
ax = stats['mismatches:'].plot.bar()
ax.set_yscale('log')
ax.set_xlabel('sample names')
ax.set_ylabel('Number of bases')
ax.set_title('Mismatches per sample')
# Rotate the sample names by 30 degrees and right-align them.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
plt.savefig(snakemake.output.mismatch, dpi=200)

Plot average read length

In [None]:
# Bar chart of the average read length of every sample (linear y axis),
# written to the path given by Snakemake.
ax = stats['average length:'].plot.bar()
ax.set_xlabel('sample names')
ax.set_ylabel('Number of bases')
ax.set_title('Average read length per sample')
# Rotate the sample names by 30 degrees and right-align them.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
plt.savefig(snakemake.output.readlen, dpi=200)