# Transcript abundance statistics

Import required modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

Set matplotlib settings

In [None]:
# Global matplotlib defaults for every figure in this notebook:
# 10x10 figure size, automatic layout adjustment and a base font size of 15.
plt.rcParams.update({
    "figure.figsize": [10, 10],
    "figure.autolayout": True,
    'font.size': 15,
})

#### [Subread featureCounts](http://subread.sourceforge.net/) data preparation

Populate dictionary

In [None]:
# Parse the subread count table into a dict keyed by the first column.
# Each value is [chr, start, end, strand, length] (kept as strings) followed by
# one integer count per remaining column.
subreads_counts = {}
with open(snakemake.input.subread_count, 'r') as handle:
    # The first two lines are header lines and carry no count data.
    for _ in range(2):
        next(handle)
    for record in handle:
        fields = record.split()
        sample_counts = list(map(int, fields[6:]))
        annotation = [fields[idx] for idx in range(1, 6)]
        subreads_counts[fields[0]] = annotation + sample_counts

Dictionary to dataframe

In [None]:
# Column layout mirrors the per-feature lists: five annotation fields, then one
# column per dataset name.
cols = ['chr', 'start', 'end', 'strand', 'length']
cols = cols + snakemake.params.datasetnames
subreads_df = pd.DataFrame.from_dict(
    data=subreads_counts,
    orient='index',
    columns=cols,
)

Take a look at the dataframe

In [None]:
# Bare expression as the last statement of the cell: a notebook renders the
# dataframe as the cell output. It has no effect when run as a plain script.
subreads_df

#### Plots subread

Get total counts per sample

In [None]:
# Sum each dataset's column of the subread dataframe; order follows datasetnames.
totals = [
    subreads_df[sample_name].sum()
    for sample_name in snakemake.params.datasetnames
]

Plot total counts per sample

In [None]:
# One bar per dataset showing its summed count; the figure is written to the
# 'total' output path.
plt.bar(x=snakemake.params.datasetnames, height=totals)
plt.title('Total count per sample')
plt.xlabel('sample number')
plt.ylabel('Number of transcripts')
# Rotate the tick labels so long dataset names do not overlap.
plt.xticks(rotation=30, ha='right')
plt.savefig(snakemake.output.total, dpi=200)

Get count per annotated gene

In [None]:
# Total raw count per feature, summed over all samples.
#
# Every entry of subreads_counts is laid out as
# [chr, start, end, strand, length, <one count per sample>], so the counts start
# after the five annotation fields regardless of how many samples there are.
# Slicing at the number of samples is only correct when there are exactly five:
# with fewer samples annotation strings end up in the sum, with more samples
# the first counts are dropped.
annotation_fields = 5
count_per_gene = {
    gene_name: sum(values[annotation_fields:])
    for gene_name, values in subreads_counts.items()
}

Plot count per annotated gene

In [None]:
# Split the per-feature totals into parallel sequences of names and counts.
labels, data = zip(*count_per_gene.items())
# Use a much wider default figure size so the per-feature bars fit side by side;
# this must be set before the bars are drawn.
plt.rcParams.update({"figure.figsize": [50, 10]})
plt.bar(x=labels, height=data)
plt.title('Raw counts per feature')
plt.xlabel('feature')
plt.ylabel('Number of raw counts')
plt.xticks(rotation=30, ha='right')
plt.savefig(snakemake.output.per_feature, dpi=200)

#### Pipeline counts data preparation

Function for sorting oxford counts

In [None]:
def sort_ox(sort_order, counts):
    """Reorder the oxford counts to follow ``sort_order``.

    The sample names are taken from the header line of the oxford count file
    (comma separated, first field skipped) and paired positionally with
    ``counts``. The pairs are then sorted by the position of their sample
    name in ``sort_order``.

    Args:
        sort_order: Sequence of sample names in the desired order.
        counts: Iterable of counts, in the column order of the oxford file.

    Returns:
        A list holding the counts rearranged into ``sort_order``.

    Raises:
        ValueError: If a sample name from the header is not in ``sort_order``.
    """
    with open(snakemake.input.oxford_count, 'r') as handle:
        header_fields = handle.readline().strip().split(',')
    sample_names = header_fields[1:]
    ranked = sorted(
        zip(sample_names, counts),
        key=lambda pair: sort_order.index(pair[0]),
    )
    return [value for _, value in ranked]

Read in count data and populate dictionary

In [None]:
# Per-sample total counts for each pipeline, keyed by pipeline name.
count_dict = {}

n_samples = len(snakemake.params.datasetnames)


def _read_sample_counts(path, first_col, delimiter, skip_header=1):
    """Load the per-sample count columns of a count table as a 2-D array.

    Args:
        path: Path of the delimited count file.
        first_col: Zero-based index of the first sample column; the following
            ``n_samples`` columns are read.
        delimiter: Field delimiter of the file.
        skip_header: Number of leading lines to skip.

    Returns:
        Array of shape (rows, n_samples).
    """
    table = np.genfromtxt(
        path,
        delimiter=delimiter,
        skip_header=skip_header,
        usecols=range(first_col, first_col + n_samples),
    )
    # genfromtxt squeezes single-row and single-column results to 1-D (or 0-D),
    # which would make a later sum(axis=0) add up across samples or return a
    # scalar. Force the (rows, samples) layout.
    return table.reshape(-1, n_samples)


# talon: sample columns start at column index 11, one header line.
talon_samples = _read_sample_counts(snakemake.input.talon_count, 11, '\t')
count_dict['talon'] = talon_samples.sum(axis=0)

# flair: sample columns start right after the first column, one header line.
flair_samples = _read_sample_counts(snakemake.input.flair_count, 1, '\t')
count_dict['flair'] = flair_samples.sum(axis=0)

# oxford: comma separated; its columns are reordered to match datasetnames.
oxford_samples = _read_sample_counts(snakemake.input.oxford_count, 1, ',')
counts = oxford_samples.sum(axis=0)
sorted_counts = sort_ox(snakemake.params.datasetnames, counts)
count_dict['oxford'] = sorted_counts

# subread: two header lines, sample columns start at column index 6.
subread_samples = _read_sample_counts(
    snakemake.input.subread_count, 6, '\t', skip_header=2
)
count_dict['subread'] = subread_samples.sum(axis=0)

Dictionary to dataframe

In [None]:
# Map the positional row index (0, 1, ...) to the dataset names so that the rows
# of the pipeline-count table are labelled by sample.
rename = dict(enumerate(snakemake.params.datasetnames))
count = pd.DataFrame.from_dict(count_dict).rename(index=rename)

Take a look at the dataframe

In [None]:
# Bare expression as the last statement of the cell: a notebook renders the
# dataframe as the cell output. It has no effect when run as a plain script.
count

#### Plots pipeline counts + subread

Plot total number of counts per sample per pipeline

In [None]:
# Grouped bar chart: one group per sample, one bar per pipeline.
count.plot.bar(figsize=(10, 5))
# Log scale keeps pipelines with very different totals readable in one plot.
plt.yscale('log')
plt.title('Total number of counts per sample per pipeline')
plt.xlabel('sample number')
plt.ylabel('Raw total count')
plt.xticks(rotation=30, ha='right')
plt.savefig(snakemake.output.comp, dpi=200)