# Transcript abundance statistics

selfnote:
input:
annotation counts from rule annotation_count
flair, oxford, talon count files

Import required modules

In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### [Subread featureCounts](http://subread.sourceforge.net/) data preparation

Populate dictionary

In [None]:
# Map the first whitespace-separated field of every data line to the integer
# found in its seventh field (index 6).
subreads_counts = {}
with open(snakemake.input.annocount, 'r') as handle:
    # Skip the first two lines (presumably the program comment and the
    # column header of the count file); only the lines after them are parsed.
    next(handle)
    next(handle)
    for record in handle:
        fields = record.split()
        subreads_counts[fields[0]] = int(fields[6])

Dictionary to dataframe

In [None]:
# One row per key of subreads_counts; the counts end up in a single column.
subreads_df = pd.DataFrame.from_dict(data=subreads_counts, orient='index')

Take a look at the dataframe

In [None]:
# Bare expression: in a notebook this renders the dataframe as the cell output.
subreads_df

#### Pipeline counts data preparation

In [None]:
# Per-sample totals: maps a sample number to a list that receives one value
# per pipeline. A defaultdict lets the following cell append to a sample's
# list without creating the key first.
sample_dict = defaultdict(list)

# Load every count table as a float array; fields that cannot be parsed as
# numbers become NaN. skip_header=1 drops the first line of each file (the
# parameter is a line count, so the integer is spelled out explicitly).
talon_samples = np.genfromtxt(snakemake.input.talon_count, delimiter='\t', skip_header=1)
flair_samples = np.genfromtxt(snakemake.input.flair_count, delimiter='\t', skip_header=1)
oxford_samples = np.genfromtxt(snakemake.input.oxford_count, delimiter=',', skip_header=1)
subreads_samples = np.genfromtxt(snakemake.input.annocount, delimiter='\t', skip_header=1)

Populate dictionary

In [None]:
# For every pipeline, sum each per-sample count column and append the total to
# that sample's list. Samples are numbered from 1, and the append order
# (oxford, flair, talon, subread) fixes the order of the values in each list.
#
# Each entry is (count table, index of the first per-sample column).
count_tables = (
    (oxford_samples, 1),    # presumably column 0 holds the identifier - TODO confirm
    (flair_samples, 1),     # presumably column 0 holds the identifier - TODO confirm
    (talon_samples, 11),    # presumably 11 annotation columns precede the counts - TODO confirm
    # The per-feature parsing of this same file reads its count from field
    # index 6, so the sample columns start at 6; starting at 7 skipped the
    # first sample.
    (subreads_samples, 6),
)
for table, first_sample_col in count_tables:
    sample_columns = range(first_sample_col, table.shape[1])
    for sample_number, col in enumerate(sample_columns, start=1):
        # nansum ignores the NaN cells left by fields that were not numeric.
        sample_dict[sample_number].append(np.nansum(table[:, col]))

In [None]:
# One row per sample number, one column per pipeline; the column order matches
# the order in which the totals were appended to each list in sample_dict.
samples = pd.DataFrame.from_dict(
    sample_dict,
    orient='index',
    columns=['oxford', 'flair', 'talon', 'subread'],
)

#### Plots subread

Get total counts per sample

In [None]:
# One total per column of subreads_df: iterating a dataframe yields its column
# labels, and each column is reduced to the sum of its counts.
# NOTE(review): subreads_df is built from a dict of single counts, so it holds
# one column and this yields a single total - confirm that is the intent.
totals = [subreads_df[column].sum() for column in subreads_df]

In [None]:
# Bar chart of the totals: one bar per entry, numbered from 1 on the x-axis.
# plt.bar needs both the x positions and the bar heights.
sample_numbers = list(range(1, len(totals) + 1))
plt.bar(sample_numbers, totals)
plt.xticks(sample_numbers)  # integer ticks, one per bar
plt.ylabel('Number of transcripts')
plt.xlabel('sample number')
plt.title('Total count per sample')
plt.savefig(snakemake.output.total, dpi=200)

Plot count per annotated gene

In [None]:
# Bar chart with one bar per key of subreads_counts.
# Keys and values are read separately (a dict iterates both in the same
# order), so an empty dictionary gives an empty chart instead of failing while
# unpacking.
labels = list(subreads_counts)
data = list(subreads_counts.values())
# Start a fresh figure so bars drawn by an earlier plot are not carried over
# when the cells run outside an inline notebook backend.
plt.figure()
plt.bar(labels, data)
plt.ylabel('Number of raw counts')
plt.xlabel('feature')
plt.title('Raw counts per feature')
# Rotate the tick labels and right-align them to their ticks.
plt.xticks(rotation=30, ha='right')
plt.savefig(snakemake.output.per_feature, dpi=200)

#### Plots pipeline counts + subread

Plot total number of counts per sample per pipeline

In [None]:
# Grouped bar chart: one group per row of `samples`, one bar per column.
# DataFrame.plot returns the Axes it drew on, which is then configured directly.
ax = samples.plot.bar(figsize=(10, 5))
ax.set_yscale('log')
ax.set_ylabel('Raw total count')
ax.set_xlabel('sample number')
ax.set_title('Total number of counts per sample per pipeline')
plt.savefig(snakemake.output.comp, dpi=200)