In [None]:
import pandas as pd
from pathlib import Path
import sys
import plotly.express as px
import pyranges as pr

In [None]:
# Input locations, given relative to the notebook's working directory.
# GFF3 annotation, parsed with pyranges by CountDataSet.process_gff and again further down.
annotation_file = "../../../scratch/03_23_transcriptomics/ref/ASF_Salmonella.gff"
# Per-sample metadata table, read with pandas in the next cell.
# NOTE(review): "metatdata" looks like a typo, but it has to match the file name on disk - confirm before renaming.
metadata_file = "../../../scratch/03_23_transcriptomics/rnaseq_metatdata.csv"

In [None]:
# Sample metadata. It is merged onto the count summaries by `sample_id` further down, where its
# `Mouse` and `Treatment` columns are used to split and colour the plots.
sds = pd.read_csv(metadata_file)

In [None]:
class CountDataSet:
    """Base container pairing per-sample read counts with GFF annotation.

    Subclasses override ``load_count_files`` for one quantification tool and
    store the combined table in ``count_data``.

    Class attributes:
        genome_map: locus-tag prefix (the text before the first ``_``) -> genome label.
        annotation_columns: columns kept from the parsed GFF table.
    """
    genome_map = dict(
        C825='ASF519',
        C824='ASF502',
        SL1344='SL1344',
        K401='DSM755',
        A4V08='YL32',
        A4U99='YL31',
        A4V09='YL58',
    )
    annotation_columns = [
        'Chromosome', 'Feature', 'Start', 'End', 'Strand',
        'ID', 'Name', 'locus_tag', 'product',
    ]

    def __init__(self, data_dir, gff_file, feature='gene'):
        """Remember the inputs and parse the annotation straight away.

        Args:
            data_dir: directory that holds the count files (wrapped in ``Path``).
            gff_file: GFF3 annotation passed to ``pyranges.read_gff3``.
            feature: value of the GFF ``Feature`` column to keep.
        """
        self.gff_file = gff_file
        self.feature = feature
        self.data_dir = Path(data_dir)
        # Empty until a subclass' load_count_files() fills it.
        self.count_data = pd.DataFrame()
        # Needs gff_file and feature, so it has to come last.
        self.annot = self.process_gff()

    def load_count_files(self):
        """Placeholder that does nothing; subclasses populate ``count_data`` here."""
        return None

    def process_gff(self):
        """Return the annotation rows whose ``Feature`` equals ``self.feature``.

        Only the columns listed in ``annotation_columns`` are kept.
        """
        annotation = pr.read_gff3(self.gff_file).as_df()
        annotation = annotation[self.annotation_columns]
        wanted = annotation['Feature'] == self.feature
        return annotation[wanted]
        
    
class HtseqCounts(CountDataSet):
    """Loader for two-column count tables (``*.txt``) found anywhere under ``data_dir``."""
    count_col = 'count'

    def load_count_files(self):
        """Read every ``*.txt`` file, label rows by sample and genome, and attach annotation.

        The sample id is the file name up to its first ``.``. The result is stored
        in ``self.count_data`` with the count column renamed to ``read_counts`` and
        the feature name renamed to ``locus_tag``.
        """
        per_sample = []
        for path in self.data_dir.rglob('*.txt'):
            sample = path.stem.split(".")[0]
            table = pd.read_table(path, names=['Name', 'count'], header=None)
            table = table.assign(sample_id=sample)
            # Keep the text after "gene-"; rows without that marker become NaN and are dropped.
            table['Name'] = table.Name.str.split("gene-", expand=True)[1]
            table = table.dropna(subset=['Name'])
            # Prefixes missing from genome_map are attributed to SL1344.
            table['genome'] = [self.genome_map.get(tag.split("_")[0], 'SL1344')
                               for tag in table.Name.values]
            per_sample.append(table)

        combined = pd.concat(per_sample).rename(
            {self.count_col: 'read_counts', 'Name': 'locus_tag'}, axis=1)
        # Left join keeps every counted feature even when it has no annotation row.
        self.count_data = combined.merge(self.annot, on='locus_tag', how='left')

        
class SalmonCounts(CountDataSet):
    """Loader for Salmon ``quant.sf`` tables found anywhere under ``data_dir``."""
    count_col = 'NumReads'

    def load_count_files(self):
        """Read every ``quant.sf``, derive ``locus_tag``/``genome`` and attach annotation.

        The sample id is the parent directory name up to ``_quant``. The original
        ``Name`` column is kept as ``full_name``; ``NumReads`` becomes ``read_counts``.
        """
        per_sample = []
        for path in self.data_dir.rglob('quant.sf'):
            sample = path.parent.stem.split("_quant")[0]
            table = pd.read_table(path).assign(sample_id=sample)
            # locus tag = text after "locus_tag=", cut at the first "::" and then the first ";"
            after_key = table.Name.str.split('locus_tag=', expand=True)[1]
            before_colons = after_key.str.split("::", expand=True)[0]
            table['locus_tag'] = before_colons.str.split(";", expand=True)[0]
            table = table.rename({"Name": 'full_name'}, axis=1)
            # NOTE(review): unlike HtseqCounts/FeatureCounts, prefixes missing from genome_map
            # are left unchanged here rather than defaulting to 'SL1344' - confirm this is intended.
            prefix = table.locus_tag.str.split("_", expand=True)[0]
            table['genome'] = prefix.replace(self.genome_map)
            per_sample.append(table)

        combined = pd.concat(per_sample).rename({self.count_col: 'read_counts'}, axis=1)
        self.count_data = combined.merge(self.annot, on='locus_tag', how='left')
        
        
class FeatureCounts(CountDataSet):
    """Loader for ``*.count.txt`` tables and their ``*.count.txt.summary`` companions."""
    count_col = None

    def load_count_files(self):
        """Read every ``*.count.txt`` under ``data_dir`` and attach annotation.

        Lines starting with ``#`` are skipped and the columns are renamed by position,
        so each file must contain exactly seven columns. The sample id is the file
        name up to ``.count``; it is printed as each file is read.
        """
        column_names = ['locus_tag', 'chr', 'start', 'end', 'strand', 'length',
                        'read_counts', 'sample_id']
        kept = ['locus_tag', 'read_counts', 'sample_id', 'genome']
        per_sample = []
        for path in self.data_dir.rglob("*.count.txt"):
            sample = path.stem.split(".count")[0]
            print(sample)
            table = pd.read_table(path, comment='#').assign(sample_id=sample)
            table.columns = column_names
            table = table.dropna(subset=['locus_tag'])
            # Prefixes missing from genome_map are attributed to SL1344.
            table['genome'] = [self.genome_map.get(tag.split("_")[0], 'SL1344')
                               for tag in table.locus_tag.values]
            per_sample.append(table)

        combined = pd.concat(per_sample)[kept]
        self.count_data = combined.merge(self.annot, on='locus_tag', how='left')

    @property
    def summary_df(self):
        """Per-sample totals and percentages built from the ``*.count.txt.summary`` files.

        Returns a frame with ``sample_id``, ``total`` (sum over all status rows),
        ``assigned``, ``unmapped``, ``no_feature`` and the matching ``percent_*``
        columns. The files are re-read on every access.
        """
        status_tables = []
        for path in self.data_dir.rglob("*.count.txt.summary"):
            table = pd.read_table(path)
            # Sample id = base name of the second column header, up to its first "."
            sample = table.columns[1].split("/")[-1].split('.')[0]
            table = table.assign(sample_id=sample)
            table.columns = ['status', 'read_counts', 'sample_id']
            status_tables.append(table)
        fdf = pd.concat(status_tables)

        summary = fdf.groupby('sample_id').read_counts.sum().reset_index()
        summary.columns = ['sample_id', 'total']

        categories = (('Assigned', 'assigned'),
                      ('Unassigned_Unmapped', 'unmapped'),
                      ('Unassigned_NoFeatures', 'no_feature'))
        for status, label in categories:
            rows = fdf.loc[fdf.status == status, ['read_counts', 'sample_id']]
            summary = summary.merge(rows, on='sample_id').rename({'read_counts': label}, axis=1)
        for _, label in categories:
            summary['percent_' + label] = summary[label] / summary['total'] * 100
        return summary
               

In [None]:
# Load the HTSeq count tables.
htseq_dir = "../../../scratch/03_23_transcriptomics/Sal-asf-all-gene-a0_htseqcount"
htseq = HtseqCounts(htseq_dir, annotation_file)
htseq.load_count_files()

# Load the Salmon quantifications.
salmon_dir = "../../../scratch/03_23_transcriptomics/salmon/"
salmon = SalmonCounts(salmon_dir, annotation_file)
salmon.load_count_files()


# Load FeatureCounts Data

In [None]:
# featureCounts output directory; load_count_files() picks up every *.count.txt beneath it
# and prints each sample name as it reads the file.
feat_dir = "../../../scratch/03_23_transcriptomics/gene_bowtie_featurecounts/"
feat = FeatureCounts(feat_dir, annotation_file)
feat.load_count_files()

In [None]:
# Per-sample assignment summary (totals and percentages). summary_df is a property, so the
# *.count.txt.summary files are re-read each time this is evaluated.
feat.summary_df

In [None]:
# Persist the annotated featureCounts table. The DataFrame index is written as the first
# column because to_csv keeps the index by default.
feat.count_data.to_csv("../../../scratch/03_23_transcriptomics/featureCounts.csv")

In [None]:
# featureCounts reads summed per (sample, genome); the value column is named after the method.
feat_summary = (
    feat.count_data
    .groupby(['sample_id', 'genome'])['read_counts']
    .sum()
    .reset_index()
    .rename(columns={'read_counts': 'featCounts'})
)

In [None]:
# HTSeq reads summed per (sample, genome); the value column is named after the method.
htseq_summary = (
    htseq.count_data
    .groupby(['sample_id', 'genome'])['read_counts']
    .sum()
    .reset_index()
    .rename(columns={'read_counts': 'htseq'})
)

In [None]:
# Salmon reads summed per (sample, genome); the value column is named after the method.
salmon_summary = (
    salmon.count_data
    .groupby(['sample_id', 'genome'])['read_counts']
    .sum()
    .reset_index()
    .rename(columns={'read_counts': 'salmon'})
)

In [None]:
# Wide comparison table: one row per (genome, sample_id) with one column per method.
# Inner joins keep only the pairs that all three methods report.
df = htseq_summary.merge(salmon_summary, on=['genome', 'sample_id'], how='inner')
df = df.merge(feat_summary, on=['genome', 'sample_id'], how='inner')

In [None]:
# Fixed: this line read `df = #df.melt(...)`, i.e. an assignment with nothing on its right-hand
# side, which is a SyntaxError. `df` is left in wide format because the cells below read the
# `htseq`, `salmon` and `featCounts` columns directly. The long-format reshape is kept for reference:
# df_long = df.melt(id_vars=['sample_id', 'genome'], value_name='count', var_name='method')

In [None]:
# Peek at the wide per-(sample, genome) table: one read-count column per method.
df.head()

In [None]:
# Salmon vs featureCounts reads per (sample, genome), on log-log axes.
px.scatter(
    df,
    x='salmon',
    y='featCounts',
    color='genome',
    hover_data=['genome', 'sample_id'],
    log_x=True,
    log_y=True,
)

In [None]:
# HTSeq vs featureCounts reads per (sample, genome), on log-log axes.
px.scatter(
    df,
    x='htseq',
    y='featCounts',
    color='genome',
    hover_data=['genome', 'sample_id'],
    log_x=True,
    log_y=True,
)

In [None]:
# HTSeq vs Salmon reads per (sample, genome), on log-log axes.
px.scatter(
    df,
    x='htseq',
    y='salmon',
    color='genome',
    hover_data=['genome', 'sample_id'],
    log_x=True,
    log_y=True,
)

In [None]:
# Per-sample read totals for each method, summed over all genomes.
by_sample = df.groupby('sample_id')[['htseq', 'salmon', 'featCounts']].sum().reset_index()
by_sample.columns = ['sample_id', 'htseq_total', 'salmon_total', 'featCounts_total']

In [None]:
# Reads per (genome, sample) for each method.
genome_counts = (
    df.groupby(['genome', 'sample_id'])[['htseq', 'salmon', 'featCounts']]
    .sum()
    .reset_index()
)
genome_counts.columns = ['genome', 'sample_id', 'htseq', 'salmon', 'featCounts']

In [None]:
# Share of each sample's reads that every genome receives, per method (in percent).
fdf = genome_counts.merge(by_sample, on='sample_id').assign(
    htseq_perc=lambda t: t['htseq'] / t['htseq_total'] * 100,
    salmon_perc=lambda t: t['salmon'] / t['salmon_total'] * 100,
    featCounts_perc=lambda t: t['featCounts'] / t['featCounts_total'] * 100,
)

In [None]:
# Rows for the SL1344 genome only.
fdf.loc[fdf['genome'] == 'SL1344']

In [None]:
# Attach the sample metadata (inner join on sample_id, so samples absent from `sds` are dropped).
# Re-running this cell merges onto the already-merged frame and produces _x/_y suffixed duplicates.
fdf = fdf.merge(sds, on='sample_id')

In [None]:
# One subset per value of the `Mouse` metadata column.
exp1 = fdf.loc[fdf['Mouse'] == 'invitro']
exp2 = fdf.loc[fdf['Mouse'] == 'Oligo']
exp3 = fdf.loc[fdf['Mouse'] == 'LCM']

In [None]:
# Peek at the 'invitro' subset.
exp1.head()

In [None]:
# Restore `lsu_df` from IPython's %store database. It must have been saved beforehand with
# `%store lsu_df`; it is not created anywhere in this notebook as shown.
%store -r lsu_df

In [None]:
# Add the columns of `lsu_df` (inner join on sample_id). exp1/exp2/exp3 were sliced before this
# merge and therefore do not carry them. Re-running the cell merges a second time.
# NOTE(review): `%mapped`, plotted below, presumably comes from lsu_df - confirm.
fdf = fdf.merge(lsu_df, on='sample_id')

In [None]:
# Show the fully merged per-(genome, sample) table.
fdf

In [None]:
# One point per sample: %mapped against the featureCounts read total.
px.scatter(
    fdf[['sample_id', 'Treatment', '%mapped', 'featCounts_total']].drop_duplicates(),
    x='%mapped',
    y='featCounts_total',
    hover_data=['sample_id'],
)

In [None]:
# Re-read the annotation without any feature or column filtering, for ad-hoc inspection.
gff = pr.read_gff3(annotation_file).as_df()

In [None]:
# Annotated features whose product description mentions "ribosomal protein".
test = gff.dropna(subset=['product'])
test.loc[test['product'].str.contains('ribosomal protein'), ['Chromosome', 'Name', 'product']]

In [None]:
# First rows of the Name/product columns of the raw annotation.
gff[['Name', 'product']].head()

In [None]:
# Distribution of %mapped per treatment, one value per sample.
px.box(
    fdf[['sample_id', 'Treatment', '%mapped', 'featCounts_total']].drop_duplicates(),
    x='Treatment',
    y='%mapped',
    color='Treatment',
)

In [None]:
# featureCounts read totals per sample for the 'invitro' subset, ordered by total.
px.bar(
    exp1[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']]
    .sort_values("featCounts_total")
    .drop_duplicates(),
    x='sample_id',
    y='featCounts_total',
    color='Treatment',
    facet_row='Mouse',
    height=1200,
)

In [None]:
# featureCounts read totals per sample for the 'Oligo' subset, ordered by total.
px.bar(
    exp2[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']]
    .sort_values("featCounts_total")
    .drop_duplicates(),
    x='sample_id',
    y='featCounts_total',
    color='Treatment',
    facet_row='Mouse',
)

In [None]:
# featureCounts read totals per sample for the 'LCM' subset, ordered by total.
px.bar(
    exp3[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']]
    .sort_values("featCounts_total")
    .drop_duplicates(),
    x='sample_id',
    y='featCounts_total',
    color='Treatment',
    facet_row='Mouse',
)

In [None]:
# Reads per genome, one facet per 'invitro' sample (log scale).
px.bar(
    exp1,
    x='genome',
    y='featCounts',
    color='genome',
    facet_col='sample_id',
    facet_col_wrap=4,
    log_y=True,
)

In [None]:
# Median featureCounts reads per genome and treatment in the 'Oligo' subset.
exp2.groupby(['genome', 'Treatment'])['featCounts'].median()

In [None]:
# Reads per genome, one facet per 'Oligo' sample (log scale).
px.bar(
    exp2,
    x='genome',
    y='featCounts',
    color='genome',
    facet_col='sample_id',
    facet_col_wrap=3,
    log_y=True,
    height=1000,
    width=800,
)

In [None]:
# SL1344 read counts per treatment in the 'LCM' subset.
px.box(
    exp3[exp3.genome == 'SL1344'].sort_values("Treatment"),
    x='Treatment',
    y='featCounts',
    color='genome',
    facet_col='genome',
    facet_col_wrap=1,
    log_y=False,
    points='all',
    height=600,
    width=600,
)

In [None]:
# SL1344 share of each sample's reads (percent) per treatment in the 'LCM' subset.
px.box(
    exp3[exp3.genome == 'SL1344'].sort_values("Treatment"),
    x='Treatment',
    y='featCounts_perc',
    color='genome',
    facet_col='genome',
    facet_col_wrap=1,
    log_y=False,
    points='all',
    height=600,
    width=600,
)

In [None]:
# YL58 share of each sample's reads (percent) per treatment in the 'LCM' subset.
px.box(
    exp3[exp3.genome == 'YL58'].sort_values("Treatment"),
    x='Treatment',
    y='featCounts_perc',
    color='genome',
    facet_col='genome',
    facet_col_wrap=1,
    log_y=False,
    points='all',
    height=600,
    width=600,
)

In [None]:
# YL32 read counts per treatment in the 'LCM' subset.
px.box(
    exp3[exp3.genome == 'YL32'].sort_values("Treatment"),
    x='Treatment',
    y='featCounts',
    color='genome',
    facet_col='genome',
    facet_col_wrap=1,
    log_y=False,
    points='all',
    height=600,
    width=600,
)

In [None]:
# YL58 read counts per treatment in the 'LCM' subset.
px.box(
    exp3[exp3.genome == 'YL58'].sort_values("Treatment"),
    x='Treatment',
    y='featCounts',
    color='genome',
    facet_col='genome',
    facet_col_wrap=1,
    log_y=False,
    points='all',
    height=600,
    width=600,
)

In [None]:
# SL1344 reads per sample for each method, displayed in thousands of reads.
sal_counts = (
    df.loc[df['genome'] == 'SL1344']
    .groupby(['sample_id'])
    .agg({'htseq': ['sum'], 'salmon': ['sum'], 'featCounts': ['sum']})
)

sal_counts / 1000

In [None]:
# First rows of the annotated featureCounts table.
feat.count_data.head()

In [None]:
# Same table restricted to rows that have a `product` value, i.e. where the annotation join
# supplied a product description.
feat.count_data.dropna(subset=['product']).head()

In [None]:
class MetaT:
    """Collect, per quantification method, the paths under a root directory that match a run prefix.

    Attributes:
        root_dir: ``Path`` of the directory that is searched recursively.
        htseq_dir / salmon_dir / bowtie_dir: sorted list of paths matching
            ``{prefix}*htseqcount`` / ``{prefix}*salmon`` / ``{prefix}*bowtie``,
            or ``None`` when that method was not requested.
    """

    def __init__(self, data_dir, prefix, methods = ['htseq', 'salmon', 'bowtie']):
        """
        Args:
            data_dir: directory to search (anything ``Path`` accepts).
            prefix: leading part of the names to match.
            methods: which of 'htseq', 'salmon', 'bowtie' to look for. The default
                list is only read, never mutated.
        """
        self.root_dir = Path(data_dir)
        # Fixed: the searches used to go through `self.data_dir`, which is never assigned,
        # so constructing the object raised AttributeError. They now use `self.root_dir`.
        self.htseq_dir = self._find(prefix, 'htseqcount') if 'htseq' in methods else None
        self.salmon_dir = self._find(prefix, 'salmon') if 'salmon' in methods else None
        self.bowtie_dir = self._find(prefix, 'bowtie') if 'bowtie' in methods else None

    def _find(self, prefix, suffix):
        """Return the sorted paths below ``root_dir`` named ``{prefix}*{suffix}``."""
        # rglob yields a single-use generator; a list can be inspected repeatedly.
        return sorted(self.root_dir.rglob(f'{prefix}*{suffix}'))
        
            
    

In [None]:
# htseqcounts a0
htseq_dir = "../../../scratch/03_23_transcriptomics/Sal-asf-all-gene-a0_htseqcount"
# Fixed: the assignment had no right-hand side (SyntaxError). The glob matches the one used by
# HtseqCounts.load_count_files.
htseq_files = list(Path(htseq_dir).rglob('*.txt'))

# The following cells query `htseq_df`, which was never built. It is rebuilt here the way
# HtseqCounts.load_count_files parses each file, but without the rename/annotation merge,
# so the raw `count` column those cells expect is still present.
# NOTE(review): reconstructed from the class - confirm it matches the table originally intended.
htseq_tables = []
for htseq_file in htseq_files:
    htseq_table = (pd.read_table(htseq_file, names=['Name', 'count'], header=None)
                   .assign(sample_id=htseq_file.stem.split(".")[0]))
    # Keep the text after "gene-"; rows without that marker become NaN and are dropped.
    htseq_table['Name'] = htseq_table.Name.str.split("gene-", expand=True)[1]
    htseq_table = htseq_table.dropna(subset=['Name'])
    htseq_table['genome'] = [CountDataSet.genome_map.get(tag.split("_")[0], 'SL1344')
                             for tag in htseq_table.Name.values]
    htseq_tables.append(htseq_table)
htseq_df = pd.concat(htseq_tables)

In [None]:
# Genome labels present in the htseq table.
htseq_df.genome.unique()

In [None]:
# Total htseq counts per sample.
htseq_df.groupby(['sample_id'])['count'].sum()

In [None]:
# Total htseq counts per sample and genome.
htseq_df.groupby(['sample_id', 'genome'])['count'].sum()

In [None]:
# Every quant.sf file under salmon_dir, searched recursively.
salmon_files = list(Path(salmon_dir).rglob('quant.sf'))

In [None]:
# First quant.sf path, for inspecting a single file by hand. The loop in the next cell rebinds `f`.
f = salmon_files[0]

In [None]:
# Locus-tag prefix -> genome label. Previously this dict was rebuilt on every loop iteration
# and duplicated CountDataSet.genome_map; it is now defined once, from that single source.
strains = dict(CountDataSet.genome_map)

# Raw Salmon table for all samples, without the annotation merge done by SalmonCounts.
# NOTE: this cell rebinds `df`, `f` and `fdf`, replacing the tables built earlier under those names.
df_list = []
for f in salmon_files:
    # Sample id = name of the directory holding quant.sf, up to "_quant".
    name = f.parent.stem.split("_quant")[0]
    print(name)
    df = pd.read_table(f).assign(sample_id=name)
    # Locus tag = text between "locus_tag=" and the next ";".
    df['locus_tag'] = df.Name.str.split('locus_tag=', expand=True)[1].str.split(";", expand=True)[0]
    # Prefixes that are not in `strains` are kept unchanged.
    df['genome'] = df.locus_tag.str.split("_", expand=True)[0].replace(strains)
    df_list.append(df)
fdf = pd.concat(df_list)

In [None]:
# Salmon NumReads summed per (sample, genome).
summed_reads = fdf.groupby(['sample_id', 'genome'])['NumReads'].sum().reset_index()

In [None]:
# SL1344 rows only: reads assigned to SL1344 in each sample.
f1 = summed_reads.loc[summed_reads['genome'] == 'SL1344']

In [None]:
# Total Salmon NumReads per sample, over all genomes.
f2 = fdf.groupby('sample_id')['NumReads'].sum().reset_index()

In [None]:
# Per-genome read sums for the single sample AU657.
summed_reads.loc[summed_reads['sample_id'] == 'AU657']

In [None]:
# Put the SL1344 reads (f1) next to the all-genome totals (f2) for each sample; the clashing
# NumReads columns become NumReads_sl1344 and NumReads_total.
f3 = f1.merge(f2, on='sample_id', suffixes=['_sl1344', '_total'])

In [None]:
# Percentage of each sample's Salmon reads that were summed under SL1344.
f3['%sal'] = f3['NumReads_sl1344']/f3['NumReads_total']*100

In [None]:
# Median SL1344 percentage among the samples that are below 95 %.
f3.loc[f3['%sal'] < 95, '%sal'].median()

In [None]:
# Show the per-sample SL1344 reads, totals and percentage.
f3

In [None]:
# Independent copy of the SL1344 rows, so columns can be added without touching fdf.
sal = fdf.loc[fdf['genome'] == 'SL1344'].copy()

In [None]:
# Rescale the SL1344 NumReads so that every sample sums to one million.
# 1) Pivot to a Name x sample_id matrix of NumReads.
inter = sal[['Name', 'sample_id', 'NumReads']].pivot(index='Name', columns='sample_id')
# 2) Divide each sample column by its own sum and multiply by 1e6. Only NumReads is used here,
#    so despite the name "new_tpms" no transcript length enters the calculation.
inter = inter/inter.sum()*1000000
# 3) The pivot produced two-level column labels (value name, sample_id); keep the sample_id level.
inter.columns = [c[1] for c in inter.columns]
# 4) Back to long format and join onto `sal`. Re-running this cell adds the column a second time.
inter = inter.reset_index().melt(id_vars='Name', var_name = 'sample_id', value_name='new_tpms')
sal = sal.merge(inter, on=['Name', 'sample_id'])

In [None]:
# Show the SL1344 table including the rescaled `new_tpms` column.
sal

In [None]:
# Per-sample sums of all numeric columns. As a sanity check, `new_tpms` is expected to add up
# to about 1e6 for every sample, given how it was scaled.
sal.groupby('sample_id').sum(numeric_only=True)

In [None]:
import numpy as np

In [None]:
# 95th percentile of the rescaled values, pooled over all samples.
np.quantile(sal.new_tpms, 0.95)

In [None]:
# Rows whose rescaled value exceeds 900.
sal.loc[sal['new_tpms'] > 900]

In [None]:
# The same 95th percentile, computed separately for each sample. np.quantile is applied to each
# group, so a NaN inside a group gives NaN for that sample.
sal.groupby('sample_id').new_tpms.apply(np.quantile, q=0.95)