In [None]:
import pandas as pd
from pathlib import Path
import sys
import plotly.express as px
import pyranges as pr
import yaml
import numpy as np
# Parse the notebook configuration and keep only its 'default' section.
with open("config.yaml", "r") as config_stream:
    config_dict = yaml.safe_load(config_stream)['default']

In [None]:
# Every configured location is resolved relative to the project root.
root = Path(config_dict["root"])
out_dir, sample_data_file, gff_file = (
    root / config_dict[key]
    for key in ("output_dir", "sample_data_file", "gff_file")
)

In [None]:
sds = pd.read_csv(sample_data_file)

In [None]:
sds

# Load the data

In [None]:

class GenomeAnnot:
    """Gene annotation table read from a GFF3 file, plus a genome labeller.

    On construction the GFF3 file is parsed and the rows whose ``Feature``
    equals ``self.feature`` ("gene") are kept in ``self.annot``.
    """

    # Sequence identifier (the GFF ``Chromosome`` value) -> genome label.
    genome_map = {
        'CP015399.2': 'YL32',
        'CP015400.2': 'KB18',
        'CP015401.2': 'I48',
        'CP015402.2': 'YL27',
        'CP015403.2': 'YL45',
        'CP015404.2': 'I46',
        'CP015405.2': 'YL58',
        'CP015406.2': 'YL31',
        'CP015407.2': 'YL2',
        'CP015408.2': 'I49',
        'CP015409.2': 'YL44',
        'CP015410.2': 'KB1',
        'GCF_000364265': 'ASF519',
        'FQ312003.1': 'SL1344',
        'FQ312003.1;FQ312003.1': 'SL1344',
        'HE654725.1': 'SL1344',
        'HE654726.1': 'SL1344',
        'HE654724.1': 'SL1344',
        'contig_15': 'contig_15',
        'contig_21': 'contig_21',
        'contig_26': 'contig_26',
        'contig_46': 'contig_46',
        'AQFU02000001.1': 'ASF 502',
        'AQFU02000002.1': 'ASF 502',
        'AQFU02000003.1': 'ASF 502',
        'CP097573.1': 'ASF500',
        'NZ_CP097810.1': 'ASF356',
        'NZ_AQFR02000001.1': 'ASF360',
        'NZ_AQFR02000002.1': 'ASF360',
        'NZ_AQFR02000003.1': 'ASF360',
        'NZ_CP097561.1': 'ASF361',
        'NZ_AQFT02000001.1': 'ASF492',
        'NZ_AQFT02000002.1': 'ASF492',
        'NZ_AQFT02000003.1': 'ASF492',
        'NZ_AQFT02000004.1': 'ASF492',
        'NZ_AQFT02000005.1': 'ASF492',
        'NZ_AQFT02000006.1': 'ASF492',
        'NZ_AQFV02000001.1': 'ASF519',
        'NZ_AQFV02000002.1': 'ASF519',
        'NZ_AQFV02000003.1': 'ASF519',
        'NZ_AQFV02000004.1': 'ASF519',
        'NZ_AQFV02000005.1': 'ASF519',
        'NZ_AQFV02000006.1': 'ASF519',
        'NZ_CP097562.1': 'ASF457',
    }

    # Columns retained from the parsed GFF3 table.
    annotation_columns = [
        'Chromosome',
        'Feature',
        'Start',
        'End',
        'Strand',
        'ID',
        'Name',
        'locus_tag',
        'gene_biotype',
        'product',
    ]

    def __init__(self, gff_file):
        """Remember the GFF3 path and build the annotation table right away."""
        self.gff_file = gff_file
        self.feature = "gene"
        self.annot = self.process_gff()

    def process_gff(self):
        """Return the ``annotation_columns`` of the GFF3 rows matching ``self.feature``."""
        parsed = pr.read_gff3(self.gff_file).as_df()
        selected = parsed[self.annotation_columns]
        is_wanted_feature = selected['Feature'] == self.feature
        return selected[is_wanted_feature]

    def annotate_df(self, df):
        """Left-join ``df`` with the annotation on ``ID`` and add a ``genome`` column.

        ``genome`` is the ``Chromosome`` value translated through
        ``genome_map``; values absent from the map are left as they are.
        """
        annotated = df.merge(self.annot, how='left', on='ID')
        annotated['genome'] = annotated['Chromosome'].replace(self.genome_map)
        return annotated


class CountDataSet:
    """Base class for loaders that gather per-sample count tables from a directory.

    Subclasses override ``load_count_files`` and store the combined table in
    ``self.count_data``.
    """

    def __init__(self, data_dir, gff_file=None):
        """Set up an empty data set.

        Args:
            data_dir: directory that the subclass loaders search for count files.
            gff_file: optional path of an annotation file. It is only stored on
                the instance; nothing in this class reads it. It is accepted
                because the notebook builds loaders with two positional
                arguments (``SushiCounts(sushi_dir, gff_file)``,
                ``SalmonCounts(salmon_dir, gff_file)``), which previously
                raised ``TypeError``.
        """
        self.data_dir = Path(data_dir)
        self.gff_file = gff_file
        self.count_data = pd.DataFrame()

    def load_count_files(self):
        """Placeholder that does nothing; subclasses provide the actual loading."""
        pass


# HTSeq counts (loader below) are currently not being looked at


class HtseqCounts(CountDataSet):
    """Loader for two-column count text files found below ``data_dir``.

    NOTE(review): ``load_count_files`` reads ``self.genome_map`` and
    ``self.annot``; neither attribute is defined by the ``CountDataSet`` base
    class visible in this notebook. Confirm where they are supposed to come
    from before using this loader.
    """
    # Name given to the count column when the files are read.
    count_col = 'count'

    def load_count_files(self):
        """Read every ``*.txt`` file under ``data_dir`` into ``self.count_data``.

        Each file is read as a header-less table with the columns ``Name`` and
        ``count``; the sample id is the file stem up to its first ".".
        """
        files = list(self.data_dir.rglob('*.txt'))
        df_list = []
        for f in files:
            df = pd.read_table(f, names=['Name', 'count'], header=None).assign(
                sample_id=f.stem.split(".")[0])
            # Keep the text after "gene-"; rows without that marker become
            # missing and are dropped on the next line.
            df['Name'] = df.Name.str.split("gene-", expand=True)[1]
            df = df.dropna(subset=['Name'])
            # Genome is looked up from the part of the name before the first
            # "_", falling back to 'SL1344' when the key is not in the map.
            df['genome'] = [self.genome_map.get(name.split(
                "_")[0], 'SL1344') for name in df.Name.values]
            df_list.append(df)
        # Stack all samples, rename to read_counts / locus_tag and left-join
        # the annotation table on locus_tag.
        self.count_data = (pd.concat(df_list).rename({self.count_col: 'read_counts',
                                                     'Name': 'locus_tag'}, axis=1)
                           .merge(self.annot, on='locus_tag', how='left'))


class SalmonCounts(CountDataSet):
    """Loader for Salmon ``quant.sf`` tables found below ``data_dir``."""
    # Column holding the read counts and column holding the gene description.
    count_col = 'NumReads'
    gene_col = 'Name'

    def load_count_files(self):
        """Stack all ``quant.sf`` tables into ``self.count_data``.

        The sample id is the name of the folder holding the file, cut at
        "_quant". The count column is renamed to ``salmon_read_counts`` and
        the gene column is replaced by an ``ID`` column holding the text
        between "ID=" and the following ";".
        """
        per_sample = []
        for quant_path in list(self.data_dir.rglob('quant.sf')):
            sample = quant_path.parent.stem.split("_quant")[0]
            print(sample)
            table = (pd.read_table(quant_path)
                     .assign(sample_id=sample)
                     .rename({self.count_col: 'salmon_read_counts'}, axis=1))
            after_tag = table[self.gene_col].str.split('ID=', expand=True)[1]
            table['ID'] = after_tag.str.split(";", expand=True)[0]
            per_sample.append(table.drop(columns=[self.gene_col]))
        self.count_data = pd.concat(per_sample)


class FeatureCounts(CountDataSet):
    """Loader for ``*.count.txt`` tables and their ``.summary`` companions."""
    count_col = None

    def load_count_files(self):
        """Stack all ``*.count.txt`` tables into ``self.count_data``.

        Lines starting with "#" are skipped. The columns are renamed by
        position and only ``ID``, ``fc_read_counts`` and ``sample_id`` are
        kept; the sample id is the file stem cut at ".count".
        """
        positional_names = ['ID', 'chr', 'start', 'end',
                            'strand', 'length', 'fc_read_counts', 'sample_id']
        tables = []
        for count_path in list(self.data_dir.rglob("*.count.txt")):
            sample = count_path.stem.split(".count")[0]
            print(sample)
            table = pd.read_table(count_path, comment='#').assign(sample_id=sample)
            table.columns = positional_names
            tables.append(table[['ID', 'fc_read_counts', 'sample_id']])
        self.count_data = pd.concat(tables)

    @property
    def summary_df(self):
        """Per-sample totals built from the ``*.count.txt.summary`` files.

        Returns one row per sample with the summed count over all statuses
        (``total``), the ``Assigned``, ``Unassigned_Unmapped`` and
        ``Unassigned_NoFeatures`` counts, and each of those three as a
        percentage of the total. The files are re-read on every access.
        """
        status_tables = []
        for summary_path in list(self.data_dir.rglob("*.count.txt.summary")):
            raw = pd.read_table(summary_path)
            # The second header field is a path; the sample id is its file
            # name up to the first ".".
            sample = raw.columns[1].split("/")[-1].split('.')[0]
            raw = raw.assign(sample_id=sample)
            raw.columns = ['status', 'read_counts', 'sample_id']
            status_tables.append(raw)
        long_df = pd.concat(status_tables)

        summary = long_df.groupby('sample_id').read_counts.sum().reset_index()
        summary.columns = ['sample_id', 'total']

        status_labels = (('Assigned', 'assigned'),
                         ('Unassigned_Unmapped', 'unmapped'),
                         ('Unassigned_NoFeatures', 'no_feature'))
        for status, label in status_labels:
            status_rows = long_df[long_df.status == status][['read_counts', 'sample_id']]
            summary = (summary.merge(status_rows, on='sample_id')
                       .rename({'read_counts': label}, axis=1))
        for _, label in status_labels:
            summary[f'percent_{label}'] = summary[label] / summary['total'] * 100
        return summary


class SushiCounts(CountDataSet):
    """Loader for count tables whose file name ends in ``ushicounts``."""
    # Column renamed to ``sushi_insertcount`` and column carrying the gene description.
    count_col = "total_insertcount"
    gene_col = "#reference"

    def load_count_files(self):
        """Stack all ``*ushicounts`` tables into ``self.count_data``.

        Only the columns at positions 0, 2, 6, 7 and 8 are read. The sample
        id is the file stem up to its first ".". ``ID`` is taken from the
        first ";"-separated field of ``#reference``, after "ID=".
        """
        collected = []
        for counts_path in list(self.data_dir.rglob("*ushicounts")):
            sample = counts_path.stem.split(".")[0]
            print(sample)
            table = pd.read_table(counts_path, usecols=[0, 2, 6, 7, 8])
            table = table.assign(sample_id=sample).rename(
                columns={self.count_col: "sushi_insertcount"})
            first_field = table['#reference'].str.split(';', expand=True)[0]
            table['ID'] = first_field.str.split('ID=', expand=True)[1]
            collected.append(table.drop(columns=[self.gene_col]))
        self.count_data = pd.concat(collected)

annot = GenomeAnnot(gff_file)

# Load sushi data

In [None]:
sushi_dir = root/config_dict['sushi_dir']
sushi = SushiCounts(sushi_dir, gff_file)
sushi.load_count_files()
su = sushi.count_data
su.head()

# Load FeatureCounts Data

In [None]:
feat_dir = root/config_dict["featurecounts_dir"]
feat = FeatureCounts(feat_dir)
feat.load_count_files()
fc = feat.count_data

In [None]:
fc_annotated = annot.annotate_df(fc)
fc_annotated.head()

## Saving data for DESeq

In [None]:
# Wide count matrix for DESeq: one row per (ID, genome), one column per sample.
fc_for_deseq = (
    fc_annotated[['ID', 'genome', 'sample_id', 'fc_read_counts']]
    .drop_duplicates()
    .pivot(index=['ID', 'genome'], columns='sample_id')
    .reset_index()
)
# The pivot leaves (value, sample_id) column tuples; keep only the sample id.
fc_for_deseq.columns = ['ID', 'genome'] + [col[1] for col in fc_for_deseq.columns[2:]]

In [None]:
fc_for_deseq.to_csv(root/config_dict['deseq_count_file'], index=False)

In [None]:
fc_for_deseq.head()

In [None]:
# Raw featureCounts matrix (genes x samples) restricted to the Oligo samples.
oligo_samples = sds[sds.Mouse == 'Oligo'].sample_id.values
fc_to_save = fc[fc.sample_id.isin(oligo_samples)].pivot(index='ID', columns='sample_id').reset_index()
# The pivot leaves (value, sample_id) column tuples; keep only the sample id.
fc_to_save.columns = ['ID'] + [c[1] for c in fc_to_save.columns[1:]]
fc_to_save.to_csv(root/config_dict['oligo_fc_raw'], index=False)
# Preview goes last: a bare expression is only displayed when it ends the cell,
# so the former mid-cell .head() was silently discarded.
fc_to_save.head()

In [None]:
# Raw featureCounts matrix for the I48 genes, restricted to the Oligo samples.
i48 = fc_annotated[fc_annotated["genome"] == 'I48']
i48 = i48[i48.sample_id.isin(sds[sds.Mouse == 'Oligo'].sample_id.values)]
i48 = i48[['ID', 'fc_read_counts', 'sample_id']].pivot(index='ID', columns='sample_id').reset_index()
# The pivot leaves (value, sample_id) column tuples; keep only the sample id.
i48.columns = ['ID'] + [c[1] for c in i48.columns[1:]]
i48.to_csv(root/config_dict['oligo_i48_fc_raw'], index=False)
# Preview goes last: a bare expression is only displayed when it ends the cell,
# so the former mid-cell .head() was silently discarded.
i48.head()

In [None]:
# YL27 genes in the Oligo samples; only samples whose summed YL27 counts
# exceed 2e6 are written out.
yl27 = (
    fc_annotated[fc_annotated["genome"] == 'YL27']
    .loc[lambda rows: rows.sample_id.isin(sds[sds.Mouse == 'Oligo'].sample_id.values)]
    [['ID', 'fc_read_counts', 'sample_id']]
    .pivot(index='ID', columns='sample_id')
)
# The pivot leaves (value, sample_id) column tuples; keep only the sample id.
yl27.columns = [col[1] for col in yl27.columns]
enough_reads = yl27.sum() > 2e6
yl27 = yl27.loc[:, enough_reads].reset_index()
yl27.to_csv(root/config_dict['oligo_yl27_fc_raw'], index=False)

# Load salmon data

In [None]:
salmon_dir = root/config_dict['salmon_dir']
salmon = SalmonCounts(salmon_dir, gff_file)
salmon.load_count_files()
sc = salmon.count_data

In [None]:
sc.shape

# Merge count data

- Look at how Salmon Effective Length corresponds to horizontal coverage from sushicounter
    - No easily identifiable relationship

In [None]:
count_df = sc.merge(fc, on=['sample_id', 'ID']).merge(
    su, on=['sample_id', 'ID'])

In [None]:
count_df.head()

In [None]:
count_df = annot.annotate_df(count_df)

In [None]:
# Fraction of each gene's length that Salmon treats as effective length.
count_df['sal_cov'] = count_df.EffectiveLength/count_df.Length
# Only genes seen by both sushicounter and Salmon are plotted.
detected = count_df[(count_df.sushi_insertcount > 0) & (count_df.salmon_read_counts > 0)]
# DataFrame.sample raises when asked for more rows than exist (replace=False),
# so cap the request; the fixed seed makes the figure reproducible on re-run.
px.scatter(detected.sample(min(5000, len(detected)), random_state=0),
           x='sal_cov', y='horizontal_coverage', width=600, height=600)

In [None]:
count_df.head()

In [None]:
# Rows with one of these biotypes are left out of the per-genome sums.
non_coding_biotypes = ['tRNA', 'RNase_P_RNA', 'ncRNA', 'rRNA', 'tmRNA',
                       'SRP_RNA', 'misc_RNA']
coding_df = count_df[~count_df.gene_biotype.isin(non_coding_biotypes)]

# Same aggregation for both groupings: sum the counts of the three tools.
sum_spec = {'salmon_read_counts': ['sum'], 'fc_read_counts': ['sum'], 'sushi_insertcount': ['sum']}

by_genome = coding_df.groupby(['sample_id', 'genome']).agg(sum_spec).reset_index()
by_genome.columns = ['sample_id', 'genome', 'sal_read_counts_sum',
                     'fc_read_counts_sum', 'sushi_read_counts_sum']

by_sample = coding_df.groupby(['sample_id']).agg(sum_spec).reset_index()
by_sample.columns = ['sample_id',  'sal_read_counts_sample_sum',
                     'fc_read_counts_sample_sum', 'sushi_read_counts_sample_sum']

by_genome = by_genome.merge(by_sample, on='sample_id')
# Share (in percent, two decimals) of each sample's counts that falls on each genome.
for tool in ('sal', 'fc', 'sushi'):
    genome_share = by_genome[f'{tool}_read_counts_sum'] / by_genome[f'{tool}_read_counts_sample_sum']
    by_genome[f'{tool}_pgen'] = round(genome_share * 100, 2)

In [None]:
by_genome[by_genome.genome == 'contig_21']

In [None]:
px.scatter(by_genome, x='fc_read_counts_sum', y='sal_read_counts_sum',
           color='genome', hover_data=['genome', 'sample_id'], log_x=True, log_y=True,
           height=800, width=850, template='plotly_white')

In [None]:
px.scatter(by_genome, x='fc_read_counts_sum', y='sushi_read_counts_sum',
           color='genome', hover_data=['genome', 'sample_id'], log_x=True, log_y=True,
           height=800, width=850, template='plotly_white')

In [None]:
px.scatter(by_genome, x='sal_read_counts_sum', y='sushi_read_counts_sum',
           color='genome', hover_data=['genome', 'sample_id'], log_x=True, log_y=True,
           height=800, width=850, template='plotly_white')

## Explore YL45 in 2 samples -> AU655  and Inoc_1

- The differences are mainly due to rRNA genes -> feature counts is more strict when assigning these? 
- Overall, we'll exclude non-protein-coding sequences from the analysis for now.

In [None]:
df = count_df[(count_df.genome == 'YL45') & (
    count_df.sample_id == 'AU655')].copy()
df2 = count_df[(count_df.genome == 'YL45') & (
    count_df.sample_id == 'Inoc_1')].copy()

In [None]:
px.scatter(df, x='fc_read_counts', y='sushi_insertcount', hover_data=[
           'ID', 'locus_tag', 'gene_biotype'], log_x=True, log_y=True, height=800, width=800)

In [None]:
px.scatter(df2, x='fc_read_counts', y='sushi_insertcount', hover_data=[
           'ID', 'locus_tag', 'gene_biotype'], log_x=True, log_y=True, height=800, width=800)

# Explore correlation between transcriptome abundances and mOTUs data

# Transcriptome composition for each sample

In [None]:
sds

## In vitro

In [None]:
sds.Mouse.unique()

In [None]:
invitro = by_genome[by_genome.sample_id.isin(sds[sds.Mouse == 'invitro'].sample_id.values)]

In [None]:
invitro[invitro.genome == 'SL1344'][['sample_id'] + [c for c in invitro.columns if 'pgen' in c]]

In [None]:
# featureCounts reads per genome for the in-vitro samples (log-scaled y axis).
# `colorc` was a typo for `color`; px.bar rejects unknown keyword arguments.
px.bar(invitro, x="sample_id", y="fc_read_counts_sum", color="genome", log_y=True, height=600, width=800, template="plotly_white")

In [None]:
lcm = by_genome[by_genome.sample_id.isin(sds[sds.Mouse == 'LCM'].sample_id.values)]

In [None]:
lcm[lcm.genome == 'SL1344'][['sample_id'] + [c for c in lcm.columns if 'pgen' in c]]

In [None]:
px.bar(lcm, x="sample_id", y="fc_read_counts_sum", color="genome", log_y=True, height=600, width=1000, template="plotly_white")

# DE Results

In [None]:
from typing import Union, List
class DEResults:
    """Container for differential-expression result tables read from CSV files.

    The combined table lives in ``self.results``. It is empty until
    ``read_results`` is called.
    """

    def __init__(self, files: Union[Path, List[Path]], annotation_file: Union[str, Path, None] = None,
                 annotation_type: str = 'gff', id_col: str = 'ID', contrast_col: str = 'contrast',
                 lfc_th: float = 1, pval_th: float = 0.01) -> None:
        """Store the settings; no file is read here.

        Args:
            files: one CSV path, or a list of CSV paths.
            annotation_file: optional annotation file used by ``add_gene_info``.
                Defaults to ``None`` (previously a shared mutable ``[]``;
                both are falsy, so ``add_gene_info`` behaves the same).
            annotation_type: only ``'gff'`` is handled by ``add_gene_info``.
            id_col: name of the gene identifier column.
            contrast_col: name of the column holding the contrast label.
            lfc_th: absolute log2 fold change a hit must exceed.
            pval_th: adjusted p-value a hit must stay below.
        """
        self.files = files
        self.annotation_file = annotation_file
        self.annotation_type = annotation_type
        self.id_col = id_col
        self.contrast_col = contrast_col
        self.lfc_th = lfc_th
        self.pval_th = pval_th
        self.results = pd.DataFrame()

    def read_results(self):
        """Read the CSV file(s) into ``self.results`` and add ``analysis_tag``.

        For a list (or tuple) of files the tag is each file's stem; for a
        single file it is the fixed string ``'deseq_de'``.
        """
        if isinstance(self.files, (list, tuple)):
            # Path(f) lets plain string paths work as well as Path objects.
            self.results = pd.concat([pd.read_csv(f).assign(analysis_tag=Path(f).stem) for f in self.files])
        else:
            self.results = pd.read_csv(self.files).assign(analysis_tag='deseq_de')

    def validate_results(self):
        """Placeholder; does nothing yet."""
        # todo create validation schema
        pass

    def clean_name(self, col_name: str = 'analysis_tag', separator: str = '_',
                   index_to_keep: int = 1, text_to_remove: str = ""):
        """Shorten the values of ``col_name`` in place.

        Each value is split on ``separator``, the piece at ``index_to_keep``
        is kept, and ``text_to_remove`` is deleted from it.
        """
        # regex=False: the text is removed literally, so characters such as
        # "." or "+" in it are not interpreted as a pattern.
        self.results[col_name] = (self.results[col_name].str.split(separator, expand=True)[index_to_keep]
                                  .str.replace(text_to_remove, "", regex=False))

    def compare_two_analyses(self, contrast: str, analysis_tags: List):
        """Put the results of several analyses of one contrast side by side.

        Args:
            contrast: value of ``contrast_col`` to keep.
            analysis_tags: values of ``analysis_tag`` to keep.

        Returns:
            A DataFrame with one row per gene id, columns named
            ``<metric>_<analysis_tag>`` for baseMean, log2FoldChange and padj,
            and a ``hits_color`` column holding the list of per-row hit flags
            of that gene. Genes lacking a log2FoldChange in any of the
            analyses are dropped.
        """
        id_col = self.id_col
        selected = (self.results[self.contrast_col] == contrast) & (self.results["analysis_tag"].isin(analysis_tags))
        df = self.results[selected].copy()
        # A hit passes both the fold-change and the adjusted p-value threshold.
        df['hits'] = (abs(df["log2FoldChange"]) > self.lfc_th) & (df["padj"] < self.pval_th)
        hits_color = df.groupby(id_col).hits.apply(list).reset_index()
        hits_color.columns = [id_col, 'hits_color']
        # The configured id column is used throughout; the literal 'ID' used
        # before only worked with the default id_col.
        df = df[[id_col, 'baseMean', 'log2FoldChange', 'padj', 'analysis_tag']].pivot(
            index=id_col, columns='analysis_tag').reset_index()
        df.columns = [id_col] + [f"{c[0]}_{c[1]}" for c in df.columns[1:]]
        lfc_columns = [c for c in df.columns if 'log2FoldChange' in c]
        df = df.dropna(subset=lfc_columns).merge(hits_color, on=id_col, how='left')

        return df

    def volcano_plot(self):
        """Placeholder; not implemented."""
        pass

    def ma_plot(self):
        """Placeholder; not implemented."""
        pass

    def add_gene_info(self):
        """Annotate ``self.results`` using the annotation file.

        Returns:
            ``None`` after annotating with a GFF file; otherwise a message
            string ("No annotation file provided" or "Not implemented") and
            ``self.results`` is left untouched.
        """
        if not self.annotation_file:
            return "No annotation file provided"
        elif self.annotation_type == 'gff':
            gene_info = GenomeAnnot(self.annotation_file)
            self.results = gene_info.annotate_df(self.results)
        else:
            return "Not implemented"

## Oligo experiment

### All genes together with fc and deseq2

In [None]:
out_dir

In [None]:
list(out_dir.rglob("2023-07-31*"))

In [None]:

files = list(out_dir.rglob("2023-07-*LPS*results.csv"))
files

In [None]:
res = DEResults(files, gff_file)

In [None]:
res.read_results()
res.clean_name('analysis_tag', '_', 1, '-LPS')
res.add_gene_info()

In [None]:
fdf = res.results

In [None]:
fdf.sample(20)

In [None]:
fdf[fdf.padj < 0.05].groupby('analysis_tag').ID.nunique()

In [None]:
fdf[(fdf.genome == 'YL27') & (fdf.padj < 0.05)].groupby('analysis_tag').ID.nunique()

In [None]:
y27_sig = fdf[(fdf.genome == 'YL27') & (fdf.padj < 0.05)]

for i in y27_sig[y27_sig.analysis_tag == 'oligo-alone-within-taxon'].locus_tag.values:
    print(i)

In [None]:
fdf[(fdf.genome == 'I48') & (fdf.padj < 0.05)].groupby('analysis_tag').ID.nunique()

In [None]:
cnts = pd.read_csv(out_dir/"2023-07-31_oligo-alone-within-taxon-norm_cnts.csv", index_col=0).set_index('ID')
cnts = np.log2(cnts +1)

In [None]:
from umap import UMAP
cnts = pd.read_csv(out_dir/"2023-07-31_oligo-alone-within-taxon-norm_cnts.csv", index_col=0).set_index('ID')
cnts = np.log2(cnts +1)
cnts = cnts.loc[(cnts.var(axis=1)>3),].T

umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = pd.DataFrame(umap_2d.fit_transform(cnts), index=cnts.index)

In [None]:
sds

In [None]:
df = res.compare_two_analyses("LPS_vs_PBS", ["oligo-metat-fc-deseq", "oligo-alone-within-taxon"])
df['hits_color'] = df['hits_color'].astype(str)

In [None]:
# The y column must be one of the analyses requested from compare_two_analyses
# above ("oligo-metat-fc-deseq" vs "oligo-alone-within-taxon"); the previous
# "log2FoldChange_oligo-i48-fc-deseq" is not produced by that comparison.
px.scatter(df, x="log2FoldChange_oligo-metat-fc-deseq", y="log2FoldChange_oligo-alone-within-taxon",
           color='hits_color', hover_data=df.columns, height=900, width=1000, trendline='ols')

In [None]:

df.dropna(subset=['hits_color'])

In [None]:
df.comparision.unique()

In [None]:
df[df.ID == 'gene-A4V02_09235'].groupby('ID').hits.apply(list)

In [None]:
# DESeq results from the run on I48 alone, tagged so they can be told apart
# after concatenation.
i48_res = pd.read_csv(out_dir/"2023-07-21_oligo-i48-fc-deseq-LPS_vs_PBS_l0a0.01_results.csv")
# The column name contained a stray "§"; the later pivot reads 'analysis'.
i48_res['analysis'] = 'indiv'

In [None]:
# NOTE(review): `res_df` is read here but is not assigned in any earlier cell
# visible in this notebook; on a fresh kernel this would raise NameError
# unless it is defined elsewhere — confirm where it comes from.
res_df = annot.annotate_df(res_df)

In [None]:
res_df = res_df[res_df.genome == 'I48']


In [None]:
cdf48 = pd.concat([res_df, i48_res])

In [None]:
lfc48.sample(5)

In [None]:
lfc48 = cdf48[['ID', 'log2FoldChange', 'padj', 'analysis']]
lfc48 = lfc48.pivot(index='ID', columns='analysis').reset_index()
lfc48.columns = ['ID', 'lfc_indiv', 'lfc_together', 'padj_indiv', 'padj_together']

In [None]:
lfc48.sample(5)

In [None]:
indiv_genes = set(lfc48[(abs(lfc48.lfc_indiv) > 1) & (lfc48.padj_indiv < 0.01)].ID.values)

In [None]:
together_genes = set(lfc48[(abs(lfc48.lfc_together) > 1) & (lfc48.padj_together < 0.01)].ID.values)

In [None]:
len(indiv_genes.intersection(together_genes))/len(together_genes)

In [None]:
lfc48.isna().sum()

In [None]:
px.scatter(lfc48, x='lfc_indiv', y='lfc_together', height=800, width=800, hover_data=lfc48.columns)

In [None]:
%store -r lsu_df

In [None]:
fdf = fdf.merge(lsu_df, on='sample_id')

In [None]:
fdf

In [None]:
px.scatter(fdf[['sample_id', 'Treatment', '%mapped', 'featCounts_total']].drop_duplicates(),
           x='%mapped', y='featCounts_total', hover_data=['sample_id'])

In [None]:
# Full GFF3 table. `annotation_file` is never assigned as a notebook variable;
# the configured GFF path is `gff_file`.
gff = pr.read_gff3(gff_file).as_df()

In [None]:
test = gff.dropna(subset=['product'])
test[test['product'].str.contains('ribosomal protein')][[
    'Chromosome', 'Name', 'product']]

In [None]:
gff[['Name', 'product']].head()

In [None]:
px.box(fdf[['sample_id', 'Treatment', '%mapped', 'featCounts_total']].drop_duplicates(),
       x='Treatment', y='%mapped', color='Treatment')

In [None]:
px.bar(exp1[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']].sort_values("featCounts_total").drop_duplicates(),
       x='sample_id', y='featCounts_total',
       color='Treatment', facet_row='Mouse', height=1200)

In [None]:
px.bar(exp2[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']].sort_values("featCounts_total").drop_duplicates(),
       x='sample_id', y='featCounts_total',
       color='Treatment', facet_row='Mouse',)

In [None]:
px.bar(exp3[['sample_id', 'featCounts_total', 'Mouse', 'Treatment']].sort_values("featCounts_total").drop_duplicates(),
       x='sample_id', y='featCounts_total',
       color='Treatment', facet_row='Mouse',)

In [None]:
px.bar(exp1, x='genome', y='featCounts', color='genome',
       facet_col='sample_id', facet_col_wrap=4, log_y=True)

In [None]:
exp2.groupby(['genome', 'Treatment']).featCounts.median()

In [None]:
px.bar(exp2, x='genome', y='featCounts', color='genome',
       facet_col='sample_id', facet_col_wrap=3, log_y=True, height=1000, width=800)

In [None]:
px.box(exp3[exp3.genome == 'SL1344'].sort_values("Treatment"), x='Treatment', y='featCounts', color='genome',
       facet_col='genome', facet_col_wrap=1, log_y=False, points='all', height=600, width=600)

In [None]:
px.box(exp3[exp3.genome == 'SL1344'].sort_values("Treatment"), x='Treatment', y='featCounts_perc', color='genome',
       facet_col='genome', facet_col_wrap=1, log_y=False, points='all', height=600, width=600)

In [None]:
px.box(exp3[exp3.genome == 'YL58'].sort_values("Treatment"), x='Treatment', y='featCounts_perc', color='genome',
       facet_col='genome', facet_col_wrap=1, log_y=False, points='all', height=600, width=600)

In [None]:
px.box(exp3[exp3.genome == 'YL32'].sort_values("Treatment"), x='Treatment', y='featCounts', color='genome',
       facet_col='genome', facet_col_wrap=1, log_y=False, points='all', height=600, width=600)

In [None]:
px.box(exp3[exp3.genome == 'YL58'].sort_values("Treatment"), x='Treatment', y='featCounts', color='genome',
       facet_col='genome', facet_col_wrap=1, log_y=False, points='all', height=600, width=600)

In [None]:
sal_counts = df[df.genome == 'SL1344'].groupby(['sample_id']).agg({'htseq': ['sum'], 'salmon': ['sum'],
                                                                  'featCounts': ['sum']})

sal_counts/1000

In [None]:
feat.count_data.head()

In [None]:
feat.count_data.dropna(subset=['product']).head()

In [None]:
class MetaT:
    """Locates the per-method output folders below one data directory."""

    def __init__(self, data_dir, prefix, methods=['htseq', 'salmon', 'bowtie']):
        """Set up one lazy recursive search per requested method.

        Args:
            data_dir: directory to search.
            prefix: leading part of the folder names to match.
            methods: methods to look for; the attribute of a method that is
                not listed is set to ``None``. The default list is only read,
                never modified.

        ``htseq_dir``, ``salmon_dir`` and ``bowtie_dir`` hold the iterators
        returned by ``Path.rglob`` (not lists), so each can be consumed once.
        """
        self.root_dir = Path(data_dir)
        # The searches below read self.data_dir, which was never assigned
        # (AttributeError); root_dir is kept for existing users of that name.
        self.data_dir = self.root_dir
        self.htseq_dir = self.data_dir.rglob(
            f'{prefix}*htseqcount') if 'htseq' in methods else None
        self.salmon_dir = self.data_dir.rglob(
            f'{prefix}*salmon') if 'salmon' in methods else None
        self.bowtie_dir = self.data_dir.rglob(
            f'{prefix}*bowtie') if 'bowtie' in methods else None

In [None]:
# htseqcounts a0
htseq_dir = "../../../scratch/03_23_transcriptomics/Sal-asf-all-gene-a0_htseqcount"
# The assignment had no right-hand side (SyntaxError). The files are collected
# the same way HtseqCounts.load_count_files does it.
# NOTE(review): confirm '*.txt' is the intended pattern for this folder.
htseq_files = list(Path(htseq_dir).rglob('*.txt'))

In [None]:
htseq_df.genome.unique()

In [None]:
htseq_df.groupby(['sample_id'])['count'].sum()

In [None]:
htseq_df.groupby(['sample_id', 'genome'])['count'].sum()

In [None]:
salmon_files = list(Path(salmon_dir).rglob('quant.sf'))

In [None]:
f = salmon_files[0]

In [None]:
# Locus-tag prefix -> strain label. The mapping does not depend on the file
# being read, so it is built once instead of on every loop iteration.
strains = {'C825': 'ASF519',
           'C824': 'ASF502',
           'SL1344': 'SL1344',
           'K401': 'DSM755',
           'A4V08': 'YL32',
           'A4U99': 'YL31',
           'A4V09': 'YL58',
           }
df_list = []
for f in salmon_files:
    # Sample id: name of the folder holding quant.sf, cut at "_quant".
    name = f.parent.stem.split("_quant")[0]
    print(name)
    df = pd.read_table(f).assign(sample_id=name)
    # Locus tag: the text between "locus_tag=" and the following ";".
    df['locus_tag'] = df.Name.str.split('locus_tag=', expand=True)[
        1].str.split(";", expand=True)[0]
    # Genome: locus-tag prefix before the first "_", translated where known.
    df['genome'] = df.locus_tag.str.split("_", expand=True)[0].replace(strains)
    df_list.append(df)
fdf = pd.concat(df_list)

In [None]:
summed_reads = fdf.groupby(['sample_id', 'genome']
                           ).NumReads.sum().reset_index()

In [None]:
f1 = summed_reads[summed_reads.genome == 'SL1344']

In [None]:
f2 = fdf.groupby('sample_id').NumReads.sum().reset_index()

In [None]:
summed_reads[summed_reads.sample_id == 'AU657']

In [None]:
f3 = f1.merge(f2, on='sample_id', suffixes=['_sl1344', '_total'])

In [None]:
f3['%sal'] = f3['NumReads_sl1344']/f3['NumReads_total']*100

In [None]:
f3[f3['%sal'] < 95]['%sal'].median()

In [None]:
f3

In [None]:
sal = fdf[fdf.genome == 'SL1344'].copy()

In [None]:
# Rescale the SL1344 read counts so that every sample sums to one million,
# then attach the result to `sal` as `new_tpms`.
inter = sal[['Name', 'sample_id', 'NumReads']].pivot(index='Name', columns='sample_id')
inter = inter.div(inter.sum()) * 1000000
# The pivot leaves (value, sample_id) column tuples; keep only the sample id.
inter.columns = [col[1] for col in inter.columns]
inter = (inter.reset_index()
         .melt(id_vars='Name', var_name='sample_id', value_name='new_tpms'))
sal = sal.merge(inter, on=['Name', 'sample_id'])

In [None]:
sal

In [None]:
sal.groupby('sample_id').sum(numeric_only=True)

In [None]:
import numpy as np

In [None]:
np.quantile(sal.new_tpms, 0.95)

In [None]:
sal[sal.new_tpms > 900]

In [None]:
sal.groupby('sample_id').new_tpms.apply(np.quantile, q=0.95)