In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import plotnine as p9
import sys
# The appended directory is an absolute, machine-specific path; it is presumably
# where the project-local `quality_control` module lives (imported right below).
# Adjust it when running the notebook on another machine.
sys.path.append("/Users/ansintsova/git_repos/nguyenb_tnseq/code/mbarq_analysis")
import quality_control as qc

In [None]:
# Input locations: count tables, edited metadata sheets and the control list
# are all resolved relative to a single data directory.
dataDir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/08_21")
countFiles = list((dataDir / 'counts').iterdir())
mappedFiles = [path for path in countFiles if 'counts_mapped.csv' in path.name]
unmappedFiles = [path for path in countFiles if 'counts_unmapped.csv' in path.name]
metadataFiles = [path for path in (dataDir / 'metadata').iterdir()
                 if "metadata.edited.txt" in path.name]
controlFile = dataDir / "controls.txt"
# One metadata table built from every edited metadata sheet.
metadata = pd.concat([pd.read_table(path) for path in metadataFiles])

In [None]:
# Build the sample and control tables with the project helper `qc.load_samples`.
# NOTE(review): the following cell reassigns both `df` and `controlDf` from
# old_counts.csv, so on a top-to-bottom run these results are discarded.
df, controlDf = qc.load_samples(mappedFiles, metadata, unmappedFiles, controlFile)

In [None]:
# Split the legacy count table: rows with a phenotype are control barcodes,
# rows without one are the library barcodes. `cnt` is renamed to `barcode_cnt`.
old_cnts = pd.read_csv(dataDir / 'old_counts.csv')
has_phenotype = old_cnts.phenotype.notnull()
df = old_cnts[~has_phenotype].copy().rename(columns={'cnt': 'barcode_cnt'})
controlDf = old_cnts[has_phenotype].copy().rename(columns={'cnt': 'barcode_cnt'})

In [None]:
# Inoculum rows, annotated with the total barcode count of their sample.
inoculum = df.loc[df.mouse == 'inoculum']
total_cnt = (inoculum.groupby('sampleName')['barcode_cnt']
             .sum()
             .reset_index(name='total_cnt'))
inoculum = inoculum.merge(total_cnt, how='left', on='sampleName')
# Samples per library; not the last expression of the cell, so it is not displayed.
inoculum.groupby('library')['sampleName'].nunique()

# Dropping Library 14_1, not enough samples
df = df.loc[df.library != 'library_14_1']

In [None]:
# sns.set(font_scale=1.2)
# sns.set_style('ticks')
# fig, axes = plt.subplots(4,3, figsize=(20, 15))
# axes = axes.flatten()
# colors = sns.color_palette("rocket", 12)
# libraries = list(inoculum.library.unique())
# for i, lib in enumerate(libraries):
#     c = colors[i]
#     sns.histplot(data=inoculum[inoculum.library == lib], x='barcode_cnt', hue='experiment', bins=500, ax=axes[i])
#     axes[i].set_title(lib)
#     axes[i].set_xlim(0, 10000)
# plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=1)


In [None]:
# Run the project's control-barcode correlation QC once per `sampleName`.
# Presumably correlates counts with concentration on a log scale and keeps
# samples above `cutoff` — confirm against quality_control.calculate_correlation.
# `good_samples` is used below to subset df and controlDf by sampleName.
corr_df, good_samples = qc.calculate_correlation(controlDf, concentration_col='conc',
                          cnt_col='barcode_cnt', phenotype_col='phenotype',
                          for_each='sampleName', how='log', wt_phenotype='wt', cutoff=0.8)

In [None]:
# Restrict library and control tables to the samples returned by the QC step,
# then pull out the inoculum rows of the retained samples.
good_df = df.loc[df['sampleName'].isin(good_samples)]
good_control = controlDf.loc[controlDf['sampleName'].isin(good_samples)].copy()
good_inoculum = good_df.loc[good_df['mouse'] == 'inoculum']

In [None]:
# sns.set(font_scale=1.2)
# sns.set_style('ticks')
# fig, axes = plt.subplots(4,3, figsize=(20, 15))
# axes = axes.flatten()
# colors = sns.color_palette("rocket", 12)
# libraries = list(good_inoculum.library.unique())
# for i, lib in enumerate(libraries):
#     c = colors[i]
#     sns.histplot(data=good_inoculum[good_inoculum.library == lib], x='barcode_cnt', hue='experiment', bins=500, ax=axes[i])
#     axes[i].set_title(lib)
#     axes[i].set_xlim(0, 10000)
# plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=1)

In [None]:
# Keep only experiments that still have at least one inoculum sample.
experiments_to_keep = good_inoculum['experiment'].unique()
good_df = good_df.loc[good_df['experiment'].isin(experiments_to_keep)]
# inoculum = inoculum[~inoculum.experiment.isin(experiments_to_drop)]

In [None]:
# Relative abundance (in %) of every barcode within each inoculum sample.
# `values=` keeps the pivoted columns single-level (one column per sample), so
# the melt below does not have to cope with MultiIndex columns; `var_name=`
# names the sample column explicitly instead of relying on the columns' name.
relAb = (good_inoculum[['barcode', 'sampleName', 'barcode_cnt']]
         .drop_duplicates()
         .pivot(index='barcode', columns='sampleName', values='barcode_cnt'))
# Column-wise normalisation: each sample's counts divided by that sample's total.
relAb = ((relAb / relAb.sum() * 100)
         .reset_index()
         .melt(id_vars='barcode', var_name='sampleName', value_name='relAb'))
relAb = relAb[['barcode', 'sampleName', 'relAb']]

In [None]:
# Attach the per-sample relative abundance to the inoculum rows.
# This rebinds `good_inoculum`; re-running the cell on its own would merge relAb
# a second time and produce suffixed relAb_x / relAb_y columns.
good_inoculum = good_inoculum.merge(relAb, on=['barcode', 'sampleName'], how='left')

In [None]:
# Spot-check 10 random rows (no random_state given, so rows differ between runs).
good_inoculum.sample(10)

In [None]:
# sns.set(font_scale=1.2)
# sns.set_style('ticks')
# fig, axes = plt.subplots(4,3, figsize=(20, 15))
# axes = axes.flatten()
# colors = sns.color_palette("rocket", 12)
# libraries = list(good_inoculum.library.unique())
# for i, lib in enumerate(libraries):
#     c = colors[i]
#     sns.histplot(data=good_inoculum[good_inoculum.library == lib], x='relAb', hue='experiment', bins=500, ax=axes[i])
#     axes[i].set_title(lib)
# plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=1)

In [None]:
# Number of distinct locus tags in the inoculum before the abundance filter.
good_inoculum.locus_tag.nunique()

In [None]:
# Keep inoculum rows whose relative abundance exceeds 0.01 (relAb is in percent).
# The same 0.01 threshold is passed to filter_inoculum further down.
inoculumFilt = good_inoculum[good_inoculum.relAb > 0.01]

In [None]:
# Number of distinct locus tags remaining after the abundance filter.
inoculumFilt.locus_tag.nunique()

In [None]:
# Number of distinct barcodes remaining after the abundance filter.
inoculumFilt.barcode.nunique()

In [None]:
# Per library / experiment / dnaid: the number of distinct locus tags and the
# number of rows without a locus tag.
inoc_table1 = (inoculumFilt
               .groupby(['library', 'experiment', 'dnaid'])['locus_tag']
               .agg(['nunique', lambda tags: tags.isna().sum()])
               .reset_index())
inoc_table1.columns = ['Library', 'Experiment', 'dnaid',
                       '# of disrupted genes', '# of insertions outside of CDS']
# Library names use '-' instead of '_' so they match the mapping table's keys.
inoc_table1['Library'] = inoc_table1['Library'].str.replace("_", '-')

In [None]:
# Mapping-stage summary table; every non-Library column is tagged with " (map)".
# The relabelling is positional: the first column becomes "Library".
map_table1 = pd.read_csv(dataDir / "14-10-2021-table1.csv", index_col=0)
non_library_cols = [col for col in map_table1.columns if 'Library' not in col]
map_table1.columns = ["Library"] + [f"{col} (map)" for col in non_library_cols]

inoc_table1 = inoc_table1.merge(map_table1, on='Library')

In [None]:
# Display the combined inoculum / mapping summary table.
inoc_table1

In [None]:
# Per locus tag and contig: number of libraries it occurs in, smallest
# insertion start, and mean relative abundance across the filtered inoculum rows.
inocSum = (inoculumFilt
           .groupby(['locus_tag', 'sseqid'])
           .agg(num_lib=('library', 'nunique'),
                sstart=('sstart', 'min'),
                mean_relAb=('relAb', 'mean'))
           .reset_index())

In [None]:
# Histogram of how many libraries each locus tag was found in.
sns.set(font_scale=2)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data=inocSum, x='num_lib', bins=10,
             color=(0.20973515, 0.09747934, 0.24238489), ax=ax)
ax.set_xticks(range(1, 11))
ax.set_title("How many libraries was a mutation found in?")
ax.set_xlabel('Number of libraries with gene disruption')
ax.set_xlim(0, 12);

In [None]:
# Display the per-locus-tag inoculum summary.
inocSum

In [None]:
# Scratch arithmetic; what the two operands stand for is not recorded in the
# notebook — TODO: document their origin or remove this cell.
36+58

In [None]:
# Fraction of locus tags found in more than one library.
# NOTE(review): 2984 is a hard-coded denominator, presumably the total number of
# disrupted locus tags from an earlier run — confirm, or derive it from the data.
inocSum[inocSum["num_lib"] > 1].locus_tag.nunique()/2984

In [None]:
# Mean inoculum relative abundance of each locus tag along contig FQ312003.1,
# coloured by the number of libraries the locus tag was found in.
inocFQ = inocSum[inocSum.sseqid == 'FQ312003.1'].copy()
inocFQ['sstart'] = inocFQ.sstart/1000000  # rescaled to match the "Mb" axis label

(p9.ggplot(inocFQ, p9.aes(x='sstart', y='mean_relAb', color='num_lib'))
     + p9.geom_point(size=4, )
     # vertical stem from each point down towards the x-axis
     + p9.geom_segment(p9.aes(x='sstart', xend='sstart', y='mean_relAb', yend=0), size=1, alpha=0.4)
     + p9.theme_classic()
     + p9.theme( text = p9.element_text(size=24), figure_size=(25,10), )
     + p9.xlab("Position, Mb")  # label typo fixed (was "Postion")
     + p9.labs(title='FQ312003.1')
     + p9.ylab('Mean relative abundance in the inoculum')
     + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r') )
     + p9.scale_y_log10())

In [None]:
# inocFQ = inocSum[inocSum.sseqid == 'HE654726.1'].copy()
# inocFQ['sstart'] = inocFQ.sstart/1000000

# (p9.ggplot(inocFQ, p9.aes(x='sstart', y='mean_relAb', color='num_lib'))
#      + p9.geom_point(size=4, )
#      + p9.geom_segment(p9.aes(x='sstart', xend='sstart', y='mean_relAb', yend=0), size=1, alpha=0.4)
#      + p9.theme_classic()
#      + p9.theme( text = p9.element_text(size=24), figure_size=(25,10), )
#      + p9.xlab(f"Postion, Mb")
#      +p9.labs(title='HE654726.1')
#      + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r') )
#     + p9.scale_y_log10())

In [None]:
# Mean inoculum relative abundance of each locus tag along contig HE654725.1,
# coloured by the number of libraries the locus tag was found in.
inocFQ = inocSum[inocSum.sseqid == 'HE654725.1'].copy()
inocFQ['sstart'] = inocFQ.sstart/1000000  # rescaled to match the "Mb" axis label

(p9.ggplot(inocFQ, p9.aes(x='sstart', y='mean_relAb', color='num_lib'))
     + p9.geom_point(size=4, )
     # vertical stem from each point down towards the x-axis
     + p9.geom_segment(p9.aes(x='sstart', xend='sstart', y='mean_relAb', yend=0), size=1, alpha=0.4)
     + p9.theme_classic()
     + p9.theme( text = p9.element_text(size=24), figure_size=(25,10), )
     + p9.xlab("Position, Mb")  # label typo fixed (was "Postion")
     + p9.labs(title='HE654725.1')
     # same y-axis label as the FQ312003.1 plot, instead of the raw column name
     + p9.ylab('Mean relative abundance in the inoculum')
     + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r') )
     + p9.scale_y_log10())

In [None]:
# Mean inoculum relative abundance of each locus tag along contig HE654724.1,
# coloured by the number of libraries the locus tag was found in.
inocFQ = inocSum[inocSum.sseqid == 'HE654724.1'].copy()
inocFQ['sstart'] = inocFQ.sstart/1000000  # rescaled to match the "Mb" axis label

(p9.ggplot(inocFQ, p9.aes(x='sstart', y='mean_relAb', color='num_lib'))
     + p9.geom_point(size=4, )
     # vertical stem from each point down towards the x-axis
     + p9.geom_segment(p9.aes(x='sstart', xend='sstart', y='mean_relAb', yend=0), size=1, alpha=0.4)
     + p9.theme_classic()
     + p9.theme( text = p9.element_text(size=24), figure_size=(25,10), )
     + p9.xlab("Position, Mb")  # label typo fixed (was "Postion")
     + p9.labs(title='HE654724.1')
     # same y-axis label as the FQ312003.1 plot, instead of the raw column name
     + p9.ylab('Mean relative abundance in the inoculum')
     + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r') )
     + p9.scale_y_log10())

In [None]:
# 20 random rows of the correlation table with phenotype 'wt' (unseeded;
# raises if fewer than 20 such rows exist).
corr_df[corr_df.phenotype=='wt'].sample(20)

In [None]:
# The 16 'wt' rows with the highest R.
corr_df[corr_df.phenotype == 'wt'].sort_values('R', ascending=False).head(16)

In [None]:
# 'wt' control rows of sample dnaid2024_9 (the sample plotted in the next cell).
controlDf[(controlDf.sampleName == 'dnaid2024_9') & (controlDf.phenotype == 'wt')]

In [None]:
# logCnts against logConc for the 'wt' controls of sample dnaid2024_9.
plt.figure(figsize=(6, 6))
ex_inoc = controlDf.query("sampleName == 'dnaid2024_9' and phenotype == 'wt'")
sns.regplot(x='logConc', y='logCnts', data=ex_inoc)
plt.title("Inoculum, dnaid2024_9")
plt.xlim(-5, -2.5)

In [None]:
# logCnts against logConc for the 'wt' controls of sample dnaid2028_73.
plt.figure(figsize=(6, 6))
ex_good = controlDf.query("sampleName == 'dnaid2028_73' and phenotype == 'wt'")
sns.regplot(x='logConc', y='logCnts', data=ex_good)
plt.title("Good Sample, dnaid2028_73")
plt.xlim(-5, -2.5)



In [None]:

# logCnts against logConc for the 'wt' controls of sample dnaid2016_17.
plt.figure(figsize=(6, 6))
ex_bad = controlDf.query("sampleName == 'dnaid2016_17' and phenotype == 'wt'")
sns.regplot(x='logConc', y='logCnts', data=ex_bad)
plt.title("Bad Sample, dnaid2016_17")
plt.xlim(-4.5, -2.5)

#
#dnaid2016_17

In [None]:
# Add R squared to the correlation table, then show the 'wt' rows of three samples.
corr_df['r2'] = corr_df['R'].pow(2)
example_samples = ['dnaid2016_17', 'dnaid2028_73', 'dnaid1315_10']
corr_df.loc[(corr_df.phenotype == 'wt') & corr_df.sampleName.isin(example_samples)]

In [None]:
# mice = controlDf[(controlDf.sampleName.isin(good_samples)) & (controlDf.tissue != 'inoculum')].mouse.unique()

In [None]:
# fig, axes = plt.subplots(32, 3, figsize=(15,150))
# axes = axes.flatten()
# for i, mouse in enumerate(mice):
#     test = controlDf[controlDf.mouse == mouse].sort_values('day')
#     sns.lineplot(data=test, x='day', y='logCnts', hue='conc', palette='rocket_r', ax=axes[i])
#     axes[i].set_title(mouse)
#     axes[i].legend([],[], frameon=False)

In [None]:
# Columns shared by library rows; control rows additionally carry phenotype/conc.
columns_in_clean = ['barcode', 'barcode_cnt',  'library',
              'experiment', 'dnaid', 'sampleName', 'mouse', 'day', 'tissue']
columns_in_control = columns_in_clean + ['phenotype', 'conc']

# cleanDf: library + control barcodes; cleanDfnc: library barcodes only ("no controls").
cleanDf = pd.concat([good_df[columns_in_clean], good_control[columns_in_control]]).drop_duplicates()
cleanDfnc = good_df[columns_in_clean].drop_duplicates()
cleanDfnc = cleanDfnc[cleanDfnc.sampleName.isin(good_samples)]
# .copy() so that adding the relAb column below writes to an independent frame
# rather than to a slice of the de-duplicated table.
cleanDfnc = cleanDfnc[cleanDfnc.mouse != 'unenriched_inoculum'].copy()

# Relative abundance (%) of each barcode within its sample. `transform` returns
# the per-sample total aligned to the original row index, so the assignment does
# not depend on the row order produced by groupby (the earlier `.apply(...).values`
# assigned positionally and is misaligned whenever the result is ordered by group).
sample_totals = cleanDfnc.groupby('sampleName').barcode_cnt.transform('sum')
cleanDfnc['relAb'] = cleanDfnc.barcode_cnt / sample_totals * 100
#control_bcs = controlDf.barcode.values
#controlRelAb = cleanDf[cleanDf.barcode.isin(control_bcs)]

In [None]:
# Give control barcodes gene-like labels: the barcode itself as ShortName and
# "<phenotype>-<conc>" as locus_tag.
good_control['ShortName'] = good_control.barcode
good_control['locus_tag'] = good_control.phenotype + '-' + good_control.conc.map(str)

In [None]:
# Display the control table with its ShortName / locus_tag labels.
good_control

In [None]:
# controlRelAb

In [None]:
# fig, axes = plt.subplots(29, 3, figsize=(15,150))
# axes = axes.flatten()
# for i, mouse in enumerate(mice):
#     test = controlRelAb[controlRelAb.mouse == mouse].sort_values('day')
#     sns.lineplot(data=test, x='day', y='relAb', hue='conc', palette='rocket_r', ax=axes[i])
#     axes[i].set_title(mouse)
#     axes[i].legend([],[], frameon=False)
#     axes[i].set_yscale('log')
#     axes[i].set_ylim(0.001, 1)
# plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=.5)

In [None]:
# Distribution of total barcode counts per sample.
plt.figure(figsize=(10, 6))
cleanDf.groupby('sampleName')['barcode_cnt'].sum().hist(bins=100)

In [None]:
# Samples whose total barcode count is below one million.
depth = cleanDf.groupby('sampleName')['barcode_cnt'].sum().reset_index()
depth.loc[depth['barcode_cnt'] < 1000000]

In [None]:
# Number of distinct samples for every library / day combination.
cleanDf.groupby(['library', 'day']).sampleName.nunique()

In [None]:
def filter_inoculum(exp_df, filter_below=0, filter_col='relAb', sample_id='sampleID'):
    """Keep only barcodes that pass a threshold in every inoculum sample.

    Parameters
    ----------
    exp_df : pandas.DataFrame
        Long table with at least the columns 'barcode', 'mouse', `sample_id`
        and `filter_col`. It is not modified.
    filter_below : number, default 0
        A barcode is kept only if its `filter_col` value is >= this threshold
        in all samples whose 'mouse' value is 'inoculum'.
    filter_col : str, default 'relAb'
        Column holding the value that is thresholded.
    sample_id : str, default 'sampleID'
        Column identifying a sample.

    Returns
    -------
    pandas.DataFrame
        One row per retained barcode and per sample present in `exp_df`, with
        the columns 'barcode', `sample_id`, `filter_col` followed by the
        remaining columns of `exp_df`. Barcode/sample combinations that are
        absent from `exp_df` are included with NaN in `filter_col` and in the
        remaining columns.

    Notes
    -----
    A barcode missing from any inoculum sample is NaN there, fails the
    comparison and is dropped. If `exp_df` has no inoculum sample at all,
    nothing is filtered out.
    """
    inoculum_samples = exp_df.loc[exp_df['mouse'] == 'inoculum', sample_id].unique()
    # barcode x sample matrix of the filter value
    wide = (exp_df[['barcode', sample_id, filter_col]]
            .drop_duplicates()
            .pivot(index='barcode', columns=sample_id, values=filter_col))
    passes = (wide[inoculum_samples] >= filter_below).all(axis=1)
    long = (wide[passes]
            .reset_index()
            .melt(id_vars='barcode', var_name=sample_id, value_name=filter_col))
    # Left merge restores the remaining columns of exp_df for every row that
    # exists in the input; the filter value is part of the key, as before.
    return long.merge(exp_df, on=['barcode', sample_id, filter_col], how='left')

In [None]:
# Unfiltered, barcode-level (not gene-level) table: first rows of `df`.
df.head()

In [None]:
# Filtered, barcode-level (not gene-level) table.
# The inoculum filter is applied to each library separately. An explicit loop is
# used instead of groupby(...).apply(...) so the result does not depend on
# whether apply passes the grouping column into the function or on the index
# levels apply generates.
filtered_parts = []
for lib, lib_df in cleanDfnc.groupby('library'):
    part = filter_inoculum(lib_df, filter_below=0.01, sample_id='sampleName')
    # The merged-back 'library' column is NaN for barcode/sample combinations
    # absent from the input; replace it with the library being processed and
    # keep it as the first column.
    part = part.drop(columns='library')
    part.insert(0, 'library', lib)
    filtered_parts.append(part)
filtDf = pd.concat(filtered_parts, ignore_index=True)

# Attach gene annotation to every barcode / sample pair.
filtDf = filtDf.merge(df[['barcode', 'ShortName', 'locus_tag', 'sampleName']].drop_duplicates(),
                      on=['barcode', 'sampleName'], how='left')

# Append the control barcodes of the retained samples.
filtDf = pd.concat([filtDf, good_control[columns_in_control]])

In [None]:
# Unfiltered, gene-level table: barcode counts summed per ShortName within each
# sample, followed by the control barcodes.
gene_annotation = good_df[['barcode', 'sampleName', 'sseqid',
                           'sstart', 'ShortName', 'locus_tag']]
geneDf = (cleanDfnc
          .merge(gene_annotation, how='left', on=['barcode', 'sampleName'])
          .groupby(['library', 'sampleName', 'ShortName', 'mouse', 'day', 'tissue'])
          ['barcode_cnt']
          .sum()
          .reset_index())
control_rows = good_control[columns_in_control + ['locus_tag', 'ShortName']]
geneDf = pd.concat([geneDf, control_rows])

In [None]:
# Filtered, gene-level table.
# filtDf already carries ShortName / locus_tag (merged in when it was built), so
# it is grouped directly. Merging those columns from good_df again would give
# them _x/_y suffixes and the groupby on 'ShortName' would raise a KeyError.
# Control rows inside filtDf have no ShortName and are dropped by the groupby;
# they are appended below with their own labels.
geneFilt = (filtDf
          .groupby(['library', 'sampleName', 'ShortName', 'mouse', 'day', 'tissue', 'dnaid', 'experiment'])
          .barcode_cnt.sum().reset_index())
geneFilt = pd.concat([geneFilt, good_control[columns_in_control + ['locus_tag', 'ShortName']]])

In [None]:
# Display the filtered barcode-level table.
filtDf

In [None]:
# Number of samples returned by the correlation QC.
len(good_samples)

In [None]:
# Display the unfiltered gene-level table.
geneDf

In [None]:
# Write the filtered gene-level and barcode-level tables next to the input data.
# to_csv is called without index=False, so each file also contains the frame's
# index as an unnamed first column.
geneFilt.to_csv(dataDir/"15-10-filtered-gene-level.csv")
filtDf.to_csv(dataDir/"15-10-filtered-barcode-level.csv")

In [None]:
# Inspect the rows for a single barcode in a single sample.
good_df[(good_df.barcode == 'AAAACAATCTATCGACC') & (good_df.sampleName == 'dnaid1315_17')]