In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import chart_studio
import chart_studio.tools as tls
import chart_studio.plotly as py
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import dash_bio as dashbio
import os

from quality_control import *
from method1_analysis import *
from method2_analysis import *
from final_analysis import * 

# Table of Contents: <a id='start'></a>

1. [Loading the data](#loading-data)
2. [Filtering the data](#filtering)
3. [Method 1](#Method-1)
4. [Method 2](#Method-2)
5. [Compare the results](#Compare)

# Loading the data <a id='loading-data'></a>


## Data Analysed:

List of dnaids






In [None]:

# Todo: move file directories to config files, so that can be re-run with different counts
# NOTE(review): the three locations below are absolute, machine-specific paths.

counts_dir ="/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results/counts"
outdir = '/Users/ansintsova/git_repos/nguyenb_tnseq/data/07_06_results/'
control_file = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results")/'controls.txt'

# Load
dnaids = ['dnaid1315', 'dnaid1428', 'dnaid1429', 'dnaid1457', 'dnaid2015', 'dnaid2016', 'dnaid2017', 'dnaid2018', 'dnaid2019',
         'dnaid2023', 'dnaid2024', 'dnaid2025', 'dnaid2026', 'dnaid2027', 'dnaid2028', 'dnaid2029' ]

cnt_df = load_files(dnaids, Path(counts_dir))
# Create unique identifier for each sample
cnt_df['sampleID'] = cnt_df['sampleID'] + "_" + cnt_df['dnaid'] + "_" + cnt_df['experiment']
# Explicit copy: the two column assignments below would otherwise write onto a
# filtered slice of the loaded frame and trigger SettingWithCopyWarning.
cnt_df = cnt_df[cnt_df.sampleID.notnull()].copy()
cnt_df['CntrlName'] = cnt_df['phenotype'] + cnt_df['conc'].astype(str)
# Rows without a ShortName fall back to the control name built above.
cnt_df['ShortName'] = cnt_df.ShortName.fillna(cnt_df.CntrlName)

# Dropping Unenriched samples
cnt_df = cnt_df[~cnt_df.sampleID.str.contains('unenriched')]
annotation_df = cnt_df[['barcode', 'ShortName', 'locus_tag', 'phenotype', 'conc']].drop_duplicates()

# Libraries excluded from every downstream analysis. Filtering (rather than
# list.remove) keeps the cell runnable when an excluded library is absent from the
# loaded counts. Non-string entries (e.g. NaN) are skipped as before.
EXCLUDED_LIBRARIES = {'library_14_1'}
libraries = [lib for lib in cnt_df.library.unique()
             if isinstance(lib, str) and lib not in EXCLUDED_LIBRARIES]
print(len(libraries))
days = ['_d1', '_d2', '_d3', '_d4']



In [None]:
dataDir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/08_21")
cnt_df.to_csv(dataDir/'old_counts.csv')

In [None]:
libraries

In [None]:
cnt_df.groupby('library').experiment.nunique().reset_index().sort_values('library')

In [None]:
mice_per_library = cnt_df.groupby(['library', 'day']).sampleID.nunique().reset_index()
mice_per_library
# for library in mice_per_library.library.unique():
#     print (mice_per_library[mice_per_library.library == library])

## Summary:

From the results above, there is not enough data for library_14_1, so it should be dropped — or, at the very least, a warning should be attached to its results.

# Filtering the data: <a id='filtering'></a>

## 1. [Checking for linearity of control dilutions](#check-linearity)

## 2. [Filtering out samples with skewed WT fitness ](#check-wt-fitness)


[Back to the start](#start)

[Next to Method 1](#Method-1)

[Next to Method 2](#Method-2)


# Checking for linearity of control dilutions <a id='check-linearity'></a>


- Cutting off samples with $R^2$ < 0.8

In [None]:
cnt_df.head()

In [None]:
corr_df, good_samples = calculate_correlation(cnt_df, control_file, for_each='sampleID')
print(len(good_samples))

In [None]:
cnt_df.mouse.nunique()

In [None]:
# Number of distinct values of the first "_"-separated field across good_samples.
len({sample_id.split("_")[0] for sample_id in good_samples})

In [None]:
def viz_linearity(cnt_df, phenotype='wt', day='d0', library = 'library_10_2'):
    """Plot log(count) against log(concentration) for one phenotype/day/library.

    Selects the rows of ``cnt_df`` matching ``phenotype``, ``day`` and ``library``,
    adds natural-log columns and draws one scatter facet per ``sampleID`` with an
    OLS trendline.

    Parameters
    ----------
    cnt_df : pandas.DataFrame
        Must provide the columns ``phenotype``, ``day``, ``library``, ``barcode``,
        ``sampleID``, ``cnt`` and ``conc``.
    phenotype, day, library : str
        Values the rows must match exactly.

    Returns
    -------
    tuple
        ``(df, fig)``: the selected rows with the extra ``lconc`` (log of ``conc``)
        and ``lcnt`` (log of ``cnt + 1``) columns, and the plotly figure.
    """
    query = f"(phenotype == '{phenotype}') & (day == '{day}') & (library == '{library}')"
    # .assign returns a new frame, so the log columns are never written onto a
    # slice of cnt_df (the previous column assignment raised SettingWithCopyWarning).
    df = (cnt_df.query(query)[['barcode', 'sampleID', 'cnt', 'conc', 'library']]
          .assign(lconc=lambda d: np.log(d.conc),
                  lcnt=lambda d: np.log(d.cnt + 1)))  # +1 keeps zero counts finite
    fig = px.scatter(df.sort_values('sampleID'),
                     x="lconc", y="lcnt", facet_col="sampleID", facet_col_wrap=3,height=3000, width=800,
                     trendline='ols')
    return df, fig

In [None]:
wdf, fig= viz_linearity(cnt_df, day='d2', library='library_10_2')

# Filtering out samples with skewed WT fitness <a id='check-wt-fitness'></a>

- Median $\log_2 FC$ of the WT barcodes must lie within $[-1, 1]$
- Filtering out technical artifacts of unknown origin

### How to:
- Only used 'good samples' identified above
- Calculate barcode fitness for each library.
    - For each experiment in each library:
        - Perform VST transformation
        - Calculate fitness for each barcode $\frac{2^{vst-barcode-counts}}{2^{vst-inoculum-counts}}$
- Calculate WT barcode fitnesses 
- Identify samples with abs($log_2FC$) > 1

[Back to the start](#start)

[Next to Method 1](#Method-1)

[Next to Method 2](#Method-2)


In [None]:
# Per library: compute barcode fitness (no count filter), pull out the WT barcodes,
# and record (a) sample columns that are entirely missing and (b) whatever
# get_skewed flags.
skewed_samples = []
no_inoc_samples = []
all_wt_fit = []
for library in libraries:
    print(library)
    library_vst, library_barcode_fitness = get_barcode_fitness_by_library(
        cnt_df, library, good_samples, outdir, filter_below=0)
    library_wt_fitness = get_wt_fitness_by_library(
        library_barcode_fitness, annotation_df, phenotype='wt')
    all_wt_fit.append(library_wt_fitness.assign(library=library))

    column_is_all_missing = library_wt_fitness.isna().all()
    no_inoc_samples.extend(column_is_all_missing[column_is_all_missing].index)
    skewed_samples.extend(get_skewed(library_wt_fitness))

In [None]:
# Hard-coded override: this replaces the no_inoc_samples list that the QC loop in
# the previous code cell builds.
# NOTE(review): presumably pinned from an earlier run — confirm it still matches
# what the loop computes for the current counts.
no_inoc_samples = ['w435_d3_dnaid1457_TV3522A',
 'w436_d3_dnaid1457_TV3522A',
 'w441_d4_dnaid1457_TV3522C',
 'w443_d4_dnaid1457_TV3522C',
 'w445_d4_dnaid1457_TV3522D',
 'w446_d3_dnaid1457_TV3522D',
 'w446_d4_dnaid1457_TV3522D']

In [None]:
skewed_samples

In [None]:
# Persist the skewed sample IDs, one per line.
with open(Path(outdir)/'skewed_samples.txt', 'w') as fh:
    fh.writelines(f"{s}\n" for s in skewed_samples)

# Method 1: <a id='Method-1'></a>


- Take all the library samples, run DESeq to get fitness values


[Back to the start](#start)

In [None]:
# Reload the skewed sample IDs (one per line, surrounding whitespace stripped).
with open(Path(outdir)/"skewed_samples.txt", 'r') as fh:
    skewed_samples = [line.strip() for line in fh]

In [None]:
skewed_samples

## Method 1

- Get counts for each gene as sum of all transposons mapped to that gene

In [None]:
# Sum counts over all rows sharing a ShortName within each sample; the summed
# ShortName is exposed under the column name 'barcode'.
gene_df = (
    cnt_df
    .groupby(['sampleID', 'ShortName', 'mouse', 'day', 'library', 'tissue', 'dnaid', 'experiment'])
    .cnt.sum()
    .reset_index()
    .rename(columns={'ShortName': 'barcode'})
)

In [None]:
# Method 1 on gene-level counts: run analyze_library2 per library on the samples
# that are in good_samples but neither skewed nor listed in no_inoc_samples, then
# merge the fitness table with the z-score table.
results = []
vsts = {}
# Built once: membership tests against sets instead of re-scanning the lists
# (and recomputing exp_df.sampleID.unique()) for every sample.
excluded_samples = set(skewed_samples) | set(no_inoc_samples)
for library in libraries:
    print(library)
    exp_df =  gene_df[gene_df.library == library].copy()
    samples_in_library = set(exp_df.sampleID.unique())
    library_samples = [s for s in good_samples
                       if s in samples_in_library and s not in excluded_samples]
    fit, res, vst = analyze_library2(exp_df, sample_id="sampleID",
                              good_samples=library_samples,
                              dnaid=library.replace("_", "-"), experiment='',
                              control_file=control_file,
                              to_filter=1000, outdir=outdir)
    # Positional renames: they assume analyze_library2 returns the columns in exactly this order.
    fit.columns = ['gene', 'baseMean', 'log2FC', 'lfcSE', 'stat', 'lfc_pvalue', 'lfc_padj', 'day', 'n_samples']
    res.columns = ['gene', 'day', 'gene_FC', 'sigma', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']
    final = fit[['gene', 'log2FC', 'lfcSE', 'lfc_pvalue', 'lfc_padj', 'day', 'n_samples']].merge(res[['gene', 'day', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']], how='outer', on=['gene', 'day'])
    final = final[['gene', 'day', 'log2FC', 'lfcSE', 'lfc_pvalue', 'lfc_padj', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']].assign(library=library)
    results.append(final)
    vsts[library] = vst

final_m1_1000 = pd.concat(results)

In [None]:
import pickle

# Context manager guarantees the file handle is closed even if pickling fails.
with open(Path(outdir)/"27-07-vsts.pkl", "wb") as a_file:
    pickle.dump(vsts, a_file)



In [None]:

# results = []
# for library in libraries:
#     print(library)
#     exp_df =  gene_df[gene_df.library == library].copy()
#     library_samples = [s for s in good_samples if s in exp_df.sampleID.unique() and s not in skewed_samples  and s not in no_inoc_samples]
#     fit, res = analyze_library2(exp_df, sample_id="sampleID", 
#                               good_samples=library_samples, 
#                               dnaid=library.replace("_", "-"), experiment='', 
#                               control_file=control_file, 
# #                              to_filter=100, outdir=outdir)
#     fit.columns = ['gene', 'baseMean', 'log2FC', 'lfcSE', 'stat', 'lfc_pvalue', 'lfc_padj', 'day', 'n_samples']
#     res.columns = ['gene', 'day', 'gene_FC', 'sigma', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']
#     final = fit[['gene', 'log2FC', 'lfcSE', 'lfc_pvalue', 'lfc_padj', 'day', 'n_samples']].merge(res[['gene', 'day', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']], how='outer', on=['gene', 'day'])
#     final = final[['gene', 'day', 'log2FC', 'lfcSE', 'lfc_pvalue', 'lfc_padj', 'z-score', 'CI', 'zscore_pval', 'zscore_padj']].assign(library=library)
#     results.append(final)
# final_m1_100 = pd.concat(results)

In [None]:
final_m1_1000.to_csv(Path(outdir)/'27-07-results.csv')

In [None]:
final_m1_1000[final_m1_1000.gene.str.len() < 10].gene.nunique()

In [None]:
final_m1_1000[(final_m1_1000.gene.str.len() < 10) & (final_m1_1000.zscore_padj<0.05)].groupby('day').gene.nunique()

In [None]:
final_m1_1000[(final_m1_1000.gene.str.len() < 10) & (final_m1_1000.zscore_padj<0.05)].groupby(['library','day']).gene.nunique()

In [None]:
# NOTE(review): in the visible notebook final_m1_100 is only assigned inside a
# commented-out cell, so this line raises NameError on a fresh kernel.
final_m1_100[final_m1_100.gene.str.len() < 10].gene.nunique()

In [None]:
# NOTE(review): in the visible notebook final_m1_100 is only assigned inside a
# commented-out cell, so this line raises NameError on a fresh kernel.
final_m1_100[(final_m1_100.gene.str.len() < 10) & (final_m1_100.zscore_padj<0.05)].groupby('day').gene.nunique()

In [None]:
# Smallest sstart per (ShortName, library, sseqid); ShortName is exposed as 'gene'.
approx_pos = (
    cnt_df[['ShortName','library',  'sstart', 'sseqid']]
    .drop_duplicates()
    .groupby(['ShortName',  'library', 'sseqid'])
    .sstart.min()
    .reset_index()
    .rename(columns={'ShortName': 'gene'})
)

In [None]:
approx_pos.to_csv(Path(outdir)/"approx_pos.csv")

In [None]:
cnt_df[['ShortName','library',  'sstart', 'sseqid']][cnt_df.ShortName == 'siiE'].drop_duplicates()

In [None]:
final_m1_1000_pos = final_m1_1000.merge(approx_pos, on = ['gene', 'library'] )

In [None]:
final_m1_1000_pos

## Remove skewed and samples with no inoculum, filter 1000

In [None]:
test = final_m1_1000_pos[(final_m1_1000_pos.day == 'd2')&(final_m1_1000_pos.sseqid == 'FQ312003.1')]
test2 = test[test.zscore_padj < 0.05]

In [None]:
test2[(test2.sstart > 1300000) & (test2.sstart<1500000)]

In [None]:
# Day d1, sequence FQ312003.1: CI against sstart (grey = all rows, red = rows with
# zscore_padj < 0.05 and |log2(CI)| > 1).
is_d1_on_seq = (final_m1_1000_pos.day == 'd1') & (final_m1_1000_pos.sseqid == 'FQ312003.1')
test = final_m1_1000_pos[is_d1_on_seq]
is_hit = (test.zscore_padj < 0.05) & (np.log2(test.CI).abs() > 1)
test2 = test[is_hit]

plt.figure(figsize=(40, 10))
for frame, colour, opacity in ((test, 'grey', 0.2), (test2, 'red', 0.5)):
    plt.scatter(frame.sstart, frame.CI, color=colour, alpha=opacity)
plt.yscale('log')

In [None]:
# Day d2, sequence FQ312003.1: CI against sstart (grey = all rows, red = rows with
# zscore_padj < 0.05 and |log2(CI)| > 1).
is_d2_on_seq = (final_m1_1000_pos.day == 'd2') & (final_m1_1000_pos.sseqid == 'FQ312003.1')
test = final_m1_1000_pos[is_d2_on_seq]
is_hit = (test.zscore_padj < 0.05) & (np.log2(test.CI).abs() > 1)
test2 = test[is_hit]

plt.figure(figsize=(40, 10))
for frame, colour, opacity in ((test, 'grey', 0.2), (test2, 'red', 0.5)):
    plt.scatter(frame.sstart, frame.CI, color=colour, alpha=opacity)
plt.yscale('log')

In [None]:
final_m1_1000_pos.sseqid.unique()

In [None]:
test2

In [None]:
import seaborn as sns
test = final_m1_1000_pos[(final_m1_1000_pos.sseqid == 'FQ312003.1')]
test2 = test[(test.zscore_padj < 0.05) & (abs(np.log2(test.CI)) > 1)].sort_values('day')
plt.figure(figsize = (40, 10))
sns.stripplot(x = test2.sstart, y=test2.day)



In [None]:
df_final2 = test[['CI', 'day', 'gene', 'library']].pivot(index=['gene', 'library'], columns='day', values='CI').reset_index().set_index('gene')

In [None]:
df_final2.dropna()

In [None]:
# Day d3, sequence FQ312003.1: CI against sstart (grey = all rows, red = rows with
# zscore_padj < 0.05 and |log2(CI)| > 1).
is_d3_on_seq = (final_m1_1000_pos.day == 'd3') & (final_m1_1000_pos.sseqid == 'FQ312003.1')
test = final_m1_1000_pos[is_d3_on_seq]
is_hit = (test.zscore_padj < 0.05) & (np.log2(test.CI).abs() > 1)
test2 = test[is_hit]

plt.figure(figsize=(40, 10))
for frame, colour, opacity in ((test, 'grey', 0.2), (test2, 'red', 0.5)):
    plt.scatter(frame.sstart, frame.CI, color=colour, alpha=opacity)
plt.yscale('log')

In [None]:
# Day d4, sequence FQ312003.1: CI against sstart (grey = all rows, red = rows with
# zscore_padj < 0.05 and |log2(CI)| > 1).
is_d4_on_seq = (final_m1_1000_pos.day == 'd4') & (final_m1_1000_pos.sseqid == 'FQ312003.1')
test = final_m1_1000_pos[is_d4_on_seq]
is_hit = (test.zscore_padj < 0.05) & (np.log2(test.CI).abs() > 1)
test2 = test[is_hit]

plt.figure(figsize=(40, 10))
for frame, colour, opacity in ((test, 'grey', 0.2), (test2, 'red', 0.5)):
    plt.scatter(frame.sstart, frame.CI, color=colour, alpha=opacity)
plt.yscale('log')

In [None]:
# Barcode-level Method 1 run with skewed samples and no_inoc_samples excluded.
# NOTE(review): only library_10_2 is analysed and to_filter is 100, while the
# nearest preceding section heading says "filter 1000" — confirm which was intended.
results = []
# Built once so membership tests do not re-scan the lists for every sample.
excluded_samples = set(skewed_samples) | set(no_inoc_samples)
for library in ['library_10_2']:
    print(library)
    lib_df = cnt_df[cnt_df.library == library].copy()
    #Removing some noise
    lib_df = lib_df[~((lib_df.libcnt.isna()) & (lib_df.phenotype.isna()))]
    samples_in_library = set(lib_df.sampleID.unique())
    library_samples = [s for s in good_samples
                       if s in samples_in_library and s not in excluded_samples]
    fit, res = analyze_library(lib_df, sample_id="sampleID",
                          good_samples=library_samples,
                          dnaid=library.replace("_", "-"), experiment='',
                          control_file=control_file,
                          to_filter=100, outdir=outdir)
    results.append(res)

method1_skewed_removed = pd.concat(results)

In [None]:
# NOTE(review): m1_b_res is not assigned anywhere in the visible notebook; unless
# one of the star imports provides it, this line raises NameError.
m1_comp = method1_skewed_removed.merge(m1_b_res, on=['gene', 'day'])

In [None]:
m1_comp

In [None]:
plt.scatter(m1_comp.ci_x, m1_comp.ci_y)
plt.xscale('log')
plt.yscale('log')

## Remove skewed and samples with no inoculum, filter 100

In [None]:
# Barcode-level Method 1 over every library, to_filter=100, with skewed samples
# and no_inoc_samples excluded.
results = []
# Built once so membership tests do not re-scan the lists for every sample.
excluded_samples = set(skewed_samples) | set(no_inoc_samples)
for library in libraries:
    print(library)
    lib_df = cnt_df[cnt_df.library == library].copy()
    #Removing some noise
    lib_df = lib_df[~((lib_df.libcnt.isna()) & (lib_df.phenotype.isna()))]
    samples_in_library = set(lib_df.sampleID.unique())
    library_samples = [s for s in good_samples
                       if s in samples_in_library and s not in excluded_samples]
    fit, res = analyze_library(lib_df, sample_id="sampleID",
                          good_samples=library_samples,
                          dnaid=library.replace("_", "-"), experiment='',
                          control_file=control_file,
                          to_filter=100, outdir=outdir)
    results.append(res)

method1_skewed_removed_100 = pd.concat(results)

In [None]:
# Barcode-level Method 1 over every library, to_filter=1000. Skewed samples are
# kept here; only no_inoc_samples are excluded.
results = []
# Built once so membership tests do not re-scan the list for every sample.
no_inoc_set = set(no_inoc_samples)
for library in libraries:
    print(library)
    lib_df = cnt_df[cnt_df.library == library].copy()
    #Removing some noise
    lib_df = lib_df[~((lib_df.libcnt.isna()) & (lib_df.phenotype.isna()))]
    samples_in_library = set(lib_df.sampleID.unique())
    library_samples = [s for s in good_samples
                       if s in samples_in_library and s not in no_inoc_set]
    fit, res = analyze_library(lib_df, sample_id="sampleID",
                          good_samples=library_samples,
                          dnaid=library.replace("_", "-"), experiment='',
                          control_file=control_file,
                          to_filter=1000, outdir=outdir)
    results.append(res)

method1_with_skewed = pd.concat(results)

In [None]:
method1_skewed_removed_100.to_csv(Path(outdir)/"21-07-method1_skewed_removed_100.csv")

In [None]:
method1_skewed_removed.to_csv(Path(outdir)/"21-07-method1_skewed_removed.csv")
method1_with_skewed.to_csv(Path(outdir)/"21-07-method1_with_skewed.csv")

# Method 2: <a id='Method-2'></a>

[Back to the start](#start)

In [None]:
# Method 2 without skewed samples: per library, derive barcode/gene/WT/ssaV_invG
# fitness, divide by the per-sample WT median to get CI values, and collect the
# melted tables plus the per-library result tables.
excluded_samples = set(skewed_samples) | set(no_inoc_samples)
good_samples_noskew = [s for s in good_samples if s not in excluded_samples]
vsts = {}
fits = []
cis = []
wt_fits = []
ssa_ci = []
resultsFitList = []
resultsCIList=[]
# Fitness Results
for library in libraries:
    print(library)
    library_vst, library_barcode_fitness = get_barcode_fitness_by_library(cnt_df, library, good_samples_noskew, outdir, filter_below=1000)
    library_gene_fitness = get_gene_fitness_by_library(library_barcode_fitness, annotation_df)
    library_wt_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='wt')
    library_ssa_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='ssaV_invG')
    # Per-sample WT median, computed once (it used to be recomputed for every
    # sample column inside the apply below, and once more for the ssaV_invG ratio).
    wt_median = library_wt_fitness.median()
    library_gene_ci = library_gene_fitness.set_index('ShortName').apply(lambda x: x / wt_median[x.name]).reset_index()
    library_ssa_ci = library_ssa_fitness.median()/wt_median
    meltGeneFit = melt_sampleID(library_gene_fitness, idVar=['ShortName'], value_name='fitness')
    meltWtFit = melt_sampleID(library_wt_fitness, idVar=['barcode', 'phenotype', 'conc'], value_name='fitness')
    meltGeneCI = melt_sampleID(library_gene_ci, idVar=['ShortName'], value_name='ci')
    resultsFit = get_library_results(meltGeneFit, meltWtFit, library)
    resultsCI = get_library_results_ci(meltGeneCI, library_ssa_ci, library)

    fits.append(meltGeneFit.assign(library=library))
    cis.append(meltGeneCI.assign(library=library))
    wt_fits.append(meltWtFit.assign(library=library))
    resultsFitList.append(resultsFit)
    resultsCIList.append(resultsCI)
    ssa_ci.append(pd.DataFrame(library_ssa_ci, columns=['ssa_ci']).assign(library=library))
    vsts[library]= library_vst

m2_fits_no_skew = pd.concat(fits)
m2_ci_no_skew = pd.concat(cis)
m2_wt_fits_no_skew = pd.concat(wt_fits)
m2_results_fit_no_skew = pd.concat(resultsFitList)
m2_results_ci_no_skew = pd.concat(resultsCIList)
m2_ssa_ci_no_skew = pd.concat(ssa_ci)

In [None]:
sample_map = {s:['median'] for s in library_vst}
sample_map

In [None]:
genes

In [None]:
# PCA of library_14_2 on per-ShortName medians: every sample column is aggregated
# with 'median', the two-level column labels are flattened back to sample names,
# and only the 50 highest-variance rows are kept.
# get_pca_df and plotPCA are defined in a later cell of this notebook.
lib_14_2_vst = vsts['library_14_2']
sample_map = {s: ['median'] for s in lib_14_2_vst}
annotated_vst = (
    lib_14_2_vst.reset_index()
    .merge(annotation_df, on='barcode')
    .drop(['locus_tag', 'phenotype', 'conc'], axis=1)
    .set_index('barcode')
)
test_gene = annotated_vst.groupby('ShortName').agg(sample_map)
test_gene.columns = [sample for sample, _ in test_gene.columns]
genes = test_gene.var(axis=1).sort_values(ascending=False).head(50).index
test_gene = test_gene.loc[genes]
df, pc1, pc2 = get_pca_df(test_gene)
plotPCA(df, pc1,pc2)

In [None]:
from sklearn.decomposition import PCA

def get_pca_df(library_vst, num_genes=500):
    """Run a two-component PCA on the highest-variance rows of ``library_vst``.

    Parameters
    ----------
    library_vst : pandas.DataFrame
        One column per sample. Column labels must split on "_" into exactly four
        fields, which are named mouse, day, dnaid and experiment.
    num_genes : int
        Number of highest-variance rows passed to the PCA.

    Returns
    -------
    tuple
        ``(pDf, pc1_var, pc2_var)`` exactly as returned by ``find_pc1_pc2``
        (defined in a later cell of this notebook).
    """
    var_bcs = library_vst.var(axis=1).sort_values(ascending=False).head(num_genes).index
    df = library_vst.loc[var_bcs]
    # Build the sample metadata straight from the column labels. The previous
    # version transposed the whole matrix and relied on reset_index() producing a
    # column literally called 'index', which fails when the column axis has a name.
    sample_ids = pd.Series(library_vst.columns, name='sampleID')
    new = sample_ids.str.split("_", expand=True)
    new.columns = ['mouse', 'day', 'dnaid', 'experiment']
    meta = new.set_index(sample_ids)
    pDf, pc1_var, pc2_var = find_pc1_pc2(df, meta)
    return pDf, pc1_var, pc2_var


def plotPCA(pDf, pc1, pc2, title=""):
    """Interactive PC1-vs-PC2 scatter: colour by day, marker symbol by experiment.

    ``pDf`` must provide the columns PC1, PC2, day, experiment and mouse; ``pc1``
    and ``pc2`` are only interpolated into the axis labels as percentages.
    Returns the plotly figure.

    NOTE(review): a later cell of this notebook defines another ``plotPCA`` with a
    different signature, which replaces this one once that cell has run.
    """
    axis_labels = {'PC1': f'PC1, {pc1}%',
                   'PC2': f'PC2, {pc2}%',
                   'day': 'Day',
                   'experiment': 'Experiment'}
    marker_style = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))

    fig = px.scatter(pDf.sort_values('day'), x='PC1', y='PC2',
                     color='day', symbol='experiment', hover_data=['mouse'],
                     template='simple_white', title=title,
                     color_discrete_sequence=px.colors.qualitative.G10,
                     labels=axis_labels)
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    fig.update_layout(font_family="Arial", font_size=14, title_font_size=24, title_x=0.5)
    return fig

In [None]:
pDf, pc1, pc2 = get_pca_df(vsts['library_10_2'], 50)
plotPCA(pDf, pc1, pc2)

In [None]:
figs = []

for library in libraries:
    pDf, pc1, pc2 = get_pca_df(vsts[library])
    figs.append(plotPCA(pDf, pc1, pc2, library))
                


In [None]:
# Write every PCA figure into one HTML file.
# Opened with 'w' rather than 'a': in append mode each re-run of this cell added
# another copy of every figure to the existing file.
with open(Path(outdir)/'PCA_barcode_abundance.html', 'w') as f:
    for fig in figs:
        f.write(fig.to_html())
    

In [None]:
figs[0]

In [None]:
figs[1]

In [None]:
figs[2]

In [None]:
figs[3]

In [None]:
test_gene.corr()

In [None]:
import seaborn as sns
# clustermap creates its own figure (sized by figsize), so the separate
# plt.figure(...) call only produced an extra empty figure and has been dropped.
# Note that `ax` is the object returned by clustermap, not a matplotlib Axes.
ax = sns.clustermap(
    test_gene,
    cmap=sns.diverging_palette(10, 220, n=256),
    figsize=(20,20)
)

In [None]:
def find_pc1_pc2(df, meta):
    """Fit a two-component PCA on the transpose of ``df`` and attach ``meta``.

    The columns of ``df`` become the PCA observations. Returns a tuple
    ``(scores, pc1_var, pc2_var)``: the PC1/PC2 scores, indexed like the columns of
    ``df`` and joined to ``meta`` on the index, followed by the percentage of
    variance explained by each component (rounded to two decimals).
    """
    observations = df.T
    model = PCA(n_components=2)
    scores = model.fit_transform(observations)
    score_df = pd.DataFrame(data=scores, columns=['PC1', 'PC2']).set_index(observations.index)

    explained = model.explained_variance_ratio_
    pc1_var = round(explained[0] * 100, 2)
    pc2_var = round(explained[1] * 100, 2)
    return score_df.merge(meta, left_index=True, right_index=True), pc1_var, pc2_var


def plotPCA(pDf, pc1_var, pc2_var, colorby, col, nameby="", el=False):
    """Static PC1-vs-PC2 scatter with one colour per value of ``pDf[colorby]``.

    Parameters
    ----------
    pDf : pandas.DataFrame
        Must provide PC1, PC2, the ``colorby`` column and, if given, ``nameby``.
    pc1_var, pc2_var
        Values interpolated into the axis labels as percentages.
    colorby : str
        Column whose unique values define the groups.
    col : sequence
        Colours, paired with the groups in order; must be at least as long as the
        number of groups (checked with ``assert``).
    nameby : str
        Optional column used to annotate every point.
    el : bool
        If true, ``plot_point_cov`` is called for each group.
        NOTE(review): plot_point_cov is not defined in the visible notebook;
        presumably it comes from one of the star imports — confirm.

    Returns
    -------
    matplotlib.figure.Figure

    NOTE(review): this redefines ``plotPCA``; an earlier cell defines a plotly
    version with a different signature under the same name.
    """
    sns.set_style("ticks")
    sns.set_context("notebook", font_scale=2.2)
    group = pDf[colorby].unique()
    assert len(group) <= len(col)
    fig = plt.figure(figsize=(25, 15))
    for g, c in zip(group, col):
        df = pDf[pDf[colorby] == g]
        # 1-D arrays: annotate() and the covariance helper then receive plain
        # scalars instead of 1-element arrays.
        x, y = df["PC1"].to_numpy(), df["PC2"].to_numpy()
        plt.scatter(x, y, c=c, s=150, label=g)
        if el:
            pts = np.column_stack([x, y]).astype(float)
            plot_point_cov(pts, nstd=2, alpha=0.1, color=c)
        if nameby:
            for label, pc1, pc2 in zip(df[nameby], x, y):
                plt.annotate(label, xy=(pc1, pc2), xytext=(-5, 7), textcoords="offset points",fontsize=14)
    # Axis labels and legend do not depend on the group, so they are set once
    # after the loop instead of on every iteration.
    plt.xlabel('Principal Component 1, {} %'.format(pc1_var), )
    plt.ylabel('Principal Component 2, {} %'.format(pc2_var), )
    if len(group):
        plt.legend(frameon=True)
    return fig


In [None]:
test_df.columns = [c[1] for c in test_df.columns]
test_df

In [None]:
from sklearn.decomposition import PCA

# NOTE(review): the visible cells build vsts as a dict keyed by library name, so
# vsts[0] raises KeyError against that dict; test_df is not assigned anywhere in
# the visible notebook. This cell looks like it targets an older shape of vsts.
new = vsts[0].sampleID.str.split("_", expand=True)
new.columns = ['mouse', 'day', 'dnaid', 'experiment']
meta = pd.concat([vsts[0][['library', 'sampleID']], new], axis=1).set_index('sampleID')

pDf, pc1_var, pc2_var = find_pc1_pc2(test_df, meta)



In [None]:
meta[meta.day == 'd0']

In [None]:
import seaborn as sns
# Sample-by-sample correlation of gene fitness, leaving out columns whose name
# contains 'd0'.
# - ShortName is moved to the index so that .corr() only sees the sample columns.
# - clustermap creates its own figure, so the intended size is passed via figsize
#   instead of a separate plt.figure(...) call, which only produced an empty figure.
# - square=True is dropped: it is a heatmap option that does not fit clustermap's
#   multi-axes layout.
sample_fitness = library_gene_fitness.set_index('ShortName')
non_d0_columns = [c for c in sample_fitness.columns if 'd0' not in c]
ax = sns.clustermap(
    sample_fitness[non_d0_columns].corr(),
    cmap=sns.diverging_palette(10, 220, n=256),
    figsize=(20,30)
)

In [None]:
library_gene_fitness.corr()

In [None]:
# Method 2 without skewed samples.
# NOTE(review): this recomputes the same m2_*_no_skew frames as an earlier Method 2
# cell of this notebook (that one additionally fills vsts) — consider keeping one.
excluded_samples = set(skewed_samples) | set(no_inoc_samples)
good_samples_noskew = [s for s in good_samples if s not in excluded_samples]
fits = []
cis = []
wt_fits = []
ssa_ci = []
resultsFitList = []
resultsCIList=[]
# Fitness Results
for library in libraries:
    print(library)
    library_vst, library_barcode_fitness = get_barcode_fitness_by_library(cnt_df, library, good_samples_noskew, outdir, filter_below=1000)
    library_gene_fitness = get_gene_fitness_by_library(library_barcode_fitness, annotation_df)
    library_wt_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='wt')
    library_ssa_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='ssaV_invG')
    # Per-sample WT median, computed once (it used to be recomputed for every
    # sample column inside the apply below, and once more for the ssaV_invG ratio).
    wt_median = library_wt_fitness.median()
    library_gene_ci = library_gene_fitness.set_index('ShortName').apply(lambda x: x / wt_median[x.name]).reset_index()
    library_ssa_ci = library_ssa_fitness.median()/wt_median
    meltGeneFit = melt_sampleID(library_gene_fitness, idVar=['ShortName'], value_name='fitness')
    meltWtFit = melt_sampleID(library_wt_fitness, idVar=['barcode', 'phenotype', 'conc'], value_name='fitness')
    meltGeneCI = melt_sampleID(library_gene_ci, idVar=['ShortName'], value_name='ci')
    resultsFit = get_library_results(meltGeneFit, meltWtFit, library)
    resultsCI = get_library_results_ci(meltGeneCI, library_ssa_ci, library)

    fits.append(meltGeneFit.assign(library=library))
    cis.append(meltGeneCI.assign(library=library))
    wt_fits.append(meltWtFit.assign(library=library))
    resultsFitList.append(resultsFit)
    resultsCIList.append(resultsCI)
    ssa_ci.append(pd.DataFrame(library_ssa_ci, columns=['ssa_ci']).assign(library=library))

m2_fits_no_skew = pd.concat(fits)
m2_ci_no_skew = pd.concat(cis)
m2_wt_fits_no_skew = pd.concat(wt_fits)
m2_results_fit_no_skew = pd.concat(resultsFitList)
m2_results_ci_no_skew = pd.concat(resultsCIList)
m2_ssa_ci_no_skew = pd.concat(ssa_ci)

In [None]:

# Method 2 on every sample in good_samples (skewed samples are NOT removed here).
fits = []
cis = []
wt_fits = []
ssa_ci = []
resultsFitList = []
resultsCIList=[]
# Fitness Results
for library in libraries:
    print(library)
    library_vst, library_barcode_fitness = get_barcode_fitness_by_library(cnt_df, library, good_samples, outdir, filter_below=1000)
    library_gene_fitness = get_gene_fitness_by_library(library_barcode_fitness, annotation_df)
    library_wt_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='wt')
    library_ssa_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='ssaV_invG')
    # Per-sample WT median, computed once (it used to be recomputed for every
    # sample column inside the apply below, and once more for the ssaV_invG ratio).
    wt_median = library_wt_fitness.median()
    library_gene_ci = library_gene_fitness.set_index('ShortName').apply(lambda x: x / wt_median[x.name]).reset_index()
    library_ssa_ci = library_ssa_fitness.median()/wt_median
    meltGeneFit = melt_sampleID(library_gene_fitness, idVar=['ShortName'], value_name='fitness')
    meltWtFit = melt_sampleID(library_wt_fitness, idVar=['barcode', 'phenotype', 'conc'], value_name='fitness')
    meltGeneCI = melt_sampleID(library_gene_ci, idVar=['ShortName'], value_name='ci')
    resultsFit = get_library_results(meltGeneFit, meltWtFit, library)
    resultsCI = get_library_results_ci(meltGeneCI, library_ssa_ci, library)

    fits.append(meltGeneFit.assign(library=library))
    cis.append(meltGeneCI.assign(library=library))
    wt_fits.append(meltWtFit.assign(library=library))
    resultsFitList.append(resultsFit)
    resultsCIList.append(resultsCI)
    ssa_ci.append(pd.DataFrame(library_ssa_ci, columns=['ssa_ci']).assign(library=library))

m2_fits_skew = pd.concat(fits)
m2_ci_skew = pd.concat(cis)
m2_wt_fits_skew = pd.concat(wt_fits)
m2_results_fit_skew = pd.concat(resultsFitList)
m2_results_ci_skew = pd.concat(resultsCIList)
m2_ssa_ci_skew = pd.concat(ssa_ci)

In [None]:
m2_fits_no_skew.to_csv(Path(outdir)/"21-07-m2_fits_no_skew")
m2_ci_no_skew.to_csv(Path(outdir)/"21-07-m2_ci_no_skew")
m2_wt_fits_no_skew.to_csv(Path(outdir)/"21-07-m2_wt_fits_no_skew")
m2_results_fit_no_skew.to_csv(Path(outdir)/"21-07-m2_results_fit_no_skew")
m2_results_ci_no_skew.to_csv(Path(outdir)/"21-07-m2_results_ci_no_skew")
m2_ssa_ci_no_skew.to_csv(Path(outdir)/"21-07-m2_ssa_ci_no_skew")

In [None]:
m2_fits_skew.to_csv(Path(outdir)/"21-07-m2_fits_skew")
m2_ci_skew.to_csv(Path(outdir)/"21-07-m2_ci_skew")
m2_wt_fits_skew.to_csv(Path(outdir)/"21-07-m2_wt_fits_skew")
m2_results_fit_skew.to_csv(Path(outdir)/"21-07-m2_results_fit_skew")
m2_results_ci_skew.to_csv(Path(outdir)/"21-07-m2_results_ci_skew")
m2_ssa_ci_skew.to_csv(Path(outdir)/"21-07-m2_ssa_ci_skew")

In [None]:
m2_results_fit_no_skew.sample(10)

In [None]:
# CI Results: 
library_ssa_fitness = get_wt_fitness_by_library(library_barcode_fitness, annotation_df, phenotype='ssaV_invG')

meltSsaFit = melt_sampleID(library_ssa_fitness, idVar=['barcode', 'phenotype', 'conc'], value_name='fitness')

In [None]:
library_gene_fitness

In [None]:
library_gene_fitness

In [None]:
meltGeneFit

In [None]:
meltSsaFit[['sampleID', 'fitness']].set_index('sampleID')/meltWtFit[['sampleID', 'fitness']].set_index('sampleID')

In [None]:
meltSsaFit

In [None]:
library_ssa_fitness

In [None]:
y = meltGeneFit[['ShortName', 'day']].drop_duplicates()
y[y.day !='d0'].shape

In [None]:
# NOTE(review): every visible assignment to `results` creates a list, which has no
# .merge — confirm which results frame was meant here.
x = results.merge(meltGeneFit[['ShortName', 'day']].drop_duplicates(), on=['ShortName', 'day'])

In [None]:
results.shape

In [None]:
# Method 2 (method2_analysis2) for library_10_2 on good samples minus skewed ones.
fit_dfs = []
gene_fit_dfs = []
ci_dfs = []
res_dfs = []
wt_fit_dfs = []
ssa_ci_dfs = []

# Built once so membership tests do not re-scan the list for every sample.
skewed_set = set(skewed_samples)
for library in ['library_10_2']:
    print(library)
    lib_df = cnt_df[cnt_df.library == library].copy()
    #Removing some noise
    lib_df = lib_df[~((lib_df.libcnt.isna()) & (lib_df.phenotype.isna()))]
    samples_in_library = set(lib_df.sampleID.unique())
    library_samples = [s for s in good_samples
                       if s in samples_in_library and s not in skewed_set]
    sdf, edf, design = generate_DE_dataset(lib_df, library_samples, sample_id='sampleID', filter_below=1000)
    _, vst_df = get_fitness_results(outdir, library.replace("_", "-"), '', sdf, edf, design)
    method2_results = method2_analysis2(vst_df, annotation_df, library_samples, sample_id='sampleID', hits=0.05)

    # Tag every non-empty result table with its library (done in place).
    for df in method2_results:
        if not df.empty:
            df['library'] = library
    all_fitness_df, gene_fitness_df, ci_df, results_df, wt_fitness_df, ssa_ci_df = method2_results
    fit_dfs.append(all_fitness_df)
    gene_fit_dfs.append(gene_fitness_df)
    ci_dfs.append(ci_df)
    res_dfs.append(results_df)
    wt_fit_dfs.append(wt_fitness_df)
    ssa_ci_dfs.append(ssa_ci_df)

fit2 = pd.concat(fit_dfs)
fit2_gene = pd.concat(gene_fit_dfs)
wt_fit2 = pd.concat(wt_fit_dfs)
ssa_ci2 = pd.concat(ssa_ci_dfs)
# ci2 is read by a later cell but was never assembled; built here following the
# same pattern as the frames above.
# NOTE(review): confirm that ci_dfs is the intended source for ci2.
ci2 = pd.concat(ci_dfs)

In [None]:
res2 = pd.concat(res_dfs)

In [None]:
res2.sample(5, random_state=100)

In [None]:
# Non-null padj values per day. display() is needed because a bare expression is
# only rendered when it is the last statement of a cell, so this result used to be
# computed and silently discarded.
display(results_df.groupby('day').padj.count())
#print('Tested 1888 genes/barcodes')
# Filter on ci_padj once, then count rows per day.
significant_rows = res2[res2.ci_padj < 0.05]
for day in ['d1', 'd2', 'd3', 'd4']:
    print(f'Number of significant hits on {day}: {(significant_rows.day == day).sum()}')
    
    

In [None]:
def get_control_df(fitness, phenotype='wt'):
    """Return the fitness of one control phenotype in long (one row per barcode/sample) form.

    Parameters
    ----------
    fitness : pandas.DataFrame
        Wide table with the columns barcode, phenotype, conc, library, day and
        inoculum plus one column per sample. Sample column names must split on "_"
        into four fields once a leading "unenriched_" has been rewritten to
        "unenriched-". The frame is NOT modified (the previous version renamed
        the caller's columns in place).
    phenotype : str
        Value of the ``phenotype`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Columns barcode, phenotype, conc, library, sampleID, fitness, mouse, day,
        dnaid, experiment. Columns containing any missing value among the selected
        rows are dropped before melting.
    """
    # rename() returns a new frame, leaving the caller's column labels untouched.
    renamed = fitness.rename(columns=lambda c: c.replace("unenriched_", "unenriched-"))
    # The wide 'day' column is dropped; 'day' is re-derived per sample from the sample ID below.
    renamed = renamed.drop(['day'], axis=1)

    wt = renamed[renamed.phenotype == phenotype].dropna(axis=1).drop(['inoculum'], axis=1)
    wt = wt.melt(id_vars=['barcode', 'phenotype', 'conc', 'library'], var_name='sampleID', value_name='fitness')
    new = wt.sampleID.str.split("_", expand=True)
    new.columns = ['mouse', 'day', 'dnaid', 'experiment']
    wt = wt.merge(new, left_index=True, right_index=True)
    return wt

wt = get_control_df(all_fitness_df)

In [None]:
wt

In [None]:
day = 'd3'
wt_d1 = wt[wt.day == day]
fig = px.box(wt_d1, x='mouse', y=np.log2(wt_d1['fitness']), color='mouse',  hover_data=['conc', 'fitness'],
        template='simple_white', title = f'WT-{day}',
              labels={"y": "log2(Fitness)",
                     "conc": "Dilution", "fitness": "Fitness"})
fig.add_hline(y=0, line_width=3, line_dash="dash")

In [None]:
wt_fit2

In [None]:
# Lower and upper quartile of WT fitness per sample.
filter_skewed = wt.groupby('sampleID').agg({'fitness': [lambda x: x.quantile(0.25),  lambda x: x.quantile(0.75)]}).reset_index()
filter_skewed.columns = ['sampleID', 'lowQ', 'highQ']
# A sample is acceptable when its interquartile range reaches into (0.9, 1.1);
# everything else is marked for dropping. The previous expression
# `~(lowQ<1.1)&(highQ>0.9)` applied `~` to the first comparison only, so it flagged
# just the samples with lowQ >= 1.1 and never the ones skewed below 0.9.
iqr_overlaps_neutral = (filter_skewed.lowQ < 1.1) & (filter_skewed.highQ > 0.9)
filter_skewed = filter_skewed[~iqr_overlaps_neutral]
to_drop = filter_skewed.sampleID.values

In [None]:
to_drop

In [None]:
res2.loc['rfaI']

In [None]:
wt_fit2.groupby('day').wt.mean()

In [None]:
def gene_ranksums(gene_values, wt_values):
    """Return the p-value of scipy's rank-sum test comparing ``gene_values`` with ``wt_values``."""
    _statistic, p_value = ranksums(gene_values, wt_values)
    return p_value

In [None]:
gv = ci2[(ci2.ShortName == 'rfaI') & (ci2.day == 'd4')].CI.values
wv = ssa_ci2.loc[ssa_ci2.day == 'd4'].CI.values
gene_ranksums(gv, wv)

In [None]:
res2.loc['rfaI']

In [None]:
# Day 1

In [None]:
res1.columns

In [None]:
t.day.str.split("_")

In [None]:
ivars = ['gene', 'locus', 'num_barcodes', 'library', 'barcode', 'sstart', 'sseqid']
vvars = ['num_samples', 'fitness_mean', 'fitness_std', 'ci', 'zscore', 'pval', 'padj']
df_list = []
for v in vvars:
    t = res1.reset_index().melt(id_vars = ivars, value_vars=[c for c in res1.columns if v in c], value_name=v, var_name='day')
    t['day'] = t.day.str.split("_", expand=True)[0]
    df_list.append(t)
res1m = pd.concat(df_list, axis=1)
res1m = res1m.loc[:, ~res1m.columns.duplicated()]

In [None]:
def significant(padj, alpha=0.05):
    """Count the values in ``padj`` that are strictly below ``alpha``.

    The original cell held only the header ``def significant()`` with no colon or
    body (a syntax error). This body is reconstructed from how the function is used
    in the following cells — as a groupby aggregator over adjusted p-value columns.
    NOTE(review): confirm that a count of p-values below 0.05 is what was intended.

    Parameters
    ----------
    padj : pandas.Series
        Adjusted p-values; missing values never count as significant.
    alpha : float
        Significance threshold.

    Returns
    -------
    int
    """
    return int((padj < alpha).sum())

In [None]:
res1m.groupby(['library', 'day']).agg({'padj': [significant]})

In [None]:
res2.groupby(['library', 'day']).agg({'ci_padj': [significant]})

In [None]:
res1m[(res1m.library == 'library_10_2') & (res1m.day=='d1')].sort_values('padj').head(25)

In [None]:
wt_fit2[wt_fit2.library == 'library_11_1']

In [None]:
# VST transformation followed by method2_analysis for library_10_2.
for library in ['library_10_2']:
    print(library)
    lib_df = cnt_df[cnt_df.library == library].copy()
    # Built once so membership tests do not recompute unique() for every sample.
    samples_in_library = set(lib_df.sampleID.unique())
    library_samples = [s for s in good_samples if s in samples_in_library]
    # NOTE(review): the full good_samples list (not library_samples) is passed to
    # run_VST_transformation — confirm that this is intended.
    vst_df = run_VST_transformation(lib_df, library.replace("_", '-'), good_samples,
                                    outdir, sample_id='sampleID').set_index('barcode')
    method2_results = method2_analysis(vst_df, annotation_df, library_samples, sample_id='sampleID', hits=0.05)

In [None]:
vst_df.sample(5, random_state=5)

In [None]:
all_fitness_df, gene_fitness_df, ci_df, results_df, wt_fitness_df, ssa_ci_df = method2_results

In [None]:
results_df.sample(5, random_state=42)

In [None]:
results_df.ci_hits.sum()

In [None]:
cnt_df.head()

In [None]:
vst_test = cnt_df[(cnt_df.library=='library_10_2') & (cnt_df.day == 'd1')][['barcode', 'cnt', 'sampleID']].drop_duplicates()

vst_test = vst_test.set_index('barcode').pivot(columns='sampleID').fillna(0)
vst_test.columns = [c[1] for c in vst_test.columns]

In [None]:
vst_test

In [None]:
px.scatter(x=vst_test.mean(axis=1), y=vst_test.var(axis=1)/vst_test.mean(axis=1), log_x=True, log_y=True)

In [None]:
# step1: natural log of the counts (zero counts become -inf).
step1 = np.log(vst_test)
# step2: mean log count per row; step3: the same with non-finite rows removed.
step2 = step1.mean(axis=1)
step3 = step2.replace([np.inf, -np.inf], np.nan).dropna()

# step4: log count minus the row mean, keeping only rows that are finite in every
# column; step5: per-column median of those differences.
centred = step1.sub(step2, axis=0)
step4 = centred.replace([np.inf, -np.inf], np.nan).dropna()
step5 = step4.median()

In [None]:
# np.exp instead of step5.apply(math.exp): `math` is only imported in a later cell,
# so the previous form raised NameError when the notebook is run top to bottom.
scaling_factors = np.exp(step5)

In [None]:
scaling_factors

In [None]:
scaling_factors['ad926_d1_dnaid2017_TV4592A']

In [None]:
norm_vst = vst_test.apply(lambda x: x/scaling_factors[x.name])

In [None]:
import math
math.exp(7.69)

In [None]:
# Variance-to-mean ratio against the mean of the normalised counts. The ratio now
# divides by the normalised mean; it previously divided the normalised variance by
# the mean of the un-normalised vst_test.
px.scatter(x=norm_vst.mean(axis=1), y=norm_vst.var(axis=1)/norm_vst.mean(axis=1), log_x=True, log_y=True)