In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import plotnine as p9
import sys
sys.path.append("/Users/ansintsova/git_repos/nguyenb_tnseq/code/mbarq_analysis")
import quality_control as qc
import method1_analysis as m1
import os
import scipy 
from  statsmodels.stats.multitest import multipletests

In [None]:
# Input locations for this analysis.
dataDir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/08_21")
controlFile = dataDir.joinpath("controls.txt")

# Gene-level and barcode-level tables; the first CSV column is used as the index.
geneDf = pd.read_csv(dataDir.joinpath("15-10-filtered-gene-level.csv"), index_col=0)
bcDf = pd.read_csv(dataDir.joinpath("15-10-filtered-barcode-level.csv"), index_col=0)

In [None]:
# Make experiment labels unique across sequencing runs by prefixing them with the dnaid.
# The untouched label is kept in 'experiment_raw' and the prefixed one is always derived
# from it, so re-running this cell does not prepend the dnaid a second time.
if 'experiment_raw' not in geneDf.columns:
    geneDf['experiment_raw'] = geneDf['experiment']
geneDf['experiment'] = geneDf['dnaid'] + "-" + geneDf['experiment_raw']
geneDf.head()

In [None]:
# Restrict to one library and preview its counts as a feature x sample table.
test = geneDf.loc[geneDf['library'] == 'library_14_2']
t = (
    test.loc[:, ['sampleName', 'ShortName', 'barcode_cnt']]
    .pivot(index='ShortName', columns='sampleName')
    .head()
)
# pivot() leaves ('barcode_cnt', sample) column tuples; keep only the sample name.
t.columns = [sample for _, sample in t.columns]
t

In [None]:
# Takes already pre-filtered dataset, and sets up input for DESeq2

def generate_DE_dataset(exp_df, sample_id='sampleID', feat_id = 'ShortName'):
    """Build the sample metadata table and the count matrix used as DESeq2 input.

    exp_df: long-format table with one row per (sample, feature); must contain the
        columns named by `sample_id` and `feat_id`, plus 'barcode_cnt', 'mouse',
        'day', 'tissue', 'dnaid' and 'experiment'.
    sample_id: name of the column identifying a sample.
    feat_id: name of the column identifying a feature (gene or barcode).

    Returns (sample_data, expr_data, design):
        sample_data: metadata indexed by sample id, as returned by qc.check_sdf.
        expr_data: wide count table with a `feat_id` column followed by one column
            per sample, in the same order as sample_data's index; missing counts are 0.
        design: second value returned by qc.check_sdf.
    """
    metadata_cols = ['mouse', 'day', 'tissue', 'dnaid', 'experiment']
    # De-duplicate BEFORE moving the sample id into the index: drop_duplicates() only
    # compares columns, so doing it after set_index() would silently drop samples that
    # share identical metadata but have different ids.
    sample_data = (exp_df[[sample_id] + metadata_cols]
                   .drop_duplicates()
                   .set_index(sample_id))
    sample_data, design = qc.check_sdf(sample_data)

    expr_data = (exp_df[[sample_id, feat_id, 'barcode_cnt']]
                 .drop_duplicates()
                 .pivot(index=feat_id, columns=sample_id, values='barcode_cnt')
                 .fillna(0))
    expr_data.columns.name = None
    # Align the count columns with the (possibly filtered) sample metadata.
    expr_data = expr_data[list(sample_data.index)].reset_index()
    print(f"Number of unique experiments after filtering: {sample_data.experiment.nunique()}")
    print(f"Design: {design}")
    return sample_data, expr_data, design

In [None]:
# Gene-level DESeq2 inputs: a = sample metadata, b = count matrix, c = design.
a, b, c = generate_DE_dataset(exp_df=test, sample_id='sampleName', feat_id='ShortName')

In [None]:
# Spot-check five random samples of the metadata table.
a.loc[:, ['day', 'experiment']].sample(n=5)

In [None]:
def get_fitness_results(fitness_dir, experiment, sdf, edf, design, feat_id,
                        rscript='/Users/ansintsova/git_repos/nguyenb_tnseq/code/notebooks/07_2021/DEseq.R'):
    """Run the DESeq2 R script for one experiment and collect its output tables.

    fitness_dir: directory used for the temporary input files and the R outputs.
    experiment: prefix used for every file written to / read from `fitness_dir`.
    sdf: sample metadata table; needs 'day' and 'mouse' columns.
    edf: count table containing a `feat_id` column.
    design: design passed to the R script on the command line.
    feat_id: name of the feature column in `edf`.
    rscript: path of the R script to run (defaults to the previously hard-coded one).

    Returns (fitness, vst_counts):
        fitness: concatenation of all '<experiment>_fitness*' files found in
            `fitness_dir`, with a 'day' column taken from the last '_'-separated part
            of the file stem, an 'n_samples' column (number of distinct mice for that
            day in `sdf`) and the former index exposed as 'barcode'.
        vst_counts: contents of '<experiment>_vst.csv'.

    Raises ValueError if the R script produced no fitness files.
    All temporary inputs and R outputs are removed afterwards, also on failure.
    """
    fitness_dir = Path(fitness_dir)
    sdf_path = fitness_dir / f"{experiment}_sdf.csv"
    edf_path = fitness_dir / f"{experiment}_edf.csv"
    vst_path = fitness_dir / f"{experiment}_vst.csv"
    fitness_tag = f"{experiment}_fitness"

    def _fitness_files():
        # Every file in fitness_dir whose stem mentions this experiment's fitness tag.
        return [p for p in fitness_dir.iterdir() if fitness_tag in p.stem]

    sdf.to_csv(sdf_path)
    edf.set_index(feat_id).to_csv(edf_path)
    #rpath = Path(__file__).parent.absolute()
    try:
        # Build the argument list directly so paths containing whitespace stay intact;
        # the design is still split on whitespace exactly as before.
        cmd = ['Rscript', str(rscript), str(sdf_path), str(edf_path), *str(design).split()]
        print(" ".join(cmd))
        qc.run_command(cmd)

        fitness_files = _fitness_files()
        if not fitness_files:
            raise ValueError(f"No '{fitness_tag}' files were produced in {fitness_dir}")
        fitness = pd.concat(
            [pd.read_table(p, sep=' ').assign(day=p.stem.split("_")[-1]) for p in fitness_files])
        vst_counts = pd.read_csv(vst_path)
    finally:
        # Always clean up, otherwise stale fitness files from a failed run would be
        # picked up and concatenated by the next call.
        for leftover in [*_fitness_files(), sdf_path, edf_path, vst_path]:
            if leftover.exists():
                os.remove(leftover)

    n_samples = sdf.groupby('day').mouse.nunique().to_dict()
    fitness['n_samples'] = fitness.day.map(n_samples)
    fitness = fitness.reset_index().rename({'index': 'barcode'}, axis=1)
    return fitness, vst_counts

In [None]:
# Gene-level fitness. The VST counts get their own name: unpacking them into `c` would
# overwrite the design, so re-running this cell would pass a DataFrame as the design.
f, vst_counts = get_fitness_results(dataDir, 'library_14_2', a, b, c, 'ShortName')

In [None]:
# Expose a 'gene' column (if present) under the name 'barcode'.
f = f.rename(columns={'gene': 'barcode'})

In [None]:
# Significant features (padj < 0.1) whose identifier is shorter than 10 characters.
# Two fixes: the original parenthesisation evaluated `(mask & len(...)) < 10`, and it read
# `f.gene`, which the previous cell renames to 'barcode'.
# NOTE(review): "identifier shorter than 10 characters" is the presumed intent - confirm.
hits = f[(f.padj < 0.1) & (f.barcode.str.len() < 10)]
hits.sample(min(5, len(hits)))

In [None]:
def sigma(lfcSE):
    """Combine standard errors: square root of the summed squares, divided by their count."""
    n_estimates = len(lfcSE)
    sum_of_squares = lfcSE.pow(2).sum()
    return np.sqrt(sum_of_squares) / n_estimates


def calculate_2dist_zscore(u1, s1, u2, s2):
    """Z-score of the difference u1 - u2, scaled by sqrt(s1**2 + s2**2)."""
    difference = u1 - u2
    combined_spread = np.sqrt(s1 ** 2 + s2 ** 2)
    return difference / combined_spread


def to_list(x):
    """Collapse an iterable of barcodes into a single value.

    Returns the element itself when `x` holds exactly one item; otherwise returns the
    distinct items joined by ", " in sorted order.
    """
    bc_list = list(x)
    if len(bc_list) == 1:
        return bc_list[0]
    # Work from the materialised list (an iterator `x` is already exhausted here) and
    # sort so the output does not depend on set iteration order.
    return ", ".join(sorted(set(bc_list)))


def calculate_comparisons2(fitness, df, control_file):
    """
    Compare the fitness of every non-control feature with the wild-type controls, per day.

    fitness: DESeq2 output, log2FoldChange value for each feature comparing each time
        point with inoculum; needs the columns 'barcode', 'day', 'log2FoldChange', 'lfcSE'.
    df: not used by this function; kept so existing calls keep working.
    control_file: header-less tab-separated file read as the columns
        'barcode', 'phenotype', 'conc'; rows with phenotype 'wt' define the controls.

    Returns a DataFrame with the columns
    'gene', 'day', 'gene_FC', 'sigma', 'zscore', 'ci', 'pval', 'padj'.
    """
    controls = pd.read_table(control_file, names=['barcode', 'phenotype', 'conc'])
    controls_bc = controls[controls.phenotype == 'wt'].barcode.values

    is_control = fitness.barcode.isin(controls_bc)
    cntrl_df = fitness[is_control]
    gene_df = fitness[~is_control].rename({'barcode': 'ShortName'}, axis=1)

    # Per feature and day: mean/median log2FoldChange and combined standard error.
    gene_mean = gene_df.groupby(['ShortName', 'day']).agg(
            {'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]}).reset_index()
    gene_mean.columns = ['gene', 'day', 'gene_FC', 'gene_FC_median', 'sigma']

    # Same summary over all control features of a day.
    cntrl_mean = cntrl_df.groupby(['day']).agg({'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]})
    cntrl_mean.columns = ['cntrl_FC', 'cntrl_FC_median', 'cntrl_sigma']
    cntrl_mean = cntrl_mean.reset_index()
    gene_mean = gene_mean.merge(cntrl_mean, how='left', on='day')

    # Column-wise arithmetic instead of row-wise apply(): same values, and it also
    # works when the table is empty.
    gene_mean['zscore'] = calculate_2dist_zscore(
        gene_mean['gene_FC'], gene_mean['sigma'], gene_mean['cntrl_FC'], gene_mean['cntrl_sigma'])
    # Ratio of the feature's fold change to the control fold change on the linear scale.
    gene_mean['ci'] = 2 ** gene_mean['gene_FC'] / 2 ** gene_mean['cntrl_FC']

    results = gene_mean[['gene', 'day', 'gene_FC', 'sigma', 'zscore', 'ci']].copy()
    # Two-sided p-value from the standard normal, then Benjamini-Hochberg within each day.
    results['pval'] = scipy.stats.norm.sf(results['zscore'].abs()) * 2
    results['padj'] = results.groupby('day').pval.transform(lambda x: multipletests(x, alpha=0.05, method='fdr_bh')[1])
    return results

def final_fitness_table(fitness, exp_df, control_file, results):
    """Attach sample counts and barcode annotation to `results` and append control fitness.

    fitness: per-barcode fitness table with 'barcode', 'day' and 'n_samples' columns.
    exp_df: table providing 'barcode', 'locus_tag', 'ShortName' and 'library'.
    control_file: forwarded to get_control_fitness.
    results: per-gene comparison table with 'gene' and 'day' columns.

    Returns the annotated results stacked on top of the control fitness rows.
    """
    # NOTE(review): get_control_fitness is not defined in the visible part of this
    # notebook - confirm where it is expected to come from.
    annotation_cols = ['barcode', 'locus_tag', 'ShortName', 'library']
    barcode_info = exp_df[annotation_cols].drop_duplicates()
    fitness = fitness.merge(barcode_info, how='left', on=['barcode'])

    # Number of distinct barcodes per gene and library, plus the barcodes themselves.
    bc_per_gene = (
        fitness.groupby(['library', 'ShortName'])
        .agg({'barcode': ['nunique', to_list]})
        .reset_index()
    )
    bc_per_gene.columns = ['library', 'ShortName', 'num_barcodes', 'barcode']

    num_samples = fitness[['day', 'n_samples']].drop_duplicates()
    control_fit = get_control_fitness(fitness, control_file)

    with_sample_counts = results.merge(num_samples, how='left', on='day')
    fit_summary = with_sample_counts.merge(
        bc_per_gene, how='left', left_on='gene', right_on='ShortName')
    return pd.concat([fit_summary, control_fit])

In [None]:
# Gene-level comparison against the wild-type controls.
results = calculate_comparisons2(fitness=f, df=test, control_file=controlFile)

#final = final_fitness_table(f, test, controlFile, results)

In [None]:
# Gene-level hits on day 1 at 5% FDR.
results.loc[results['day'].eq('d1') & results['padj'].lt(0.05)]

In [None]:
def calculte_comparisons(fitness, df, control_file):
    """
    Aggregate barcode-level fitness to genes and compare each gene with the wild-type controls.

    fitness: DESeq2 output, log2FoldChange value for each barcode comparing each time
        point with inoculum; needs the columns 'barcode', 'day', 'log2FoldChange', 'lfcSE'.
    df: barcode-level table for 1 experiment and 1 dnaid; needs the columns
        'barcode', 'ShortName', 'phenotype' and 'locus_tag'.
    control_file: header-less tab-separated file read as the columns
        'barcode', 'phenotype', 'conc'; rows with phenotype 'wt' define the controls.

    Barcodes without a locus_tag that are not listed in the control file are compared
    individually and reported with the barcode in the 'gene' column.

    Returns a DataFrame with the columns
    'gene', 'day', 'gene_FC', 'gene_FC_median', 'sigma', 'zscore', 'ci', 'pval', 'padj'.
    """
    controls = pd.read_table(control_file, names=['barcode', 'phenotype', 'conc'])

    # Barcodes with no phenotype entry are treated as gene barcodes; keep only those
    # that are present in the fitness table.
    gene_bc = df[df.phenotype.isna()].barcode.values
    gene_df = fitness[fitness.barcode.isin(gene_bc)]
    # Add gene annotation to the fitness table. `df` repeats each barcode once per
    # sample, so de-duplicate the annotation first instead of multiplying the rows in
    # the merge and discarding them afterwards.
    gene_annotation = df[['barcode', 'ShortName']].drop_duplicates()
    gene_df = gene_df.merge(gene_annotation, how='left', on='barcode').drop_duplicates()

    # Per gene and day: mean/median log2FoldChange and combined standard error.
    gene_mean = gene_df.groupby(['ShortName', 'day']).agg(
        {'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]}).reset_index()
    gene_mean.columns = ['gene', 'day', 'gene_FC', 'gene_FC_median', 'sigma']

    # Same summary over all wild-type ('wt') control barcodes of a day.
    controls_bc = controls[controls.phenotype == 'wt'].barcode.values
    cntrl_df = fitness[fitness.barcode.isin(controls_bc)]
    cntrl_mean = cntrl_df.groupby(['day']).agg({'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]})
    cntrl_mean.columns = ['cntrl_FC', 'cntrl_FC_median', 'cntrl_sigma']
    cntrl_mean = cntrl_mean.reset_index()

    # Z-score and competitive index (CI) for each gene; column-wise arithmetic gives the
    # same values as a row-wise apply() and also works on empty tables.
    gene_mean = gene_mean.merge(cntrl_mean, how='left', on='day')
    gene_mean['zscore'] = calculate_2dist_zscore(
        gene_mean['gene_FC'], gene_mean['sigma'], gene_mean['cntrl_FC'], gene_mean['cntrl_sigma'])
    gene_mean['ci'] = 2 ** gene_mean['gene_FC'] / 2 ** gene_mean['cntrl_FC']
    result_cols = ['gene', 'day', 'gene_FC', 'gene_FC_median', 'sigma', 'zscore', 'ci']
    gene_mean = gene_mean[result_cols]

    # Barcodes that were not mapped to a gene and are not listed in the control file.
    all_cntrl_bc = controls.barcode.values
    others_bc = df[(df.locus_tag.isna()) & ~(df.barcode.isin(all_cntrl_bc))].barcode.values
    other_df = fitness[fitness.barcode.isin(others_bc)]
    if not other_df.empty:
        other_df = other_df.merge(cntrl_mean, how='left', on='day')
        # Each unmapped barcode is compared on its own, so its own fold change and
        # standard error stand in for the gene-level summaries.
        other_df['zscore'] = calculate_2dist_zscore(
            other_df['log2FoldChange'], other_df['lfcSE'], other_df['cntrl_FC'], other_df['cntrl_sigma'])
        other_df['ci'] = 2 ** other_df['log2FoldChange'] / 2 ** other_df['cntrl_FC']
        other_df['gene_FC'] = other_df['gene_FC_median'] = other_df['log2FoldChange']
        other_df['sigma'] = other_df['lfcSE']
        other_df = other_df[['barcode', 'day', 'gene_FC', 'gene_FC_median', 'sigma', 'zscore', 'ci']].rename(
            {'barcode': 'gene'}, axis=1)
        # Concatenate the gene and barcode results
        results = pd.concat([gene_mean, other_df])
    else:
        results = gene_mean.copy()

    # Two-sided p-value from the standard normal, then Benjamini-Hochberg within each day.
    results['pval'] = scipy.stats.norm.sf(results['zscore'].abs()) * 2
    results['padj'] = results.groupby('day').pval.transform(lambda x: multipletests(x, alpha=0.05, method='fdr_bh')[1])
    return results

In [None]:
# Make experiment labels unique across sequencing runs by prefixing them with the dnaid.
# The untouched label is kept in 'experiment_raw' and the prefixed one is always derived
# from it, so re-running this cell does not prepend the dnaid a second time.
if 'experiment_raw' not in bcDf.columns:
    bcDf['experiment_raw'] = bcDf['experiment']
bcDf['experiment'] = bcDf['dnaid'] + "-" + bcDf['experiment_raw']
bcDf.head()

# Restrict to one library and preview its counts as a barcode x sample table.
test2 = bcDf[bcDf.library == 'library_14_2']
t = test2[['sampleName', 'barcode', 'barcode_cnt']].drop_duplicates().pivot(index='barcode', columns='sampleName').head()
# pivot() leaves ('barcode_cnt', sample) column tuples; keep only the sample name.
t.columns = [c[1] for c in t.columns]
t

In [None]:
# Barcode-level DESeq2 inputs: a = sample metadata, b = count matrix, c = design.
a, b, c = generate_DE_dataset(exp_df=test2, sample_id='sampleName', feat_id='barcode')


In [None]:
# Barcode-level fitness. Note that `c` (the design on input) is rebound to the VST counts
# here, so the cell that creates a, b, c must be re-run before re-running this one.
f, c = get_fitness_results(
    dataDir, 'library_14_2', a, b, c, 'barcode'
)

In [None]:
# Barcode-level table indexed by gene name.
test3 = test2.set_index(keys='ShortName')

In [None]:
# Barcode-level comparison, aggregated to genes, against the wild-type controls.
results2 = calculte_comparisons(fitness=f, df=test2, control_file=controlFile)

In [None]:
# Barcode-level hits on day 1 at 5% FDR. The mask must be built from results2 itself;
# the original used the gene-level `results`, whose rows do not line up with results2.
results2[(results2.day == 'd1') & (results2.padj < 0.05)]

In [None]:
# Pair gene-level (_x) and barcode-level (_y) results for features present in both.
comp = pd.merge(results, results2, how='inner', on=['gene', 'day'])
# Keep features called significant at 5% FDR by at least one of the two approaches.
significant_either = comp['padj_x'].lt(0.05) | comp['padj_y'].lt(0.05)
comp_sig = comp.loc[significant_either]

In [None]:
# Agreement between gene-level and barcode-level z-scores.
fig, ax = plt.subplots(figsize=(8, 8))
sns.regplot(data=comp, x='zscore_x', y='zscore_y', ax=ax)
# Faint reference lines through the origin.
ax.hlines(0, -15, 8, color='gray', alpha=0.3)
ax.vlines(0, -22, 8, color='gray', alpha=0.3)
ax.set_xlim(-15, 8)
ax.set_ylim(-21, 7)
ax.set_xlabel('Z-scores (gene level)')
ax.set_ylabel('Z-scores (barcode level)')