In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns; floats get thousands separators and two decimals.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 100),
    ('display.float_format', lambda x: '{:,.2f}'.format(x)),
):
    pd.set_option(option_name, option_value)

# matplotlib: large figures, serif fonts, thicker lines.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

In [None]:
import scipy
from  statsmodels.stats.multitest import multipletests
import subprocess
import os

# Loading the data

In [None]:
# Project locations (all derived from the project root on the shared NFS volume).
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
dataDir = root / "scratch/08_21/counts/"
controls_file = root / "data/metadata/controls.txt"
outDir = root / "scratch/08_21/results/nguyenb"
metafile = root / "scratch/08_21/complete_metadata.tsv"
cnt_file = outDir / '09-11-2021-annotated_gene_counts_after_qc.csv'

# Read the count table and prefix each experiment label with the part of
# sampleID that precedes the first underscore.
cnts = (
    pd.read_csv(cnt_file, index_col=0)
    .assign(
        experiment=lambda counts: (
            counts['sampleID'].str.split('_', expand=True)[0] + "_" + counts['experiment']
        )
    )
)

# Setting up for DESeq2

## Creating datasets for DESeq2

In [None]:
# Takes an already pre-filtered dataset and sets up the input tables for DESeq2.
# Assumes the number of samples etc. has been checked and the correct design was specified.

def generate_DE_dataset(exp_df, sample_id='sampleID', feat_id = 'ShortName', cnt_col='barcode_cnt',  
                        other_cols = ('mouse', 'day', 'tissue', 'dnaid', 'experiment')):
    """Split a long-format count table into the two tables handed to DESeq2.

    Parameters
    ----------
    exp_df : pandas.DataFrame
        Long-format table containing ``sample_id``, ``feat_id``, ``cnt_col``
        and every column named in ``other_cols``.
    sample_id : str
        Column identifying a sample.
    feat_id : str
        Column identifying a feature (row of the count matrix).
    cnt_col : str
        Column holding the counts.
    other_cols : iterable of str
        Per-sample metadata columns to keep.

    Returns
    -------
    sample_data : pandas.DataFrame
        One row per distinct (sample, metadata) combination, indexed by ``sample_id``.
    expr_data : pandas.DataFrame
        ``feat_id`` column followed by one count column per sample, in the
        same order as ``sample_data.index``; missing combinations are 0.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if a (feature, sample) pair has more
        than one distinct count.
    """
    meta_cols = [sample_id, *other_cols]
    # De-duplicate BEFORE moving the sample id into the index: drop_duplicates
    # ignores the index, so doing it afterwards would collapse distinct samples
    # that happen to share identical metadata (e.g. replicate inocula).
    sample_data = exp_df[meta_cols].drop_duplicates().set_index(sample_id)

    expr_data = (exp_df[[sample_id, feat_id, cnt_col]].drop_duplicates()
                 .pivot(index=feat_id, columns=sample_id, values=cnt_col)
                 .fillna(0)
                 .rename_axis(columns=None))

    # Column order must match the row order of the sample table.
    expr_data = expr_data[list(sample_data.index)].reset_index()

    return sample_data, expr_data

## Running DESeq2 Script

In [None]:
def run_command(args):
    """Run an external command and raise if it exits with a non-zero status.

    stdout/stderr are not captured, so the child process inherits them from
    this process.

    Parameters
    ----------
    args : sequence of str
        Program and arguments, passed straight to ``subprocess.run``
        (no shell is involved).

    Returns
    -------
    subprocess.CompletedProcess
        The completed process, so callers can inspect ``returncode``/``args``.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero return code.
    """
    # check=True raises CalledProcessError itself; no try/except re-raise needed.
    return subprocess.run(args, check=True)
        

def get_fitness_results(fitness_dir, library, sdf, edf, design, r_path, feat_id ):
    """Write the DESeq2 input tables to disk and run the R script on them.

    The sample table and the count matrix are written as temporary CSV files
    into ``fitness_dir``, the R script is invoked via ``Rscript``, and the
    temporary files are removed again, even if the script fails.

    Parameters
    ----------
    fitness_dir : str or pathlib.Path
        Directory for the temporary input files; also passed to the R script
        as its last argument (presumably the output directory; confirm
        against the R script).
    library : str
        Library name; used as file prefix and passed to the R script.
    sdf : pandas.DataFrame
        Sample table, written with its index.
    edf : pandas.DataFrame
        Count table; ``feat_id`` becomes the index before writing.
    design : str
        Design string handed to the R script.
    r_path : str or pathlib.Path
        Path to the R script.
    feat_id : str
        Name of the feature column in ``edf``.

    Raises
    ------
    subprocess.CalledProcessError
        Propagated from ``run_command`` if Rscript exits non-zero.
    """
    sdf_path = Path(fitness_dir) / f"{library}_sdf.csv"
    edf_path = Path(fitness_dir) / f"{library}_edf.csv"
    # Build argv as a list instead of splitting a formatted string, so paths
    # or designs that contain whitespace stay single arguments.
    cmd = ['Rscript', str(r_path), str(sdf_path), str(edf_path),
           str(library), str(design), str(fitness_dir)]
    try:
        sdf.to_csv(sdf_path)
        edf.set_index(feat_id).to_csv(edf_path)
        print(" ".join(cmd))
        run_command(cmd)
    finally:
        # Always clean up the temporary inputs, including after a failed run.
        for tmp_path in (sdf_path, edf_path):
            if tmp_path.exists():
                os.remove(tmp_path)

## Calculate z-scores relative to wt tags

In [None]:
def sigma(lfcSE):
    """Combine standard errors: sqrt(sum(se_i ** 2)) / n.

    ``lfcSE`` is a pandas Series of standard errors; the result is the
    standard error of their mean when the entries are independent.
    """
    n_estimates = len(lfcSE)
    sum_of_squares = lfcSE.pow(2).sum()
    return np.sqrt(sum_of_squares) / n_estimates


def calculate_2dist_zscore(u1, s1, u2, s2):
    """Z-score of the difference between two means.

    ``u1``/``u2`` are the means and ``s1``/``s2`` their standard errors; the
    difference ``u1 - u2`` is divided by the combined error
    sqrt(s1 ** 2 + s2 ** 2).
    """
    mean_difference = u1 - u2
    combined_error = np.sqrt((s1 ** 2) + (s2 ** 2))
    return mean_difference / combined_error


def to_list(x):
    """Collapse an iterable of strings into a single value.

    A single-element input is returned as that element unchanged. Otherwise
    the distinct elements are joined with ", " in first-seen order, so the
    result is reproducible between runs. An empty input gives "".

    The input is materialised once, so one-shot iterators/generators work too.
    """
    bc_list = list(x)
    if len(bc_list) == 1:
        return bc_list[0]
    # dict.fromkeys de-duplicates while keeping insertion order (unlike set).
    return ", ".join(dict.fromkeys(bc_list))


def _fdr_bh_skip_nan(pvals, alpha=0.05):
    """Benjamini-Hochberg adjustment that leaves NaN p-values as NaN.

    ``multipletests`` does not mask missing values itself, so NaNs are
    excluded from the correction here instead of contaminating the adjusted
    p-values of the remaining tests.
    """
    pvals = np.asarray(pvals, dtype=float)
    adjusted = np.full(pvals.shape, np.nan)
    testable = ~np.isnan(pvals)
    if testable.any():
        adjusted[testable] = multipletests(pvals[testable], alpha=alpha, method='fdr_bh')[1]
    return adjusted


def calculate_comparisons2(fitness, df, control_file, treat_var='day'):
    """Compare each gene's log2 fold change with the wt controls.

    Parameters
    ----------
    fitness : pandas.DataFrame
        DESeq2 output with at least the columns ``barcode``, ``treat_var``,
        ``log2FoldChange`` and ``lfcSE``.
    df : pandas.DataFrame
        Not used by this function; kept so existing callers keep working.
    control_file : str or pathlib.Path
        Header-less, tab-separated file read as the columns
        ``barcode``, ``phenotype``, ``conc``.
    treat_var : str
        Column of ``fitness`` that defines the groups compared (default ``'day'``).

    Returns
    -------
    pandas.DataFrame
        One row per gene and ``treat_var`` level with the columns ``gene``,
        ``treat_var``, ``gene_FC`` (mean log2 fold change), ``sigma``,
        ``zscore`` (gene vs. wt controls), ``ci`` (2**gene_FC / 2**cntrl_FC),
        ``pval`` (two-sided normal p-value) and ``padj`` (Benjamini-Hochberg
        adjusted within each ``treat_var`` level). Rows whose z-score is NaN
        keep NaN for ``pval`` and ``padj``.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` is loaded.
    from scipy import stats

    controls = pd.read_table(control_file, names=['barcode', 'phenotype', 'conc'])
    # wt controls are identified in `fitness.barcode` by "<phenotype>-<conc>",
    # not by the barcode sequence column of the control file.
    controls['CntrlName'] = controls['phenotype'] + '-' + controls['conc'].astype(str)
    controls_bc = controls[controls.phenotype == 'wt'].CntrlName.values

    is_control = fitness.barcode.isin(controls_bc)
    cntrl_df = fitness[is_control]
    gene_df = fitness[~is_control].rename({'barcode': 'ShortName'}, axis=1)

    gene_mean = gene_df.groupby(['ShortName', treat_var]).agg(
            {'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]}).reset_index()
    gene_mean.columns = ['gene', treat_var, 'gene_FC', 'gene_FC_median', 'sigma']

    cntrl_mean = cntrl_df.groupby([treat_var]).agg({'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]})
    cntrl_mean.columns = ['cntrl_FC', 'cntrl_FC_median', 'cntrl_sigma']
    cntrl_mean = cntrl_mean.reset_index()

    # Left merge: levels without any control rows get NaN control statistics.
    gene_mean = gene_mean.merge(cntrl_mean, how='left', on=treat_var)

    # Vectorised column arithmetic instead of row-wise apply.
    gene_mean['zscore'] = calculate_2dist_zscore(
            gene_mean['gene_FC'], gene_mean['sigma'], gene_mean['cntrl_FC'], gene_mean['cntrl_sigma'])
    gene_mean['ci'] = 2 ** gene_mean['gene_FC'] / 2 ** gene_mean['cntrl_FC']

    results = gene_mean[['gene', treat_var, 'gene_FC', 'sigma', 'zscore', 'ci']].copy()
    results['pval'] = 2 * stats.norm.sf(results['zscore'].abs())
    results['padj'] = results.groupby(treat_var)['pval'].transform(
            lambda p: pd.Series(_fdr_bh_skip_nan(p.to_numpy(), alpha=0.05), index=p.index))
    return results


# Analyzing the data

In [None]:
# Run DESeq2 once per library and collect both the raw DESeq2 output and the
# z-scores relative to the wt controls.
libraries = cnts.library.unique()
final_results = []
final_fitness = []
for library in libraries:
    lib_cnts = cnts[cnts.library == library]
    sdf, edf = generate_DE_dataset(lib_cnts, other_cols=('mouse', 'day', 'tissue', 'experiment'))
    # Only include 'experiment' as a covariate when the library spans more than one.
    if sdf.experiment.nunique() == 1:
        design = 'day'
    else:
        design = "day+experiment"
    print(library)
    print(design)
    get_fitness_results(outDir, library, sdf, edf, design,
                        root/"code/notebooks/11_21_nguyenb_analysis/DEseq.R", 'ShortName')
    # Sorted so the concatenation order is deterministic.
    # NOTE(review): the pattern is a prefix match, so a library whose name is a
    # prefix of another library's name would also pick up that library's files.
    fitness_files = sorted(outDir.glob(f"{library}*results*csv"))
    # The text after the last '-' in each file stem is used as the 'day' label.
    fitness_df = (pd.concat([pd.read_csv(f, sep=' ').assign(day=f.stem.split("-")[-1]) for f in fitness_files])
                    .assign(library=library)
                    .reset_index()
                    .rename({"index":"barcode"}, axis=1 ))
    final_fitness.append(fitness_df)
    results = calculate_comparisons2(fitness_df, lib_cnts, controls_file, treat_var='day').assign(library=library)
    final_results.append(results)
fres = pd.concat(final_results)
ffit = pd.concat(final_fitness)

# Writing out the results

In [None]:
# Export the z-score table, ordered by library and day.
zscore_columns = ['library', 'gene', 'day', 'zscore', 'pval', 'padj', 'ci']
zscores_sorted = fres[zscore_columns].sort_values(['library', 'day'])
zscores_sorted.to_csv(outDir / "24-11-2021-all-libraries-zscores.csv")

In [None]:
# The DESeq2 row identifier is reported as 'gene' in the exported table.
ffit = ffit.rename(columns={'barcode': 'gene'})

# Export the raw DESeq2 statistics, ordered by library and day.
deseq_columns = ['library', 'gene', 'day', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj']
deseq_sorted = ffit[deseq_columns].sort_values(['library', 'day'])
deseq_sorted.to_csv(outDir / '24-11-2021-deseq2-output-all-libraries.csv')


