In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults: seaborn context, pandas table limits and
# matplotlib styling.
sns.set_context("notebook", font_scale=1.1)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'font.family': "serif",
    'font.serif': "cm",
})
# Optional fixed float formatting, left disabled:
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))
pd.reset_option('display.float_format')

In [None]:
import scipy
from  statsmodels.stats.multitest import multipletests
import subprocess
import os

# Loading the Data

In [None]:
# Project locations on the shared file system.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
dataDir = root.joinpath("scratch/08_21/counts/")
controls_file = root.joinpath("data/metadata/controls.txt")
outDir = root.joinpath("scratch/08_21/results/nguyenb")
metafile = root.joinpath("scratch/08_21/complete_metadata.tsv")
cnt_file = outDir.joinpath('05-01-2022-annotated_gene_counts_after_qc_inoculum_only.csv')

# Load the counts and prefix each experiment label with the first
# underscore-separated token of the sample ID.
cnts = (
    pd.read_csv(cnt_file, index_col=0)
    .assign(experiment=lambda d: d['sampleID'].str.split('_', expand=True)[0] + "_" + d['experiment'])
)

In [None]:
# Number of distinct samples available for each library/experiment pair.
cnts.groupby(['library', 'experiment'])['sampleID'].nunique()

- `library_11_2` has no replicates, so it has to be dropped before running DESeq2.

# Running DESeq2

## Creating datasets for DESeq2

In [None]:
# Takes already pre-filtered dataset, and sets up input for DESeq2
# Assume looked through # of samples etc and specified correct design

def generate_DE_dataset(exp_df, sample_id='sampleID', feat_id = 'ShortName', cnt_col='barcode_cnt',
                        other_cols = ('mouse', 'day', 'tissue', 'dnaid', 'experiment')):
    """
    Build the two DESeq2 input tables from a long-format count table.

    exp_df: long-format DataFrame with one row per (sample, feature) count.
    sample_id: column identifying the sample.
    feat_id: column identifying the feature (e.g. gene or barcode name).
    cnt_col: column holding the counts.
    other_cols: sample-level metadata columns to carry into the sample sheet.

    Returns (sample_data, expr_data):
      sample_data: metadata indexed by sample_id, one row per distinct
          (sample, metadata) combination.
      expr_data: feature-by-sample count table with feat_id as the first
          column and the sample columns in the same order as sample_data;
          missing (feature, sample) combinations are filled with 0.

    Raises ValueError (from pivot) if a (feature, sample) pair has more than
    one distinct count.
    """
    # De-duplicate BEFORE moving the sample id into the index: drop_duplicates
    # ignores the index, so doing it afterwards would merge distinct samples
    # that happen to share identical metadata (e.g. replicates).
    sample_data = (exp_df[[sample_id, *other_cols]]
                   .drop_duplicates()
                   .set_index(sample_id))

    expr_data = (exp_df[[sample_id, feat_id, cnt_col]]
                 .drop_duplicates()
                 .pivot(index=feat_id, columns=sample_id, values=cnt_col)
                 .fillna(0)
                 .rename_axis(columns=None))

    # Keep the count columns aligned with the rows of the sample sheet.
    expr_data = expr_data[list(sample_data.index)].reset_index()

    return sample_data, expr_data

## Running the script

In [None]:
def run_command(args):
    """Execute `args` as a subprocess with inherited stdout/stderr.

    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    subprocess.run(args, check=True)
        

def get_fitness_results(fitness_dir, library, sdf, edf, design, r_path, feat_id ):
    """
    Write the DESeq2 inputs to temporary CSVs, run the R script, then clean up.

    fitness_dir: directory for the temporary CSVs; also passed to the R script
        as its last argument.
    library: library name, used as the file prefix and passed to the R script.
    sdf: sample sheet DataFrame (written with its index).
    edf: count table DataFrame; `feat_id` becomes its index before writing.
    design: design argument forwarded to the R script.
    r_path: path of the R script to run with `Rscript`.
    feat_id: name of the feature-id column in `edf`.

    Raises subprocess.CalledProcessError (via run_command) if the script
    exits non-zero. The temporary CSVs are removed in either case.
    """
    fitness_dir = Path(fitness_dir)
    sdf_path = fitness_dir / f"{library}_sdf.csv"
    edf_path = fitness_dir / f"{library}_edf.csv"
    try:
        sdf.to_csv(sdf_path)
        edf.set_index(feat_id).to_csv(edf_path)
        # Pass an explicit argument list: splitting a formatted string on
        # whitespace would break any path that contains a space.
        args = ['Rscript', str(r_path), str(sdf_path), str(edf_path),
                str(library), str(design), str(fitness_dir)]
        print(" ".join(args))
        run_command(args)
    finally:
        # Do not leave the intermediate inputs behind when the run fails.
        for tmp_path in (sdf_path, edf_path):
            if tmp_path.exists():
                os.remove(tmp_path)

## Calculate z-score relative to wt tags

In [None]:
def sigma(lfcSE):
    """Combine per-barcode standard errors: sqrt(sum of squares) divided by their count."""
    sum_of_squares = lfcSE.pow(2).sum()
    n_values = len(lfcSE)
    return np.sqrt(sum_of_squares) / n_values


def calculate_2dist_zscore(u1, s1, u2, s2):
    """Return (u1 - u2) scaled by the root of the summed squared spreads s1 and s2."""
    combined_spread = np.sqrt((s1 ** 2) + (s2 ** 2))
    difference = u1 - u2
    return difference / combined_spread


def to_list(x):
    """
    Collapse an iterable of strings into a single value.

    A single-element input is returned as that element unchanged; otherwise
    the distinct values are joined with ", " in order of first appearance.
    An empty input gives an empty string.
    """
    # Materialise once so iterators/generators are not consumed twice.
    bc_list = list(x)
    if len(bc_list) == 1:
        return bc_list[0]
    # dict.fromkeys de-duplicates while keeping a reproducible order
    # (a set would give a different order from run to run).
    return ", ".join(dict.fromkeys(bc_list))


def calculate_comparisons2(fitness, df, control_file, treat_var='day'):
    """
    Score each gene's log2 fold change against the wild-type control barcodes.

    fitness: DESeq2 output with `barcode`, `log2FoldChange`, `lfcSE` and
        `treat_var` columns.
    df: counts for 1 experiment and 1 dnaid. Not used by the computation;
        kept so existing calls keep working.
    control_file: headerless tab-separated file whose three columns are read
        as barcode, phenotype and conc. Rows with phenotype 'wt' define the
        controls, matched against `fitness.barcode` as '<phenotype>-<conc>'.
    treat_var: column of `fitness` that defines the comparison groups.

    Returns a DataFrame with columns gene, treat_var, log2FoldChange, lfcSE,
    zscore, log2CI (gene log2FC minus mean control log2FC), pval (two-sided,
    normal) and padj (Benjamini-Hochberg within each treat_var group).
    """
    # Import the submodule explicitly; a bare `import scipy` is not guaranteed
    # to expose scipy.stats on every SciPy version.
    from scipy import stats

    def _bh_adjust(pvals):
        # Adjust only the defined p-values: a single NaN would otherwise turn
        # every adjusted value of the group into NaN.
        adjusted = pd.Series(np.nan, index=pvals.index, dtype=float)
        defined = pvals.notna()
        if defined.any():
            adjusted[defined] = multipletests(pvals[defined], alpha=0.05, method='fdr_bh')[1]
        return adjusted

    controls = pd.read_table(control_file, names=['barcode', 'phenotype', 'conc'])
    controls['CntrlName'] = controls['phenotype'] + '-' + controls['conc'].astype(str)
    controls_bc = controls[controls.phenotype == 'wt'].CntrlName.values

    is_control = fitness.barcode.isin(controls_bc)
    cntrl_df = fitness[is_control]
    gene_df = fitness[~is_control].rename({'barcode': 'gene'}, axis=1)

    # Per-group summary of the wt controls: mean/median log2FC and combined SE.
    cntrl_mean = cntrl_df.groupby([treat_var]).agg({'log2FoldChange': ['mean', 'median'], 'lfcSE': [sigma]})
    cntrl_mean.columns = ['cntrl_FC', 'cntrl_FC_median', 'cntrl_sigma']
    cntrl_mean = cntrl_mean.reset_index()

    gene_mean = (gene_df[['gene', treat_var, 'log2FoldChange', 'lfcSE']]
                 .merge(cntrl_mean, how='left', on=treat_var))

    # Column-wise arithmetic: same values as a row-wise apply, but also
    # well-defined when there are no gene rows.
    gene_mean['zscore'] = calculate_2dist_zscore(gene_mean['log2FoldChange'], gene_mean['lfcSE'],
                                                 gene_mean['cntrl_FC'], gene_mean['cntrl_sigma'])
    gene_mean['log2CI'] = gene_mean['log2FoldChange'] - gene_mean['cntrl_FC']

    results = gene_mean[['gene', treat_var, 'log2FoldChange', 'lfcSE', 'zscore', 'log2CI']].copy()
    results['pval'] = stats.norm.sf(results['zscore'].abs()) * 2
    results['padj'] = results.groupby(treat_var)['pval'].transform(_bh_adjust)
    return results

# Analysing the data

In [None]:
# Libraries to analyse; library_11_2 is excluded because it has no replicates.
# Filtering (rather than list.remove) also works when that library is absent
# from the count table.
libraries = [lib for lib in cnts.library.unique() if lib != 'library_11_2']
libraries

In [None]:
# For every library: build the DESeq2 inputs, run the R script, read its
# result table and score each gene against the wt control barcodes.
final_results = []
final_fitness = []
for library in libraries:
    exp_df = cnts[cnts.library == library]
    # Sample sheet (sdf) and feature-by-sample count table (edf) for this library.
    sdf, edf = generate_DE_dataset(exp_df, sample_id='sampleID', feat_id = 'ShortName', cnt_col='barcode_cnt',  
                        other_cols = ('mouse', 'day', 'tissue', 'experiment'))
    # "mouse" is passed as the design argument of the R script.
    get_fitness_results(outDir, library, sdf, edf, "mouse", 
                    root/"code/notebooks/01_22_enriched_vs_unenriched/DESeq_enriched_vs_unenriched.R", 
                    'ShortName')
    # NOTE(review): presumably the R script writes this file into outDir; only
    # the first glob match is used and an IndexError is raised if there is none
    # -- confirm the pattern is unique per library.
    fitness_file = list(outDir.glob(f"{library}*results-inoculum*csv"))[0]
    # The row index of the space-separated result table becomes the `barcode`
    # column; `mouse` is set to a constant so that grouping by treat_var='mouse'
    # below treats the whole library as one group.
    fitness_df = (pd.read_csv(fitness_file, sep=' ')
                    .assign(library=library)
                    .reset_index()
                    .rename({"index":"barcode"}, axis=1 )
                    .assign(mouse='inoculum'))
    final_fitness.append(fitness_df)
    results = (calculate_comparisons2(fitness_df, exp_df, controls_file, treat_var='mouse')
               .assign(library=library))
    final_results.append(results)
    
# Stack the per-library tables: fres = scored genes, ffit = raw DESeq2 output.
fres = pd.concat(final_results)
ffit = pd.concat(final_fitness)


In [None]:
# Per-gene summary across libraries: in how many libraries the gene is present,
# the spread of its z-score and log2CI, and how often it was significant
# (padj < 0.05).
# Uses `fres` and its `log2CI` column: `fres_annotated` and `ci` are not
# defined anywhere earlier in the notebook, so the cell failed on a fresh kernel.
num_lib = fres.groupby('gene').agg({'library': ['nunique'],
                                    'zscore': ['median', 'min', 'max'],
                                    'log2CI': ['median', 'min', 'max'],
                                    'padj': [lambda x: (x < 0.05).sum()]}).reset_index()
num_lib.columns = ['gene', 'num_libs_present', 'zscore_median', 'zscore_min',
                  'zscore_max', 'log2CI_median', 'log2CI_min', 'log2CI_max', 'num_of_times_was_hit']

In [None]:
# Significant rows (padj < 0.05) with an absolute log2CI above 0.7.
pd.reset_option('display.float_format')
fres.loc[fres.padj.lt(0.05) & fres.log2CI.abs().gt(0.7)]

# Annotations

In [None]:
def get_feat_id(x):
    """
    Extract the feature id from a GFF attribute string.

    Uses the first ';'-separated field. If the string contains 'gene-' or
    'cds-', returns the part between the first and second '-' of that field;
    otherwise returns the field unchanged. Missing values (NaN/None) are
    returned as they are.
    """
    # pd.isna covers None and every NaN object, not only the np.nan singleton.
    if pd.isna(x):
        return x
    first_field = x.split(';')[0]
    if 'gene-' in x or 'cds-' in x:
        return first_field.split('-')[1]
    return first_field
    
def get_gene_name(x):
    """
    Extract a gene name from a GFF attribute string.

    Resolution order:
      1. 'ID=gene' entries: the value of 'Name='.
      2. 'ID=cds' entries with 'gene=': the value of 'gene='.
      3. 'ID=cds' entries with 'Parent=': the part between the first and
         second '-' of the parent value.
      4. anything else: the first ';'-separated field without its 'ID=' prefix.
    Missing values (NaN/None) are returned as they are.
    """
    # pd.isna covers None and every NaN object, not only the np.nan singleton.
    if pd.isna(x):
        return x
    if 'ID=gene' in x:
        return x.split('Name=')[1].split(';')[0]
    if 'ID=cds' in x and 'gene=' in x:
        return x.split('gene=')[1].split(';')[0]
    if 'ID=cds' in x and 'Parent=' in x:
        return x.split('Parent=')[1].split(';')[0].split('-')[1]
    first_field = x.split(';')[0]
    # Remove only the literal prefix: str.strip('ID=') strips any leading or
    # trailing 'I', 'D' or '=' characters and would mangle ids such as 'DI123'.
    if first_field.startswith('ID='):
        return first_field[len('ID='):]
    return first_field

In [None]:
# Annotation inputs: the genome GFF and the emapper annotation table.
dataDir = "../../../data/metadata"
gff_file = Path(dataDir, "GCA_000210855.2_ASM21085v2_genomic.gff")
emap_file = Path(dataDir, "SL1344.emapper.annotations")

# The first 7 lines of the GFF are skipped; the file is read without a header row.
gff = pd.read_table(gff_file, skiprows=7, header=None)
gff.columns = ['chr', 'loc', 'feat', 'start', 'end', 'dn', 'strand', 'dn2', 'desc']

# Parse feature id and gene name out of the attribute column.
gff['feat_id'] = gff['desc'].map(get_feat_id)
gff['Name'] = gff['desc'].map(get_gene_name)

# CDS rows joined to the emapper table on the feature id; the last three rows
# of the joined table are discarded.
gene_to_cds = gff.loc[gff.feat == 'CDS', ['feat_id', 'Name', 'start']]
emap = pd.read_table(emap_file, skiprows=4)
go_map = pd.merge(gene_to_cds, emap, how='outer', left_on='feat_id', right_on='#query')
go_map = go_map.head(-3)

In [None]:
# Attach the annotation table to the per-gene summary and to the full results
# by matching gene name against the GFF-derived `Name`.
summary_ann = pd.merge(num_lib, go_map, left_on='gene', right_on='Name')
fres_ann = pd.merge(fres, go_map, left_on='gene', right_on='Name')

In [None]:
# Write both annotated tables into the results directory.
summary_ann.to_csv(outDir.joinpath('05-01-2022-inoculum-results-summary-annotated.csv'))
fres_ann.to_csv(outDir.joinpath('05-01-2022-inoculum-results-annotated.csv'))

# Exploratory

- We would want no trend in the unenriched samples (i.e. the differences in amounts between libraries should be random).
- We would want a trend after enrichment (i.e. different starting amounts influenced how the strains grew).


In [None]:
# Genes of interest: present in more than 4 libraries.
goi = num_lib.loc[num_lib.num_libs_present > 4, 'gene'].values

In [None]:
# Counts of the genes of interest in the unenriched inoculum samples.
test = cnts.loc[(cnts.mouse == 'unenriched_inoculum') & cnts.ShortName.isin(goi),
                ['ShortName', 'sampleID', 'barcode_cnt']]

In [None]:
# Gene-by-sample count matrix; absent gene/sample combinations count as 0.
test2 = test.pivot(index='ShortName', columns='sampleID', values='barcode_cnt').fillna(0)

In [None]:
# log2 counts-per-million per sample, with a 0.5 pseudocount.
test3 = np.log2(test2.div(test2.sum(), axis='columns') * 1000000 + 0.5)

In [None]:
# The ten genes with the largest across-sample variance of log2 CPM.
test3.var(axis=1).sort_values().tail(10)
# Genes with variable starting amounts

In [None]:
# log2 of the 0.5 pseudocount: the value a zero count gets in the log2-CPM
# tables (-1), which is why the scatter plots keep only values > -1.
np.log2(0.5)

In [None]:
# Scatter of xylR vs rmbA log2 CPM across unenriched samples, keeping only
# samples where both values are above -1.
c1, c2 = 'xylR', 'rmbA'
test4 = pd.DataFrame([test3.loc[c1], test3.loc[c2]]).T
test4 = test4.loc[test4[c1].gt(-1) & test4[c2].gt(-1)]
plt.plot(test4[c1], test4[c2], 'k.')
plt.ylim(0, 14)
plt.xlim(10, 14)

In [None]:
# `etest3` used to be referenced here before any cell defined it, so the cell
# failed on a fresh kernel. Build the log2-CPM table for the 'inoculum' samples
# here (same steps as for the unenriched samples) before ranking the genes.
etest = cnts[(cnts.mouse == 'inoculum') & (cnts.ShortName.isin(goi))][['ShortName', 'sampleID', 'barcode_cnt']]
etest2 = etest.pivot(columns='sampleID', index='ShortName', values='barcode_cnt').fillna(0)
etest3 = np.log2(etest2/etest2.sum()*1000000+0.5)
# The ten genes with the largest across-sample variance.
etest3.var(axis=1).sort_values().tail(10)

In [None]:
# Same xylR vs rmbA comparison for the 'inoculum' samples: counts ->
# gene-by-sample matrix -> log2 CPM (0.5 pseudocount) -> scatter.
etest = cnts.loc[(cnts.mouse == 'inoculum') & cnts.ShortName.isin(goi),
                 ['ShortName', 'sampleID', 'barcode_cnt']]
etest2 = etest.pivot(index='ShortName', columns='sampleID', values='barcode_cnt').fillna(0)
etest3 = np.log2(etest2.div(etest2.sum(), axis='columns') * 1000000 + 0.5)
c1, c2 = 'xylR', 'rmbA'
etest4 = pd.DataFrame([etest3.loc[c1], etest3.loc[c2]]).T
# Keep only samples where both values are above -1.
etest4 = etest4.loc[etest4[c1].gt(-1) & etest4[c2].gt(-1)]
plt.plot(etest4[c1], etest4[c2], 'k.')
plt.ylim(0, 14)
plt.xlim(10, 14)