In [None]:
import pandas as pd
from pathlib import Path
from tnseq2.src.analysis import *

import numpy as np
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import chart_studio
import chart_studio.tools as tls
import chart_studio.plotly as py
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
import os
import warnings

# Chart Studio credentials are taken from the environment so that the API key
# is never stored in (or committed with) the notebook. Set
# CHART_STUDIO_USERNAME and CHART_STUDIO_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and regenerated.
user_name = os.environ.get('CHART_STUDIO_USERNAME', 'ansintsova')
api_key = os.environ.get('CHART_STUDIO_API_KEY')
if api_key:
    tls.set_credentials_file(username=user_name, api_key=api_key)
else:
    # Plotting below runs in offline mode, so a missing key is not fatal.
    warnings.warn('CHART_STUDIO_API_KEY is not set; Chart Studio credentials were not configured.')

# Normalize counts:

- Using lib10 mice for this analysis

1. Rarefaction -> is this appropriate? No one does this outside of microbiome data, and it is questionable for microbiome data as well.
2. DESeq2 VST transformation -> more acceptable
3. TPMs

Compare this to results produced by DESeq2 alone

- Get the raw counts for library_10_2

In [None]:
# Root of the result tree and the names of the two sub-directories used below.
root = "/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results"
results = "results"
counts = "counts"

# DNA ids handed to load_files(), written as 'dnaid<number>'.
dnaids = [f'dnaid{number}' for number in (1315, 1428, 1429, 2015, 2016, 2017, 2018, 2019,
                                           2023, 2024, 2025, 2026, 2027, 2028, 2029)]

# Load the counts for all DNA ids and keep only the library_10_2 rows.
cnt_df = load_files(dnaids, Path(root) / counts)
lib10_cnt = cnt_df.loc[cnt_df.library == 'library_10_2'].copy()

# Per-sample key of the form <sampleID>_<dnaid>_<experiment>.
lib10_cnt['sampleIDExp'] = (lib10_cnt['sampleID'] + "_"
                            + lib10_cnt['dnaid'] + "_"
                            + lib10_cnt['experiment'])

## Identify good samples and subset count DF

In [None]:
# calculate_correlation() is a project helper; it returns a correlation table and
# the sampleIDExp values to keep (presumably the samples whose control barcodes
# correlate well — verify against the helper).
control_file = Path(root) / 'controls.txt'
corr_df, good_samples = calculate_correlation(lib10_cnt, control_file, for_each='sampleIDExp')

# Restrict the count table to the retained samples.
lib10_cnt = lib10_cnt.loc[lib10_cnt['sampleIDExp'].isin(good_samples)]

In [None]:
lib10_cnt.head()

## Normalize data

### VST

In [None]:
# Sample sheet: one metadata row per sampleIDExp.
sdf = (lib10_cnt[['sampleID', 'mouse', 'day', 'tissue', 'dnaid', 'experiment', 'sampleIDExp']]
       .set_index('sampleIDExp')
       .drop_duplicates())

# Count matrix: barcodes in rows, samples in columns (same order as the sample
# sheet); barcodes absent from a sample get a count of 0.
edf = lib10_cnt[['barcode', 'sampleIDExp', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
edf = edf.pivot(index='barcode', columns='sampleIDExp', values='cnt')
edf = edf.loc[:, list(sdf.index)].fillna(0)

# Both tables are written to disk; DESeq2 is run outside of this notebook.
sdf.to_csv(Path(root) / results / '30_04_lib10_sdf.csv')
edf.to_csv(Path(root) / results / '30_04_lib10_edf.csv')

# Read the transformed values back in, indexed by barcode.
# NOTE(review): the files written above are prefixed '30_04_' while this one is
# '30_14_' — confirm the file name is not a typo.
vst = (pd.read_csv(Path(root) / results / '30_14_lib10_vsd.csv')
       .rename(columns={'Unnamed: 0': 'barcode'})
       .set_index('barcode'))

## CLR

- ignoring 0 does not make sense for the transformation
- have to use pseudocounts


In [None]:
import numpy as np
from skbio.stats.composition import clr

def clr_on_array_with_0(a):
    """Apply skbio's ``clr`` to ``a`` with its zero entries masked.

    Entries equal to 0 are masked before the transform. Afterwards the masked
    positions are overwritten with the mask's fill value and the underlying
    plain data array is returned.
    """
    a = np.ma.masked_equal(a, 0)
    transformed = clr(a)
    # assumes clr() hands back a masked array that still carries the mask — TODO confirm
    transformed[transformed.mask] = a.fill_value
    transformed = transformed.data
    return transformed


def clr_on_array_with_pseudocount(a):
    """Return skbio's ``clr`` transform of ``a`` after adding a pseudocount of 1."""
    shifted = a + 1
    return clr(shifted)

clr_df= edf.copy().apply(clr_on_array_with_0)
clr2_df = edf.copy().apply(clr_on_array_with_pseudocount)

In [None]:
clr2_df.head()

In [None]:
clr_df.ad927_d1_dnaid2017_TV4592A[clr_df.ad927_d1_dnaid2017_TV4592A==0]

In [None]:
px.scatter(x=clr2_df.ad927_d1_dnaid2017_TV4592A, y = clr_df.ad927_d1_dnaid2017_TV4592A)

In [None]:
# NOTE(review): `logbpm` is only assigned in the "TPMs" section further down, so
# on a fresh kernel this cell fails unless that section has been run first.
px.scatter(x=clr2_df.ad927_d1_dnaid2017_TV4592A, y = logbpm.ad927_d1_dnaid2017_TV4592A)

In [None]:
px.scatter(x=clr2_df.ad927_d1_dnaid2017_TV4592A, y = vst.ad927_d1_dnaid2017_TV4592A)

In [None]:
# NOTE(review): `rare` is only assigned in the "Rarefaction" section further
# down, so on a fresh kernel this cell fails unless that section has been run first.
px.scatter(x=np.log2(rare.ad927_d1_dnaid2017_TV4592A+1), y = vst.ad927_d1_dnaid2017_TV4592A)

### Rarefaction

In [None]:
lib10_cnt.head()

In [None]:
# Good samples minus the one that has too few reads.
good_samples_rare = list(good_samples)
good_samples_rare.remove('am487_d1_dnaid2027_TV5563A') #Has too few reads

# Count matrix (barcodes x samples, missing counts as 0) for the remaining samples.
lib10_cnt_rare = lib10_cnt.loc[lib10_cnt.sampleIDExp.isin(good_samples_rare)]
edf_rare = (lib10_cnt_rare[['barcode', 'sampleIDExp', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']]
            .drop_duplicates()
            .pivot(index='barcode', columns='sampleIDExp', values='cnt')
            .fillna(0))
edf_rare.to_csv(Path(root) / 'results/03_05_lib10_rare_edf.csv')

# The rarefaction itself is done with vegan in R, outside of this notebook.
# The rarefied table is transposed after loading so that its index can be
# labelled 'barcode'.
rare = (pd.read_csv(Path(root) / 'results/05_05_lib10_rarefied_edf.csv')
        .set_index('Unnamed: 0')
        .T
        .rename_axis(index='barcode'))

### TPMs (or barcodes per million, BPMs)

In [None]:
# Barcodes per million: divide every sample (column) by its total count in
# millions, then take log2 with a pseudocount of 1.
per_million = edf.sum() / 1000000
bpm = edf.div(per_million, axis='columns')
logbpm = np.log2(bpm + 1)

In [None]:
annotation_df = cnt_df[['barcode', 'ShortName', 'locus_tag', 'phenotype', 'conc']].drop_duplicates()

- Calculate mean for the inoculum samples
- Calculate fitness

In [None]:
import statsmodels
def gene_ranksums(gene_values, wt_values):
    """Return the p-value of scipy's rank-sum test between the two samples."""
    _statistic, p_value = ranksums(gene_values, wt_values)
    return p_value

def fdr_correction(pvals):
    """Return Benjamini-Hochberg adjusted p-values for ``pvals``.

    Parameters
    ----------
    pvals : array-like of raw p-values.

    Returns
    -------
    The second element of statsmodels' ``multipletests`` result, i.e. the
    corrected p-values, in the same order as ``pvals``.
    """
    # A bare ``import statsmodels`` does not load the ``stats.multitest``
    # submodule, so the attribute lookup only worked when some other package
    # had already imported it. Import the function explicitly instead.
    from statsmodels.stats.multitest import multipletests

    return multipletests(pvals, alpha=0.05, method='fdr_bh')[1]


def get_median_for_gene_on_a_day(df, annotation, day, grp_by='ShortName'):
    """Median value per group for every sample column belonging to ``day``.

    Parameters
    ----------
    df : DataFrame of per-barcode values; ``barcode`` must be available as a
        column or as the index name so it can be merged with ``annotation``.
    annotation : DataFrame with a ``barcode`` column and the ``grp_by`` column.
    day : day label such as ``'d0'`` or ``'d1'``.
    grp_by : annotation column to group by (default ``'ShortName'``).

    Returns
    -------
    DataFrame indexed by ``grp_by`` with one column per matching sample.

    Notes
    -----
    The day label has to appear as its own token in the column name: it must
    not be preceded by a letter or digit, nor followed by a digit. A plain
    substring test would make ``'d1'`` match every sample of ``dnaid1315``
    (and ``'d2'`` every sample of ``dnaid2015``).
    """
    import re

    day_label = re.compile(r'(?<![A-Za-z0-9])' + re.escape(day) + r'(?![0-9])')
    annotated = df.merge(annotation, on='barcode')
    day_columns = [c for c in annotated.columns
                   if c != grp_by and day_label.search(str(c))]
    # Previously the grouping column was hard-coded and `grp_by` was ignored.
    return annotated.groupby(grp_by)[day_columns].median()


def get_fitness(cnts, annotation, day, good_samples=good_samples, hits=0.05, rare=False):
    """Per-gene fitness and competitive index (CI) for the samples of one day.

    Parameters
    ----------
    cnts : DataFrame indexed by ``barcode`` with one column per sample. The
        frame is not modified.
    annotation : DataFrame with ``barcode``, ``ShortName`` and ``phenotype``
        columns.
    day : day label of the samples to analyse, e.g. ``'d1'``.
    good_samples : sample names to use; only those that are columns of
        ``cnts`` and carry the ``day`` label are analysed.
    hits : adjusted p-value threshold below which a gene is flagged as a hit.
    rare : if True the values are divided by the inoculum directly; otherwise
        they are treated as log2-scale values and ``2**x / 2**inoculum`` is used.

    Returns
    -------
    day_fitness : per-gene median fitness for each sample, plus
        ``median_fitness``, ``mean_fitness``, ``pval``, ``padj`` and ``hits``.
    day_ci : per-gene fitness divided by the wt fitness of each sample, plus
        ``median_CI``, ``mean_CI``, ``ci_pval``, ``ci_padj`` and ``hits``.
    controls : median fitness of the ``wt`` and ``ssaV_invG`` barcodes per sample.
    ssa_ci : ``ssaV_invG`` fitness divided by ``wt`` fitness per sample.

    Raises
    ------
    ValueError
        If ``cnts`` has no inoculum (``d0``) columns or no usable ``day`` columns.
    """
    import re

    def _has_label(sample, label):
        """True if `label` is its own token in `sample` (not part of e.g. 'dnaid1315')."""
        pattern = r'(?<![A-Za-z0-9])' + re.escape(label) + r'(?![0-9])'
        return re.search(pattern, str(sample)) is not None

    inoculum_cols = [c for c in cnts.columns if _has_label(c, 'd0')]
    if not inoculum_cols:
        raise ValueError("cnts has no inoculum ('d0') sample columns")
    wanted = set(good_samples)
    day_cols = [c for c in cnts.columns if c in wanted and _has_label(c, day)]
    if not day_cols:
        raise ValueError(f"no good samples labelled {day!r} among the columns of cnts")

    # Mean over all inoculum samples. assign() returns a new frame, so the
    # caller's DataFrame no longer gains an 'inoculum' column as a side effect.
    cnts = cnts.assign(inoculum=cnts[inoculum_cols].mean(axis=1))
    # NaN > 0 is False, so this also drops barcodes without an inoculum value.
    cnts = cnts[cnts['inoculum'] > 0]

    # Fitness for each barcode, relative to its inoculum value.
    if rare:
        fitness = cnts.div(cnts['inoculum'], axis=0)
    else:
        fitness = (2 ** cnts).div(2 ** cnts['inoculum'], axis=0)

    # Add gene annotation.
    fitness = fitness.reset_index().merge(annotation, on='barcode')

    # Per-gene fitness: median over the barcodes of each ShortName. Barcodes
    # with a phenotype (the controls) are left out here.
    gene_rows = fitness[fitness['phenotype'].isna()]
    day_fitness = gene_rows.groupby('ShortName')[day_cols].median()

    # Median fitness of the control barcodes in each sample.
    wt_fitness = fitness.loc[fitness['phenotype'] == 'wt', day_cols].median()
    ssa_fitness = fitness.loc[fitness['phenotype'] == 'ssaV_invG', day_cols].median()

    # Rank-sum test of each gene against wt, Benjamini/Hochberg corrected.
    pvals = day_fitness.apply(gene_ranksums, axis=1, wt_values=wt_fitness)
    results = pd.DataFrame({'pval': pvals, 'padj': fdr_correction(pvals.values)},
                           index=pvals.index)

    # Competitive index: fitness relative to wt in the same sample, tested
    # against the ssaV_invG control.
    day_ci = day_fitness.div(wt_fitness, axis=1)
    ssa_ci = ssa_fitness / wt_fitness
    pvals_ci = day_ci.apply(gene_ranksums, axis=1, wt_values=ssa_ci)
    ci_res = pd.DataFrame({'ci_pval': pvals_ci, 'ci_padj': fdr_correction(pvals_ci.values)},
                          index=pvals_ci.index)

    # Both summaries are computed from the sample columns only. Previously the
    # mean was taken after the median column had been added, so the median
    # leaked into the mean.
    day_fitness = day_fitness.assign(median_fitness=day_fitness.median(axis=1),
                                     mean_fitness=day_fitness.mean(axis=1)).join(results)
    day_ci = day_ci.assign(median_CI=day_ci.median(axis=1),
                           mean_CI=day_ci.mean(axis=1)).join(ci_res)

    controls = pd.concat([wt_fitness, ssa_fitness], axis=1)
    controls.columns = ['wt', 'ssaV_invG']

    day_fitness['hits'] = day_fitness['padj'] < hits
    day_ci['hits'] = day_ci['ci_padj'] < hits
    return day_fitness, day_ci, controls, ssa_ci

### Looking at the Inoculum

In [None]:
# VST

vst_cnt_d0 = get_median_for_gene_on_a_day(vst, annotation_df, 'd0')
vst_inoculum_cnts = vst_cnt_d0.median(axis=1)
vst_inoculum_cnts.name = 'median_inoculm_value'

# Rare
rare_cnt_d0 = get_median_for_gene_on_a_day(rare, annotation_df, 'd0')
rare_inoculum_cnts = rare_cnt_d0.median(axis=1)
rare_inoculum_cnts.name = 'median_inoculm_value'
# BPM
bpm_cnt_d0 = get_median_for_gene_on_a_day(logbpm, annotation_df, 'd0')
bpm_inoculum_cnts = bpm_cnt_d0.median(axis=1)
bpm_inoculum_cnts.name = 'median_inoculm_value'
inoculum_cnts = pd.DataFrame([vst_inoculum_cnts, rare_inoculum_cnts, bpm_inoculum_cnts], index=['VST', 'RARE', 'BPM']).T

In [None]:
inoculum_cnts.head()

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

@interact
def scatter_plot(normalization=['VST', 'RARE', 'BPM']):
    """Show a 100-bin histogram of the inoculum values for the chosen normalisation."""
    px.histogram(inoculum_cnts, x=normalization, nbins=100).show()

### Looking at Day 1

In [None]:
vst_inoculum_cnts.name = 'median_inoculm_value'

In [None]:
# VST
vst_fit_d1, vst_ci_d1, vst_controls_d1, vst_ssa_ci_d1 = get_fitness(vst, annotation_df, 'd1', good_samples=good_samples)

vst_fit_d1 = vst_fit_d1.merge(2**vst_inoculum_cnts, on='ShortName')

# RARE

rare_fit_d1, rare_ci_d1, rare_controls_d1, rare_ssa_ci_d1 = get_fitness(rare, annotation_df, 'd1', good_samples=good_samples_rare, rare=True)

rare_fit_d1 = rare_fit_d1.merge(rare_inoculum_cnts, on='ShortName')
# BPM

bpm_fit_d1, bpm_ci_d1, bpm_controls_d1, bpm_ssa_ci_d1 = get_fitness(logbpm, annotation_df, 'd1', good_samples=good_samples)

bpm_fit_d1 = bpm_fit_d1.merge(bpm_inoculum_cnts, on='ShortName')

In [None]:
vst_fit_d1.hits.sum()

In [None]:
rare_ci_d1.loc['dcuR']

In [None]:
vst_controls_d1 

In [None]:
vst_fitness = px.scatter(vst_fit_d1, x='median_fitness', y='padj',color='hits', size='median_inoculm_value', 
                     log_y=True, hover_data=[vst_fit_d1.index, vst_fit_d1.median_inoculm_value])
vst_fitness.update_yaxes(autorange="reversed")

vst_fitness.write_html(f'{root}/results/test.html')

In [None]:
# NOTE(review): `deseq_d1` is only built in a cell near the end of this notebook,
# so on a fresh kernel this cell fails unless that cell has been run first.
deseq_d1.head()

In [None]:
@interact
def fitness_plot(normalization=['VST', 'RARE', 'BPM',]):
    """Plot median fitness against adjusted p-value for the chosen normalisation.

    Points are coloured by the ``hits`` flag and sized by the inoculum value;
    the y axis is logarithmic and reversed.
    """
    by_method = {'VST': vst_fit_d1, 'RARE': rare_fit_d1}
    # Any other choice falls through to the BPM table.
    table = by_method.get(normalization, bpm_fit_d1).sort_values('hits')

    fig = px.scatter(table, x='median_fitness', y='padj', color='hits',
                     color_discrete_map={True: 'red', False: 'blue'},
                     size='median_inoculm_value',
                     log_y=True, template="simple_white",
                     hover_data=[table.index, table.median_inoculm_value])
    fig.update_yaxes(autorange="reversed")
    fig.show()

In [None]:
vst_ci_d1.sample(5)

In [None]:
vst_ci_d1.head()

In [None]:
@interact
def fitness_plot(normalization=['VST', 'RARE', 'BPM',]):
    """Plot median CI against adjusted CI p-value for the chosen normalisation.

    Points are coloured by the ``hits`` flag and drawn with a fixed size and a
    black outline; the y axis is logarithmic and reversed.
    """
    by_method = {'VST': vst_ci_d1, 'RARE': rare_ci_d1}
    # Any other choice falls through to the BPM table.
    table = by_method.get(normalization, bpm_ci_d1).sort_values('hits')

    palette = px.colors.qualitative.Plotly
    fig = px.scatter(table, x='median_CI', y='ci_padj', color='hits',
                     color_discrete_map={True: palette[1], False: palette[0]},
                     log_y=True, template="simple_white", hover_data=[table.index])
    fig.update_yaxes(autorange="reversed")
    outline = dict(width=1, color='black')
    fig.update_traces(marker=dict(size=10, line=outline),
                      selector=dict(mode='markers'))
    fig.show()

In [None]:
# DESeq2 day-1 results: the value stored as 'median_fitness' against adjusted
# p-value (log scale, reversed axis), coloured by the 'hits' flag.
# NOTE(review): `deseq_d1` is built in a cell near the end of this notebook; run
# that cell first.
# The hover label previously referenced `df.ShortName`, but no `df` exists at
# notebook level; the gene names are the index of `deseq_d1`.
fig = px.scatter(deseq_d1, x='median_fitness', y='padj', color='hits',
                 color_discrete_map={True: 'red', False: 'blue'},
                 log_y=True, hover_data=[deseq_d1.index])
fig.update_yaxes(autorange="reversed")
fig.show()

In [None]:
# Number of hits under VST and under rarefaction. A cell only displays its last
# expression, so both counts are returned together as a tuple.
vst_fit_d1.hits.sum(), rare_fit_d1.hits.sum()

In [None]:
vst_fit_d1[(vst_fit_d1.hits == True)& (vst_fit_d1.median_inoculm_value < 50) ]

In [None]:
rare_fit_d1[(rare_fit_d1.hits == True) & (rare_fit_d1.median_inoculm_value < 50)]

In [None]:
from plotly.subplots import make_subplots
@interact
def fitness_plot(normalization=['VST', 'RARE', 'BPM']):
    """Plot median CI against adjusted CI p-value on log-log axes.

    The ssaV_invG control CI for the chosen normalisation is looked up as
    well, but it is not drawn at the moment.
    """
    by_method = {
        'VST': (vst_ci_d1, vst_ssa_ci_d1),
        'RARE': (rare_ci_d1, rare_ssa_ci_d1),
    }
    # Any other choice falls through to the BPM tables.
    table, cntrl = by_method.get(normalization, (bpm_ci_d1, bpm_ssa_ci_d1))

    fig = px.scatter(table, x='median_CI', y='ci_padj', color='hits',
                     log_y=True, log_x=True, hover_data=[table.index])
    fig.update_yaxes(autorange="reversed")
    # Overlay of the control CI values (currently disabled):
    #fig.add_scatter(x=cntrl, y = [2.96]*len(cntrl))
    fig.show()

In [None]:
vst_ssa_ci_d1

In [None]:
# Scatter of the wt control fitness values.
# NOTE(review): get_fitness() returns the controls frame with only the columns
# 'wt' and 'ssaV_invG'; no 'ycoord' column is added anywhere in this notebook, so
# this call is expected to fail as written — confirm the intended y column.
fig = px.scatter(vst_controls_d1, x='wt', y='ycoord', 
                     log_y=True, hover_data=[vst_controls_d1.index])
fig.update_yaxes(autorange="reversed")
fig.show()

In [None]:
test1 = vst_controls_d1.T
vst_controls_d1['hits'] = 'wt'

test = pd.concat([vst_ci_d1, vst_controls_d1.T])
vst_controls_d1

In [None]:
rare_ssa_ci_d1

In [None]:
px.box(vst_controls_d1, y='wt',points='all',)

In [None]:
? px.box

In [None]:
logbmp_fit1, logbmp_ci1, logbmp_controls, logbmp_ssa_ci = get_fitness(logbpm, annotation_df, 'd1')

In [None]:
logbmp_res = logbmp_fit1[logbmp_fit1.padj<0.05].sort_values('padj')

In [None]:
# Day-1 fitness from the VST values. Three problems in the earlier version:
#   * `vst_df` was never defined (the VST table is `vst`);
#   * the result was assigned back to `vst`, replacing the VST table;
#   * get_fitness() returns a 4-tuple, which has no .head().
# The tuple is kept under its own name and its first element (the fitness
# table) is previewed.
vst_d1_results = get_fitness(vst, annotation_df, 'd1')
vst_d1_results[0].head()

In [None]:
# The assignment had no right-hand side (a syntax error). It now unpacks the
# day-1 VST results, mirroring the rarefied call in the next cell.
vst_fit1, vst_ci1, vst_controls, vst_ssa_ci = get_fitness(vst, annotation_df, 'd1')

In [None]:
# Rarefied counts are on the count scale, so get_fitness() needs rare=True.
# Without it the counts are treated as log2 values (2**x / 2**inoculum).
rare_fit1, rare_ci1, rare_controls, rare_ssa_ci = get_fitness(
    rare, annotation_df, 'd1', good_samples=good_samples_rare, rare=True)

In [None]:
rare_res = rare_fit1[rare_fit1.padj<0.05].sort_values('padj')

In [None]:
vst_sig = vst_fit1[vst_fit1.padj < 0.05].sort_values('padj')

In [None]:
len(set(vst_sig.index).intersection(set(rare_res.index)))

In [None]:
rare_res.shape

In [None]:
# Step-by-step version of get_fitness() on the rarefied counts, using a
# stricter inoculum filter (mean inoculum count > 10 instead of > 0).

# Calculate a mean value for all inoculum samples
cnts = rare.copy()
cnts['inoculum'] = cnts[[c for c in cnts.columns if 'd0' in c]].mean(axis=1)
cnts = cnts[cnts.inoculum > 10]
cnts2 = cnts.reset_index().merge(annotation_df, on='barcode')
day = 'd1'

# Calculate fitness for each barcode (value relative to the inoculum mean)
fitness = cnts.div(cnts['inoculum'], axis=0).reset_index()
# Add gene annotation
fitness = fitness.merge(annotation_df, on='barcode')

# Calculate fitness for each gene; barcodes with a phenotype (controls) are
# dropped for now
gene_df = fitness[fitness.phenotype.isna()]
# Calculate median value for each ShortName
samples = dict.fromkeys(([c for c in fitness.columns if day in c]), ['median'])
day_fitness = gene_df.groupby('ShortName').agg(samples)
day_fitness.columns = [c[0] for c in day_fitness.columns]

# Calculate median fitness for wt barcodes
wt_fitness = fitness[fitness.phenotype == 'wt'][[c for c in good_samples_rare if day in c]].median()

# Rank-sum test for wt vs each gene, multi-test correction using Benjamini/Hochberg
pvals = day_fitness.apply(gene_ranksums, axis=1, wt_values=wt_fitness)
padj = fdr_correction(pvals.values)
results = pd.DataFrame([pvals.values, padj], columns=pvals.index, index=['pval', 'padj']).T

# Calculate CI (fitness relative to wt in the same sample)
day_ci = day_fitness.apply(lambda x: x/wt_fitness[x.name])
ssa_fitness = fitness[fitness.phenotype == 'ssaV_invG'][[c for c in good_samples_rare if day in c]].median()
ssa_ci = ssa_fitness/wt_fitness

pvals_ci = day_ci.apply(gene_ranksums, axis=1, wt_values=ssa_ci)
# Index the CI results by the CI p-values themselves (was pvals.index).
ci_res = pd.DataFrame([pvals_ci.values, fdr_correction(pvals_ci.values)],
                      columns=pvals_ci.index, index=['ci_pval', 'ci_padj']).T

# Median and mean are both taken over the sample columns only. Previously the
# mean was computed after the median column had been added, so the median was
# averaged in as if it were another sample.
day_fitness = day_fitness.assign(median_fitness=day_fitness.median(axis=1),
                                 mean_fitness=day_fitness.mean(axis=1))
day_fitness = day_fitness.merge(results, left_index=True, right_index=True)
day_ci = day_ci.assign(median_CI=day_ci.median(axis=1),
                       mean_CI=day_ci.mean(axis=1))
day_ci = day_ci.merge(ci_res, left_index=True, right_index=True)

controls = pd.concat([wt_fitness, ssa_fitness], axis=1)
controls.columns = ['wt', 'ssaV_invG']


In [None]:
vst_fit1.loc[day_fitness.index].head()

In [None]:
#plt.plot(vst_fit1.loc[day_fitness.index].median_fitness, day_fitness.median_fitness, 'k.')
px.scatter( x=vst_fit1.loc[day_fitness.index].median_fitness, y=day_fitness.median_fitness, 
          labels = {'x': 'VST', 'y': 'RARE'})

In [None]:
rare_sig = day_fitness[day_fitness.padj<0.05].sort_values('padj')

In [None]:
rare_sig.head(50)

In [None]:
set(rare_sig.index) - set(vst_sig.index)

In [None]:
vst_fit1.loc['iolC']

In [None]:
day_fitness.loc['iolC']

In [None]:
len(set(vst_sig.index))

In [None]:
deseq_d1.sample(10)

In [None]:
# DESeq2 d0-vs-d1 results, annotated per barcode and sorted so that the row
# with the smallest padj comes first within each gene.
deseq_d1 = (pd.read_csv(Path(root) / results / 'd0_d1_deseq_results.csv')
            .rename(columns={'Unnamed: 0': 'barcode'})
            .merge(annotation_df, on='barcode')
            .dropna(subset=['padj', 'ShortName'])
            .sort_values(['ShortName', 'padj']))

# Keep only the first row of every gene.
first_per_gene = deseq_d1.groupby(['ShortName']).cumcount() == 0
deseq_d1 = deseq_d1.loc[first_per_gene, ['ShortName', 'baseMean', 'log2FoldChange', 'padj']].copy()

# Use the same column names as the fitness tables so the plotting code can be shared.
deseq_d1 = (deseq_d1
            .rename(columns={'baseMean': 'median_inoculm_value',
                             'log2FoldChange': 'median_fitness'})
            .set_index('ShortName'))
deseq_d1['hits'] = deseq_d1['padj'] < 0.05

In [None]:
deseq_d1


In [None]:
# Divide each sample column by the wt fitness of that sample.
# NOTE(review): `d1_fitness` is not assigned anywhere in the visible notebook;
# this cell relies on state from a removed cell or an earlier kernel session.
d1_ci_df = d1_fitness.apply(lambda x: x/wt_fitness[x.name])

In [None]:
d1_ci_df.head()

In [None]:
# Median fitness of the ssaV_invG barcodes in each good d1 sample.
# NOTE(review): `fitness_df` is not assigned anywhere in the visible notebook;
# this cell relies on state from a removed cell or an earlier kernel session.
ssa_fitness = fitness_df[fitness_df.phenotype == 'ssaV_invG'][[c for c in good_samples if 'd1' in c]].median()
ssa_fitness

In [None]:
ssa_ci = ssa_fitness/wt_fitness

In [None]:
# Rank-sum test of every gene's CI against the ssaV_invG control CI, with
# Benjamini/Hochberg correction.
pvals_ci = d1_ci_df.apply(gene_ranksums, axis=1, wt_values=ssa_ci)
# The correction is computed once (an extra call whose result was discarded
# has been removed) and the table is indexed by the CI p-values themselves
# rather than by the unrelated `pvals` series.
ci_res = pd.DataFrame([pvals_ci.values, fdr_correction(pvals_ci.values)],
                      columns=pvals_ci.index, index=['pval', 'padj']).T
    

In [None]:
ci_res = pd.DataFrame([pvals_ci.values, fdr_correction(pvals_ci.values)], columns=pvals.index, index=['pval', 'padj']).T
    

In [None]:
ci_res[ci_res.padj < 0.05].sort_values('padj').shape

In [None]:

sample1 = d1_fitness.loc['dcuB'].values
sample2 = wt_fitness.values
ranksums(sample1, sample2)


In [None]:
# Median fitness of the ssaV_invG and hyb barcodes across all good samples.
# NOTE(review): `fitness_df` is not assigned anywhere in the visible notebook;
# this cell relies on state from a removed cell or an earlier kernel session.
ssa_fitness = fitness_df[fitness_df.phenotype == 'ssaV_invG'][good_samples].median()
hyb_fitness = fitness_df[fitness_df.phenotype == 'hyb'][good_samples].median()

In [None]:
d1_fitness.T.hybA.hist(bins=20)

In [None]:
cnt_df[cnt_df.phenotype == 'hyb']