In [None]:
import pandas as pd
from pathlib import Path
from tnseq2.src.analysis import *
from tnseq2.src.method2_analysis import *
import numpy as np
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import chart_studio
import chart_studio.tools as tls
import chart_studio.plotly as py
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import dash_bio as dashbio

## Data Normalization

- Count data normalization is essential to get valid results from the analysis.
- I have looked at a number of different transformations, and suggest VST (implemented in DESeq2 package) as the most appropriate transformation for this data set.
- Here I compare VST transformation to rarefaction (which is not a good way to normalize data for this experiment, and produces unstable results, i.e. results vary depending on the rarefaction level) as well as the original analysis already presented. 


### Data Used:

- All experiments using library 10_2 are being analyzed together


### Filtering bottleneck samples:

- as before


In [None]:
# Locations of the per-run count tables and of the control-barcode list.
root = "/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results"
results = "results"
counts = "counts"
control_file = Path(root)/'controls.txt'

# Sequencing runs (dnaids) whose count files are loaded.
dnaids = ['dnaid1315', 'dnaid1428', 'dnaid1429', 'dnaid2015', 'dnaid2016', 'dnaid2017', 'dnaid2018', 'dnaid2019',
          'dnaid2023', 'dnaid2024', 'dnaid2025', 'dnaid2026', 'dnaid2027', 'dnaid2028', 'dnaid2029']
cnt_df = load_files(dnaids, Path(root)/counts)

# Restrict to library 10_2 and build a sample key that is unique across
# sequencing runs and experiments.
lib10_cnt = cnt_df[cnt_df.library == 'library_10_2'].copy()
lib10_cnt['sampleIDExp'] = lib10_cnt['sampleID'] + "_" + lib10_cnt['dnaid'] + "_" + lib10_cnt['experiment']

# One row per barcode with its gene / control annotation.
annotation_df = cnt_df[['barcode', 'ShortName', 'locus_tag', 'phenotype', 'conc']].drop_duplicates()

corr_df, good_samples = calculate_correlation(lib10_cnt, control_file, for_each='sampleIDExp')
# Exclude the unenriched inoculum. Filtering (instead of list.remove) keeps
# the cell safe to re-run and does not raise if the sample is already absent.
good_samples = [s for s in good_samples if s != 'unenriched_inoculum_d0_dnaid2017_TV4592A']
#good_samples
#good_samples

In [None]:
lib10_cnt

In [None]:
outdir = '/Users/ansintsova/git_repos/nguyenb_tnseq/data/01_06'

_, final = analyze_library(lib10_cnt, sample_id="sampleIDExp", 
                          good_samples=good_samples, 
                          dnaid='library10', experiment='2', 
                          control_file=control_file, cutoff=0.8, 
                          to_filter=1000, outdir=outdir)

## VST Transformation

In [None]:
import datetime as dt
dt.date.today().strftime("%Y-%m-%d")

In [None]:
tvst = run_VST_transformation(lib10_cnt, 'lib10_tst', good_samples, '/Users/ansintsova/git_repos/nguyenb_tnseq/data/01_06',
                       sample_id='sampleIDExp')

In [None]:
tvst.head()

In [None]:
# sdf = lib10_cnt[['sampleID', 'mouse', 'day', 'tissue', 'dnaid', 'experiment', 'sampleIDExp']].set_index('sampleIDExp').drop_duplicates()
# edf = (lib10_cnt[['barcode', 'sampleIDExp', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
#        .pivot(index='barcode', columns='sampleIDExp', values='cnt'))
# edf = edf[list(sdf.index)]
# edf = edf.fillna(0)
# sdf.to_csv(Path(root)/results/'30_04_lib10_sdf.csv')
# edf.to_csv(Path(root)/results/'30_04_lib10_edf.csv')
# Run DESeq2 script
vst = pd.read_csv(Path(root)/'results/30_14_lib10_vsd.csv').rename({'Unnamed: 0':'barcode'}, axis=1).set_index('barcode')
vst = vst.drop('unenriched_inoculum_d0_dnaid2017_TV4592A', axis=1)


In [None]:
vst.head()

## Rarefaction

In [None]:
samples = [c.strip() for c in lib10_cnt.sampleIDExp.unique()]
samples.remove('am487_d1_dnaid2027_TV5563A')
samples.remove('unenriched_inoculum_d0_dnaid2017_TV4592A')
lib10_cnt_rare = lib10_cnt[lib10_cnt.sampleIDExp.isin(samples)]
edf_rare = (lib10_cnt_rare[['barcode', 'sampleIDExp', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
       .pivot(index='barcode', columns='sampleIDExp', values='cnt'))
edf_rare = edf_rare.fillna(0)
edf_rare.to_csv(Path(root)/'results/03_05_lib10_rare_edf.csv')

## Rarefy with vegan in R

rare = (pd.read_csv(Path(root)/'results/05_05_lib10_rarefied_edf.csv')
        .set_index('Unnamed: 0').T)
rare.index.name = 'barcode'
rare = rare.drop('unenriched_inoculum_d0_dnaid2017_TV4592A', axis =1)


In [None]:
rarevsvst = px.scatter(x=rare.ad926_d1_dnaid2017_TV4592A, y=vst.ad926_d1_dnaid2017_TV4592A, 
          log_x=True, color_discrete_sequence= [px.colors.qualitative.Plotly[3]],
          labels= {'x': 'Rarefied Counts', 'y': 'VST Counts'},
          template = "simple_white")

rarevsvst.write_html("/Users/ansintsova/Documents/SushiLab/31-05-2021-Lab-Meeting/Rare_vs_VST.html")

In [None]:
import numpy as np
from skbio.stats.composition import clr

def clr_on_array_with_0(a):
    """Apply ``clr`` to ``a`` while ignoring entries equal to 0.

    Zeros are masked with ``np.ma.masked_equal`` before ``clr`` is called.
    The masked positions of the result are then overwritten with the masked
    array's ``fill_value`` and the underlying plain array is returned.
    """
    a = np.ma.masked_equal(a, 0)
    transformed = clr(a)
    # assumes clr() returns a masked array that carries the input mask — TODO confirm
    transformed[transformed.mask] = a.fill_value
    # Strip the mask and hand back the raw ndarray.
    transformed = transformed.data
    return transformed


def clr_on_array_with_pseudocount(a):
    """Return ``clr`` of ``a`` after adding a pseudocount of 1 to every entry."""
    shifted = a + 1
    return clr(shifted)

# Build the raw barcode x sample count matrix explicitly: on a fresh kernel no
# `edf` exists at this point in the notebook (the only earlier definition is
# commented out, and a later `edf` holds competitive indices, not counts).
# NOTE(review): this mirrors the commented-out pivot of `lib10_cnt` — confirm
# it is the matrix the CLR comparison was meant to use.
cnt_edf = (lib10_cnt[['barcode', 'sampleIDExp', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
           .pivot(index='barcode', columns='sampleIDExp', values='cnt')
           .fillna(0))

# Column-wise (per-sample) CLR, once ignoring zeros and once with a pseudocount.
clr_df = cnt_edf.apply(clr_on_array_with_0)
clr2_df = cnt_edf.apply(clr_on_array_with_pseudocount)

In [None]:
clrvsvst = px.scatter(x=clr2_df.ad926_d1_dnaid2017_TV4592A, y=vst.ad926_d1_dnaid2017_TV4592A, 
          color_discrete_sequence= [px.colors.qualitative.Plotly[3]],
          labels= {'x': 'CLR Norm Counts', 'y': 'VST Counts'},
          template = "simple_white")
clrvsvst.write_html("/Users/ansintsova/Documents/SushiLab/31-05-2021-Lab-Meeting/CLR_vs_VST.html")

## Load DESeq2 results

In [None]:
deseq1 = (pd.read_csv('/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results/results/d0_d1_deseq_results.csv')
          .rename({'Unnamed: 0': 'barcode'}, axis=1))

deseq1 = deseq1.merge(annotation_df, on='barcode')
deseqhits = deseq1[deseq1.padj < 0.05].ShortName.dropna().values
deseq1 = deseq1.groupby('ShortName').log2FoldChange.median().reset_index()
deseq1['fitness'] = 2**deseq1.log2FoldChange
deseq1.log2FoldChange.hist(bins=50)

## Analysis: 

- Calculate mean inoculum value for each barcode based on all inoculum (d0) samples (drop the unenriched sample).
- **For rarefied data, drop any barcodes with mean 0 in the inoculum**
- Calculate fitness for each barcode: for rarefied data, as counts on a specific day / counts in the inoculum; for VST data (which is on a log2 scale), as 2^(counts on a specific day) / 2^(counts in the inoculum)
- To get gene fitness, take median fitness of barcodes mapped to that gene
- WT fitness is the median fitness of all wt WITS barcodes
- Use Mann–Whitney U test (also called the Mann–Whitney–Wilcoxon (MWW), Wilcoxon rank-sum test) to test for significance of each gene 
- multi-test correction using Benjamini/Hochberg (non-negative)

- calculate CI at gene fitness/ wt fitness
- test for significance as before using ssaV_invG barcodes as controls
- calculate median/mean fitness/CI for each gene across all mice



In [None]:
import statsmodels
def gene_ranksums(gene_values, wt_values):
    """Return the Wilcoxon rank-sum p-value for ``gene_values`` vs ``wt_values``."""
    _statistic, p_value = ranksums(gene_values, wt_values)
    return p_value

def fdr_correction(pvals):
    """Return Benjamini/Hochberg-adjusted p-values for ``pvals``.

    Parameters
    ----------
    pvals : array-like of raw p-values.

    Returns
    -------
    The second element returned by ``multipletests`` (the corrected p-values),
    in the same order as ``pvals``.
    """
    # A bare `import statsmodels` does not load the `stats.multitest`
    # submodule, so import the function explicitly where it is used.
    from statsmodels.stats.multitest import multipletests
    return multipletests(pvals, alpha=0.05, method='fdr_bh')[1]


def get_median_for_gene_on_a_day(df, annotation, day, grp_by='ShortName'):
    """Median value per group for every sample column belonging to ``day``.

    Parameters
    ----------
    df : DataFrame of per-barcode values, one column per sample; ``barcode``
        must be available as a column or index level for the merge.
    annotation : DataFrame with a ``barcode`` column and the ``grp_by`` column.
    day : substring identifying the day; every column whose name contains it
        is summarised.
    grp_by : annotation column to group barcodes by (default ``'ShortName'``).

    Returns
    -------
    DataFrame indexed by ``grp_by`` with one median column per matching sample.
    """
    annotated = df.merge(annotation, on='barcode')
    day_columns = [c for c in annotated.columns if day in c]
    # Honour `grp_by` (it was previously ignored in favour of 'ShortName').
    return annotated.groupby(grp_by)[day_columns].median()


def get_fitness(cnts, annotation, day, good_samples, hits=0.01, rare=False):
    """Compute barcode- and gene-level fitness and competitive indices (CI) for one day.

    Parameters
    ----------
    cnts : DataFrame with one column per sample (presumably indexed by
        barcode, since the result is merged with ``annotation`` on
        ``barcode`` after ``reset_index``). Values are exponentiated with
        base 2 unless ``rare`` is True. The frame is not modified.
    annotation : DataFrame with ``barcode``, ``ShortName`` and ``phenotype``
        columns.
    day : substring used to select this day's samples from ``good_samples``.
    good_samples : names of the sample columns eligible for the analysis.
    hits : adjusted p-value threshold for the ``hits`` / ``ci_hits`` flags.
    rare : if True, fitness is the plain ratio value / inoculum; otherwise
        it is 2**value / 2**inoculum.

    Returns
    -------
    tuple ``(fitness, day_fitness, day_ci, results, controls, ssa_ci)``:
        fitness     - per-barcode fitness for all columns, with annotation
        day_fitness - median fitness per ShortName for this day's samples
        day_ci      - day_fitness divided by the per-sample wt median fitness
        results     - per-gene p-values, adjusted p-values, summary statistics
                      and hit flags
        controls    - per-sample median fitness of wt and ssaV_invG barcodes
        ssa_ci      - per-sample CI of the ssaV_invG barcodes
    """
    day_samples = [c for c in good_samples if day in c]

    # Mean over all inoculum (d0) samples, skipping the unenriched one.
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    inoculum_cols = [c for c in cnts.columns if 'd0' in c and 'unenriched' not in c]
    cnts = cnts.assign(inoculum=cnts[inoculum_cols].mean(axis=1))
    cnts = cnts.dropna(subset=['inoculum'])
    cnts = cnts[cnts.inoculum > 0]

    # Per-barcode fitness: every column divided row-wise by the inoculum value.
    if rare:
        fitness = cnts.div(cnts['inoculum'], axis=0).reset_index()
    else:
        fitness = (2**cnts).div(2**cnts['inoculum'], axis=0).reset_index()

    # Add gene annotation.
    fitness = fitness.merge(annotation, on='barcode')

    # Gene fitness: barcodes without a control phenotype, median per ShortName.
    gene_df = fitness[fitness.phenotype.isna()]
    day_fitness = gene_df.groupby('ShortName')[day_samples].median()

    # Median fitness of the wt barcodes in each sample.
    wt_fitness = fitness[fitness.phenotype == 'wt'][day_samples].median()

    # Rank-sum test of each gene (across samples) against wt, followed by
    # Benjamini/Hochberg multiple-testing correction.
    pvals = day_fitness.apply(gene_ranksums, axis=1, wt_values=wt_fitness)
    padj = fdr_correction(pvals.values)
    results = pd.DataFrame([pvals.values, padj], columns=pvals.index, index=['pval', 'padj']).T

    # Competitive index: gene fitness relative to wt fitness in the same sample,
    # tested against the CI of the ssaV_invG control barcodes.
    day_ci = day_fitness.apply(lambda x: x/wt_fitness[x.name])
    ssa_fitness = fitness[fitness.phenotype == 'ssaV_invG'][day_samples].median()
    ssa_ci = ssa_fitness/wt_fitness

    pvals_ci = day_ci.apply(gene_ranksums, axis=1, wt_values=ssa_ci)
    ci_res = pd.DataFrame([pvals_ci.values, fdr_correction(pvals_ci.values)],
                          columns=pvals.index, index=['ci_pval', 'ci_padj']).T
    results = results.merge(ci_res, left_index=True, right_index=True)

    # Summary statistics across samples.
    results['median_fitness'] = day_fitness.median(axis=1)
    results['mean_fitness'] = day_fitness.mean(axis=1)
    results['median_CI'] = day_ci.median(axis=1)
    results['mean_CI'] = day_ci.mean(axis=1)

    # Control summaries, labelled with the day (leading/trailing '_' removed).
    controls = pd.concat([wt_fitness, ssa_fitness], axis=1)
    controls.columns = ['wt', 'ssaV_invG']
    controls['day'] = day.strip('_')
    ssa_ci = pd.DataFrame(ssa_ci).assign(day=day.strip("_"))
    ssa_ci.columns = ['CI', 'day']

    results['hits'] = results['padj'] < hits
    results['ci_hits'] = results['ci_padj'] < hits
    return fitness, day_fitness, day_ci, results, controls, ssa_ci

In [None]:
# VST

vst_cnt_d0 = get_median_for_gene_on_a_day(vst[good_samples], annotation_df, 'd0')
vst_inoculum_cnts = vst_cnt_d0.median(axis=1)
vst_inoculum_cnts.name = 'median_inoculum_value'

# Rare
rare_cnt_d0 = get_median_for_gene_on_a_day(rare[samples], annotation_df, 'd0')
rare_inoculum_cnts = rare_cnt_d0.median(axis=1)
rare_inoculum_cnts.name = 'median_inoculum_value'


In [None]:
days = ['_d0', '_d1', '_d2', '_d3', '_d4']
vst_cnt_genes = []
for day in days:
    print(day)
    df = get_median_for_gene_on_a_day(vst[good_samples], annotation_df, day)
    vst_cnt_genes.append(df)
fdf = pd.concat(vst_cnt_genes, axis=1)
fdf.sample(5)

In [None]:
fdf.to_csv(Path(root)/"results/gene_counts_vst_transform.csv")

## Day 1

In [None]:
# VST
fitness, vst_fit_d1, vst_ci_d1, vst_d1_results, vst_controls_d1, vst_ssa_ci_d1 = get_fitness(vst, annotation_df, 'd1', good_samples=good_samples,  hits=0.05)

#vst_fit_d1 = vst_fit_d1.merge(2**vst_inoculum_cnts, on='ShortName')

# RARE

f, rare_fit_d1, rare_ci_d1, rare_day1_results, rare_controls_d1, rare_ssa_ci_d1 = get_fitness(rare, annotation_df, 'd1', good_samples=samples, rare=True,  hits=0.05)

rare_fit_d1 = rare_fit_d1.merge(rare_inoculum_cnts, on='ShortName')

d1_cntrl = vst_controls_d1.merge(rare_controls_d1, left_index=True, right_index=True, on='day')
d1_cntrl.columns = ['vst_wt', 'vst_ssaV', 'day', 'rare_wt', 'rare_ssaV', ]
d1_cntrl.melt(id_vars='day')

#px.box(d1_cntrl.melt(), x="variable", y="value",  color='variable')

In [None]:
fitness.head()

In [None]:
?pd.melt

In [None]:
def get_control_df(fitness, phenotype='wt'):
    """Long-format fitness table for the barcodes of one control phenotype.

    Keeps the rows of ``fitness`` whose ``phenotype`` matches, drops every
    column containing a missing value as well as the ``inoculum`` column,
    melts the remaining sample columns into ``sampleExpID`` / ``fitness``
    and splits each sample ID on ``_`` into mouse, day, dnaid and experiment.
    """
    is_control = fitness.phenotype == phenotype
    control = fitness[is_control].dropna(axis=1).drop(columns=['inoculum'])
    long_df = control.melt(id_vars=['barcode', 'phenotype', 'conc'],
                           var_name='sampleExpID', value_name='fitness')
    id_parts = long_df['sampleExpID'].str.split("_", expand=True)
    id_parts.columns = ['mouse', 'day', 'dnaid', 'experiment']
    return long_df.merge(id_parts, left_index=True, right_index=True)

In [None]:
# Long-format wt control fitness; reuse get_control_df instead of repeating
# its body inline, then keep the day-1 rows for the plots below.
wt = get_control_df(fitness, phenotype='wt')
wt_d1 = wt[wt.day == 'd1']


In [None]:
hyb = get_control_df(fitness, phenotype='hyb')
hyb

In [None]:
px.box(wt_d1, x='mouse', y=np.log2(wt_d1['fitness']), color= 'conc', hover_data=['conc'])

In [None]:
wt

In [None]:
px.box(wt, x='day', y=np.log2(wt['fitness']), color= 'mouse', hover_data=['conc'])

In [None]:
hyb_d1 = hyb[hyb.day == 'd1']

In [None]:
hyb_d1

In [None]:
px.box(hyb, x='day', y=np.log2(hyb['fitness']),  color='mouse', hover_data=['mouse'])

In [None]:
lib10_wt = lib10_cnt[lib10_cnt.phenotype == 'wt'][['barcode','cnt', 'conc', 'mouse', 'day', 'experiment']]

In [None]:
lib10_cnt.groupby('sampleIDExp').cnt.sum().sort_values()

In [None]:
wt

In [None]:
px.strip(wt[wt.day == 'd1'], x='mouse', y='fitness', color= 'conc', hover_data=['conc'])

In [None]:
px.strip(wt[wt.day == 'd1'], x='mouse', y='fitness', color= 'conc', hover_data=['conc'])

In [None]:
test = vst[[c for c in vst if 'am732_d1' in c]].reset_index()
test.head()

In [None]:
lib10_wt[(lib10_wt.experiment == 'TV5585A') & (lib10_wt.day == 'd0')].sort_values('conc')

In [None]:
t2  = lib10_wt[(lib10_wt.mouse == 'am732') & (lib10_wt.day == 'd1')].sort_values('conc').merge(test, how='left', on='barcode' )

In [None]:
t2

In [None]:
px.scatter(t2, x=np.log2(t2['conc']), y='am732_d1_dnaid2028_TV5585A', trendline='ols', )

In [None]:
# Count vs expected abundance of the wt barcodes for experiment TV5585A,
# one facet per mouse (rows) and day (columns).
# NOTE(review): `p9` (presumably plotnine) is not imported explicitly in this
# notebook; it has to come from one of the star imports — confirm.
data = lib10_wt[lib10_wt.experiment =='TV5585A']
# Size the figure from the facets that are actually drawn (days x mice in
# `data`), not from every day/mouse present in `lib10_wt`.
x = data.day.nunique()
y = data.mouse.nunique()
p9.options.figure_size = (x*3, y*2.5)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
final

In [None]:
final= final.set_index('gene')
final

In [None]:
lib10_cnt[lib10_cnt.ShortName == 'zwf']

In [None]:
test = vst_d1_results[['mean_CI', 'ci_hits']].merge(final.d1_ci, left_index=True, right_index=True, )
test

In [None]:
compare = vst_d1_results.merge(final, left_index=True, right_index=True).reset_index()
compare = compare[['index', 'padj', 'ci_padj', 'mean_fitness', 'mean_CI', 'hits', 'ci_hits',
                    'd1_fitness_mean', 'd1_ci', 'd1_zscore', 'd1_padj']]
compare.columns = ['index', 'method2_padj', 'method2_padj_ci', 'method2_fitness', 'method2_ci', 'method2_hits', 
                  'method2_ci_hits', 'method1_fitness', 'method1_ci', 'method1_zscore', 'method1_padj']
compare['method1_hits'] = compare['method1_padj'] < 0.05
compare.sample(10)

In [None]:
compare.method2_hits.sum()

In [None]:
fig = px.density_heatmap(compare, x='method1_fitness',y=np.log2(compare.method2_fitness),
                        labels= {'method1_fitness': 'Method 1 Fitness (DESeq log2FoldChange)',
                                'y': 'log2(Method 2 Fitness)'})
fig

In [None]:
lib10_cnt[lib10_cnt.day == 'd0'].head()

In [None]:
lib10_cnt[lib10_cnt.ShortName== 'yneB']

In [None]:
lib10_cnt[(lib10_cnt.ShortName == 'SL1344_0033') & (lib10_cnt.day == 'd0')].cnt.hist(bins=20)

In [None]:
x = final[['d1_fitness_mean', 'num_barcodes']].drop_duplicates()
x[x.num_barcodes>1].sort_values('num_barcodes')

In [None]:
compare

In [None]:
fig = px.scatter(compare, x='method1_zscore', y=compare['method2_ci'],hover_data=['index'], log_y=True,
             template='simple_white', color='method2_ci_hits', symbol='method1_hits',
          labels={'method1_fitness': 'log2FC as calculated by DESeq',
                 'method1_zscore': 'Method 1 Z-Score',
                  'method2_hits': 'Method 2 padj < 0.05',
                  'method1_hits': 'Method 1 padj < 0.05',
                 'index': 'gene'}, trendline='ols')


fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

In [None]:
fig = px.scatter(compare, x='method1_fitness', y=np.log2(compare.method2_fitness),hover_data=['index'], 
           color='method2_hits', symbol = 'method1_hits', template='simple_white', 
          labels={'method1_fitness': 'log2FC as calculated by DESeq',
                 'y': 'log2(Method 2 Fitness)',
                  'method2_hits': 'Method 2 padj < 0.05',
                  'method1_hits': 'Method 1 padj < 0.05',
                 'index': 'gene'})


fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

In [None]:
compare.method2_ci_hits.sum()

In [None]:
fig = px.density_heatmap(compare, x='method1_ci',y='method2_ci')
fig

In [None]:
fig = px.scatter(compare, x='method1_ci', y='method2_ci', color='method2_ci_hits', 
                 log_x=True, log_y=True, hover_data=['index'], 
                template='simple_white', 
                labels={'method1_ci': 'Method 1 CI', 'method2_ci': 'Method 2 CI', 
                       'method2_ci_hits': 'Method 2 CI padj < 0.05', 'index': 'gene'})
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig

In [None]:
vst_fit_d1.sample(10)

In [None]:
vst_d1_results

# Analysis

In [None]:
vst_fit_d1.head()

In [None]:

# Run the VST fitness / CI analysis for every post-inoculation day and
# collect the per-day tables.
days = ['_d1', '_d2', '_d3', '_d4']

fitness_dfs = []
ci_dfs = []
results_dfs = []
wt_fitness = []
ssa_ci = []
ci_notmelted = []
for day in days:
    print(day)
    # get_fitness returns six values; the per-barcode table (first value) is
    # not needed here, so it is discarded.
    _, vst_fit, vst_ci, results, vst_controls, vst_ssa_ci = get_fitness(vst, annotation_df, day,
                                                                        good_samples=good_samples, hits=0.05)

    # Long format: one row per (gene, sample).
    df = vst_fit.reset_index().melt(id_vars=['ShortName'], var_name='sampleID', value_name='Fitness')
    #df['sampleID'] = df['sampleID'].str.replace(day, '')
    df['day'] = day.strip('_')
    fitness_dfs.append(df)

    # Keep the wide CI table as well as its long form.
    ci_notmelted.append(vst_ci)
    df2 = vst_ci.reset_index().melt(id_vars=['ShortName'], var_name='sampleID', value_name='CI')
    #df2['sampleID'] = df2['sampleID'].str.replace(day, '')
    df2['day'] = day.strip('_')
    ci_dfs.append(df2)

    results['day'] = day.strip("_")
    results_dfs.append(results)

    wt_fitness.append(vst_controls)
    ssa_ci.append(vst_ssa_ci)

fitness_df = pd.concat(fitness_dfs)
ci_df = pd.concat(ci_dfs)
results_df = pd.concat(results_dfs)
wt_fitness_df = pd.concat(wt_fitness)
ssa_ci_df = pd.concat(ssa_ci)


In [None]:
# Same per-day analysis as for the VST data, on the rarefied counts.
days = ['_d1', '_d2', '_d3', '_d4']

rare_fitness_dfs = []
rare_ci_dfs = []
rare_results_dfs = []
rare_wt_fitness = []
rare_ssa_ci = []

for day in days:
    print(day)
    # Pass the loop's `day` (not a fixed 'd1') and unpack all six return
    # values; the per-barcode table (first value) is not needed here.
    _, rare_fit, rare_ci, rare_results, rare_controls, rare_ssa = get_fitness(
        rare, annotation_df, day, good_samples=samples, rare=True, hits=0.05)

    df = rare_fit.reset_index().melt(id_vars=['ShortName'], var_name='sampleID', value_name='Fitness')
    #df['sampleID'] = df['sampleID'].str.replace(day, '')
    df['day'] = day.strip('_')
    # Collect into the rare_* lists so the VST results are not mixed in.
    rare_fitness_dfs.append(df)

    df2 = rare_ci.reset_index().melt(id_vars=['ShortName'], var_name='sampleID', value_name='CI')
    #df2['sampleID'] = df2['sampleID'].str.replace(day, '')
    df2['day'] = day.strip('_')
    rare_ci_dfs.append(df2)

    rare_results['day'] = day.strip("_")
    rare_results_dfs.append(rare_results)

    rare_wt_fitness.append(rare_controls)
    rare_ssa_ci.append(rare_ssa)

rare_fitness_df = pd.concat(rare_fitness_dfs)
rare_ci_df = pd.concat(rare_ci_dfs)
rare_results_df = pd.concat(rare_results_dfs)
rare_wt_fitness_df = pd.concat(rare_wt_fitness)
rare_ssa_ci_df = pd.concat(rare_ssa_ci)

In [None]:
fitness_df.to_csv(Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/28_05_results")/"library_10_2.fitness.csv")
ci_df.to_csv(Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/28_05_results")/"library_10_2.ci.csv")
wt_fitness_df.to_csv(Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/28_05_results")/"library_10_2.wt_fitness.csv")
ssa_ci_df.to_csv(Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/28_05_results")/"library_10_2.ssa_ci.csv")

In [None]:
results_df = results_df.reset_index()
rare_results_df = rare_results_df.reset_index()

In [None]:
deseq1

In [None]:
rd1 = results_df[results_df.day == 'd1'].merge(rare_results_df[rare_results_df.day == 'd1'], on='ShortName').merge(deseq1, on='ShortName')
rd1 = rd1.merge(vst_inoculum_cnts, on='ShortName')
rd1['median_inoculum'] = 2**rd1['median_inoculum_value']
rd1.columns= [c.replace('_x', '_vst').replace("_y", '_rare') for c in rd1.columns]
rd1['deseq_hit'] = rd1.ShortName.apply(lambda x: True if x in deseqhits else False)
rd1['rareDeseq'] = rd1['deseq_hit'].astype(int)*3 + rd1['hits_rare'].astype(int)
rd1['rareDeseq'] = rd1['rareDeseq'].replace({0: 'Not a hit', 1: 'RARE hit', 3: 'DESEQ hit', 4: 'DESEQ and RARE hit'})
rd1['vstDeseq'] = rd1['deseq_hit'].astype(int)*3 + rd1['hits_vst'].astype(int)
rd1['vstDeseq'] = rd1['vstDeseq'].replace({0: 'Not a hit', 1: 'VST hit', 3: 'DESEQ hit', 4: 'DESEQ and VST hit'})
fig6 = px.scatter(x=rd1.median_fitness_rare, template="simple_white",
                 labels = {'x': 'Median Fitness','y':'log2FoldChange'},
           y=rd1.log2FoldChange, color=rd1.rareDeseq,  hover_data=[rd1.ShortName], )
#fig.update_xaxes(range=[-10, 5])
fig6

In [None]:
fig5 = px.scatter(x=rd1.median_fitness_vst, 
           y=rd1.log2FoldChange, color=rd1.vstDeseq,  hover_data=[rd1.ShortName], template='simple_white',
                labels = {'x': 'Median Fitness','y':'log2FoldChange'},)
#fig.update_xaxes(range=[-10, 5])
fig5

In [None]:
rd1.head()

In [None]:
fig1 = px.scatter(rd1, x='median_fitness_vst', y='padj_vst', color='hits_vst',
                color_discrete_map={True: px.colors.qualitative.Plotly[1], False:px.colors.qualitative.Plotly[0]}, size='median_inoculum',
                  labels= {'median_fitness_vst': 'Median Fitness', 'padj_vst': 'Adjusted p-value',
                           'hits_vst': 'Adjusted p-vaue < 0.05'},
                  template = 'simple_white',
                  title = 'Fitness (VST Data)',
                      log_y=True,  hover_data={'gene':rd1.ShortName})
fig1.update_yaxes(autorange="reversed")


fig1.show()

In [None]:
fig2 = px.scatter(rd1, x='median_fitness_rare', y='padj_rare', color='hits_rare',
                color_discrete_map={True: px.colors.qualitative.Plotly[1], False:px.colors.qualitative.Plotly[0]}, size='median_inoculum',
                  labels= {'median_fitness_rare': 'Median Fitness', 'padj_rare': 'Adjusted p-value',
                           'hits_rare': 'Adjusted p-vaue < 0.05'},
                  template = 'simple_white',
                  title = 'Fitness (Rarefied Data)',
                      log_y=True,  hover_data={'gene':rd1.ShortName})
fig2.update_yaxes(autorange="reversed")
fig2.show()

In [None]:
rd1.head()

In [None]:
fig3 = px.scatter(rd1, x='median_CI_rare', y='ci_padj_rare', color='ci_hits_rare',
                color_discrete_map={True: px.colors.qualitative.D3[1], False:px.colors.qualitative.D3[0]}, size='median_inoculum',
                  labels= {'median_CI_rare': 'Median CI', 'ci_padj_rare': 'Adjusted p-value',
                           'ci_hits_rare': 'Adjusted p-vaue < 0.05'},
                  title = 'Comptetive Indices (Rarefied Data)',
                  template = 'simple_white',
                      log_y=True,  hover_data={'gene':rd1.ShortName})
fig3.update_yaxes(autorange="reversed")
fig3.show()

In [None]:
((rd1['ci_hits_vst'].astype(int)+rd1['ci_hits_rare'].astype(int))==2).sum()

In [None]:
# Fraction of VST CI hits that are also CI hits in the rarefied analysis.
# NOTE(review): the numerator used to be the literal 311, presumably the
# overlap count printed by the previous cell — confirm. It is computed here
# so the ratio stays correct when the data change.
n_shared_ci_hits = ((rd1['ci_hits_vst'].astype(int)+rd1['ci_hits_rare'].astype(int))==2).sum()
n_shared_ci_hits/rd1['ci_hits_vst'].sum()

In [None]:
fig4 = px.scatter(rd1, x='median_CI_vst', y='ci_padj_vst', color='ci_hits_vst',
                color_discrete_map={True: px.colors.qualitative.D3[1], False:px.colors.qualitative.D3[0]}, size='median_inoculum',
                  labels= {'median_CI_vst': 'Median CI', 'ci_padj_vst': 'Adjusted p-value',
                           'ci_hits_vst': 'Adjusted p-vaue < 0.05'},
                  template = 'simple_white',
                  title = 'Comptetive Indices (VST Data)',
                      log_y=True,  hover_data={'gene':rd1.ShortName})
fig4.update_yaxes(autorange="reversed")
fig4.show()

In [None]:
# Write all six figures into one HTML file. The file is opened in write mode
# (not append) so re-running the cell replaces the file instead of adding a
# second copy of every figure.
with open('/Users/ansintsova/Documents/SushiLab/31-05-2021-Lab-Meeting/volcanos.html', 'w') as f:
    for figure in (fig1, fig2, fig3, fig4, fig5, fig6):
        f.write(figure.to_html(full_html=False, include_plotlyjs='cdn'))

## Visualize Fitness Results : Day 1

### VST


In [None]:
# NOTE(review): `vst_fit_d1` is the second value returned by get_fitness (the
# per-sample gene fitness table), and the merge that would add
# `median_inoculum_value` to it is commented out where it is created. The
# columns used here ('median_fitness', 'padj', 'hits') are produced on
# `vst_d1_results` instead — confirm which frame this plot should use.
fig = px.scatter(vst_fit_d1, x='median_fitness', y='padj',color='hits', size='median_inoculum_value', 
                 log_y=True, log_x=True, hover_data=[vst_fit_d1.index, vst_fit_d1.median_inoculum_value], 
                color_discrete_sequence=['blue', "red"],)
# Reverse the y axis so the smallest adjusted p-values are at the top.
fig.update_yaxes(autorange="reversed")
fig.show()

In [None]:
fig = px.scatter(rare_fit_d1, x='median_fitness', y='padj',color='hits', size='median_inoculum_value', 
                 log_y=True, log_x=True, hover_data=[rare_fit_d1.index, rare_fit_d1.median_inoculum_value], 
                color_discrete_sequence=["red", "blue"])
fig.update_yaxes(autorange="reversed")
fig.show()

## Visualize CI Results :

### VST

In [None]:
fig = px.scatter(vst_ci_d1, x='median_CI', y='ci_padj',color='hits', 
                 log_y=True, log_x=True, hover_data=[vst_ci_d1.index], 
                color_discrete_sequence=['blue', "red"],)
fig.update_yaxes(autorange="reversed")
fig.show()

### Rarefied

In [None]:
fig = px.scatter(rare_ci_d1, x='median_CI', y='ci_padj',color='hits', 
                 log_y=True, log_x=True, hover_data=[rare_ci_d1.index], 
                color_discrete_sequence=['blue', "red"],)
fig.update_yaxes(autorange="reversed")
fig.show()

### Visualize Fitness Results : Day 2
## VST

In [None]:
fig = px.scatter(vst_fit_d1, x='median_fitness', y='padj',color='hits', size='median_inoculum_value', 
                 log_y=True, log_x=True, hover_data=[vst_fit_d1.index, vst_fit_d1.median_inoculum_value], 
                color_discrete_sequence=['blue', "red"],)
fig.update_yaxes(autorange="reversed")
fig.show()

In [None]:
# The day is no longer stripped from sampleID, so each ID splits into four
# fields (assumes IDs of the form mouse_day_dnaid_experiment — TODO confirm).
# ci_df already carries its own 'day' column, so only the other three fields
# are kept to avoid a duplicate column when the two frames are combined.
sample_info = ci_df.sampleID.str.split("_", expand=True)
sample_info.columns = ['mouse', 'day', 'dnaid', 'experiment']
sample_info = sample_info[['mouse', 'dnaid', 'experiment']]

In [None]:
# ci_df is a concatenation of per-day frames, so its index labels repeat;
# merging on the index would pair rows many-to-many. sample_info was derived
# row by row from ci_df, so the two frames are combined by position instead.
ci = pd.concat([ci_df.reset_index(drop=True), sample_info.reset_index(drop=True)], axis=1)

In [None]:
ci.sample(5)

In [None]:
?ci.pivot

In [None]:
sdf = ci[['sampleID', 'mouse', 'day', 'dnaid', 'experiment']].drop_duplicates().set_index('sampleID')
sdf

In [None]:
edf = pd.concat(ci_notmelted, axis=1)

In [None]:
edf.head()

In [None]:
#import skmisc
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA

"""
Plotting PCA elipses:
__author__:
"""

def plot_point_cov(points, nstd=2, ax=None, **kwargs):
    """
    Draw an `nstd`-sigma ellipse summarising a 2-D point cloud.

    The ellipse is centred on the column means of `points` and shaped by
    their covariance matrix; drawing is delegated to `plot_cov_ellipse`.
    Parameters
    ----------
        points : An Nx2 array of the data points.
        nstd : The radius of the ellipse in numbers of standard deviations.
            Defaults to 2 standard deviations.
        ax : The axis that the ellipse will be plotted on. Defaults to the
            current axis.
        Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
        A matplotlib ellipse artist
    """
    centre = points.mean(axis=0)
    covariance = np.cov(points, rowvar=False)
    ellipse = plot_cov_ellipse(covariance, centre, nstd, ax, **kwargs)
    return ellipse


def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Add an `nstd`-sigma error ellipse for covariance matrix `cov` to an axis.

    The ellipse axes follow the eigenvectors of `cov`, with the larger
    eigenvalue giving the width. Extra keyword arguments go to the ellipse
    patch artist.
    Parameters
    ----------
        cov : The 2x2 covariance matrix to base the ellipse on
        pos : The location of the center of the ellipse. Expects a 2-element
            sequence of [x0, y0].
        nstd : The radius of the ellipse in numbers of standard deviations.
            Defaults to 2 standard deviations.
        ax : The axis that the ellipse will be plotted on. Defaults to the
            current axis.
        Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
        A matplotlib ellipse artist
    """
    target_ax = plt.gca() if ax is None else ax

    # Eigen-decomposition, ordered from largest to smallest eigenvalue.
    eigvals, eigvecs = np.linalg.eigh(cov)
    descending = eigvals.argsort()[::-1]
    eigvals = eigvals[descending]
    eigvecs = eigvecs[:, descending]

    # Rotation of the ellipse: angle of the leading eigenvector, in degrees.
    leading_reversed = eigvecs[:, 0][::-1]
    theta = np.degrees(np.arctan2(*leading_reversed))

    # Width and height are "full" widths, not radii.
    full_axes = 2 * nstd * np.sqrt(eigvals)
    width, height = full_axes
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    target_ax.add_artist(ellip)
    return ellip
#____________________________________________________




def plotPCA(pDf, pc1_var, pc2_var, colorby, col, nameby="", el=False):
    """Scatter-plot PC1 vs PC2, one colour per value of ``colorby``.

    Parameters
    ----------
    pDf : DataFrame with ``PC1`` and ``PC2`` columns plus the ``colorby``
        (and, if given, ``nameby``) column.
    pc1_var, pc2_var : values inserted into the axis labels as the percentage
        for each component.
    colorby : column whose unique values define the groups.
    col : sequence of colours; must be at least as long as the number of groups.
    nameby : optional column used to annotate every point with a text label.
    el : if True, draw a 2-sigma covariance ellipse around each group.

    Returns
    -------
    The matplotlib figure created for the plot.
    """
    # NOTE(review): `sns` (presumably seaborn) is not imported explicitly in
    # this notebook; it has to come from one of the star imports — confirm.
    sns.set_style("ticks")
    sns.set_context("notebook", font_scale=2.2)
    group = pDf[colorby].unique()
    # Every group needs its own colour.
    assert len(group) <= len(col)
    fig = plt.figure(figsize=(8, 8))
    for g, c in zip(group, col):
        df = pDf[pDf[colorby] == g]
        x, y = df[["PC1"]].values, df[["PC2"]].values
        ax = plt.scatter(x, y, c=c, s=150, label=g)
        if el:
            # Flatten the (n, 1) coordinate arrays into an (n, 2) point array.
            pts = np.asarray([[float(a), float(b)] for a, b in zip(x, y)])
            plot_point_cov(pts, nstd=2, alpha=0.1, color=c)
        if nameby:
            labels = df[nameby]
            for label, pc1, pc2 in zip(labels, x, y):
                plt.annotate(label, xy=(pc1, pc2), xytext=(-5, 7), textcoords="offset points",fontsize=14)
        # Axis labels and legend are (re)applied on every group iteration.
        plt.xlabel('Principal Component 1, {} %'.format(pc1_var), )
        plt.ylabel('Principal Component 2, {} %'.format(pc2_var), )
        #plt.xticks(fontsize=16)
        #plt.yticks(fontsize=16)
        plt.legend(frameon=True)
    return fig

def find_pc1_pc2(df, meta):
    """Project the columns of ``df`` onto their first two principal components.

    ``df`` is transposed so that each of its columns becomes one observation.
    Returns a tuple ``(table, pc1_var, pc2_var)`` where ``table`` holds the
    ``PC1`` / ``PC2`` scores joined to ``meta`` on the index, and the two
    remaining values are the explained-variance ratios of the components as
    percentages rounded to two decimals.
    """
    observations = df.T
    model = PCA(n_components=2)
    scores = model.fit_transform(observations)

    score_table = pd.DataFrame(data=scores, columns=['PC1', 'PC2'])
    score_table = score_table.set_index(observations.index)

    variance_ratio = model.explained_variance_ratio_
    pc1_var = round(variance_ratio[0] * 100, 2)
    pc2_var = round(variance_ratio[1] * 100, 2)

    annotated = score_table.merge(meta, left_index=True, right_index=True)
    return annotated, pc1_var, pc2_var

In [None]:
pDf, pc1, pc2 = find_pc1_pc2(edf, sdf)

In [None]:
pDf

In [None]:
pca = px.scatter(pDf, x='PC1', y='PC2', color='mouse', symbol='day', 
          color_discrete_sequence= px.colors.qualitative.Dark24, 
                hover_data=[pDf.experiment], )


pca.update_traces(marker=dict(size=14,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers')
                 )

In [None]:
pca.write_html("/Users/ansintsova/Documents/SushiLab/31-05-2021-Lab-Meeting/PCA.html")

In [None]:
sdf = pd.Series(fdf.columns).str.split('_', expand = True)
sdf.columns = ['mouse', 'day', 'dnaid', 'experiment']
sdf = pd.DataFrame(fdf.columns).merge(sdf, left_index=True, right_index=True).set_index(0)

In [None]:
pDf, pc1, pc2 = find_pc1_pc2(fdf, sdf)

In [None]:
?px.scatter