In [None]:
import pandas as pd
from tnseq2.src.analysis import *
from pathlib import Path
import plotnine as p9
import seaborn as sns
from sklearn.decomposition import PCA


import numpy as np
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import chart_studio
import chart_studio.tools as tls
import chart_studio.plotly as py
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()




import dash_bio as dashbio


In [None]:
# Root directory of the analysis inputs/outputs.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root = "/Users/ansintsova/git_repos/nguyenb_tnseq/data/13_04_results"
# Sub-directory (under root) handed to load_files() for the count data.
counts = 'counts'
# Sub-directory (under root) holding the *_merged_results.csv / *_final_results.csv tables.
results = 'results'
# Control-barcode table; read further down with pd.read_table.
control_file = Path(root)/'controls.txt'

In [None]:
dnaids = ['dnaid1315', 'dnaid1428', 'dnaid1429', 'dnaid2015', 'dnaid2016', 'dnaid2017', 'dnaid2018', 'dnaid2019',
         'dnaid2023', 'dnaid2024', 'dnaid2025', 'dnaid2026', 'dnaid2027', 'dnaid2028', 'dnaid2029' ]

def load_merged_results(results_dir):
    """Concatenate every ``*merged_results.csv`` table found in ``results_dir``.

    Parameters
    ----------
    results_dir : str or pathlib.Path
        Directory to scan (non-recursively). Any entry whose file name
        contains ``merged_results.csv`` is read.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the matching files, each read with its
        first column as the index. Files are read in sorted path order.

    Raises
    ------
    ValueError
        If the directory holds no matching file.
    """
    # Accept plain strings too, like load_singltons does.
    results_dir = Path(results_dir)
    # iterdir() yields entries in arbitrary order; sort so the row order is reproducible.
    files = sorted(f for f in results_dir.iterdir() if 'merged_results.csv' in f.name)
    if not files:
        # pd.concat([]) would raise a ValueError anyway; say which directory was empty.
        raise ValueError(f"No '*merged_results.csv' files found in {results_dir}")
    return pd.concat([pd.read_csv(f, index_col=0) for f in files])

def load_singltons(results_dir, singles=None):
    """Load the per-dnaid ``<dnaid>_final_results.csv`` tables and stack them.

    Parameters
    ----------
    results_dir : str or pathlib.Path
        Directory containing the ``*_final_results.csv`` files.
    singles : iterable of str, optional
        dnaid prefixes to load. Defaults to the five dnaids that were
        previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the tables, in the order given by ``singles``. Each
        table gains a ``dnaid`` column, and its unnamed first column (the
        index written by ``to_csv``) is renamed to ``gene``.
    """
    if singles is None:
        singles = ['dnaid1315', 'dnaid1428', 'dnaid1429', 'dnaid2019', 'dnaid2027']
    df_list = []
    for s in singles:
        df = pd.read_csv(Path(results_dir)/f'{s}_final_results.csv').assign(dnaid=s)
        df = df.rename(columns={'Unnamed: 0': 'gene'})
        df_list.append(df)

    return pd.concat(df_list)
    
    
def get_results(results_dir):
    """Return the merged and the single-dnaid result tables as one DataFrame.

    Parameters
    ----------
    results_dir : str or pathlib.Path
        Directory holding both the ``*merged_results.csv`` and the
        ``<dnaid>_final_results.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Merged results followed by the single-dnaid results.
    """
    # Use the argument: the previous version ignored it and always read the
    # module-level root/results directory.
    results_dir = Path(results_dir)
    df = load_merged_results(results_dir)
    df2 = load_singltons(results_dir)
    return pd.concat([df, df2])

# Combined result table: merged results plus the single-dnaid results.
results_df = get_results(Path(root)/results)
# load_files is not defined in this notebook; presumably it comes from the
# star import of tnseq2.src.analysis — verify.
cnt_df = load_files(dnaids, Path(root)/counts)

## Find all experiments that were done with the same library

In [None]:
# For every library, list the experiments it appears in (one entry per distinct
# library/experiment/dnaid combination; rows with a missing value are dropped).
results_df[['library', 'experiment', 'dnaid']].drop_duplicates().dropna().groupby('library').experiment.apply(list)

- Libraries library_10_2 and library_14_2 were each used in 3 different experiments. Use that as starting point
- Let's also look at overlap between those 2 libraries (at a gene level)
    - As shown below, the overlap is not huge, so each library is analysed individually for now. The 632 shared genes can be looked at later.

In [None]:
# Result rows belonging to either of the two libraries of interest.
overlap_10_14 = results_df.copy()
overlap_10_14 = overlap_10_14[overlap_10_14.library.isin(['library_10_2', 'library_14_2'])]

In [None]:
# Number of distinct libraries each gene occurs in; a value > 1 means the gene is in both.
grouped = overlap_10_14.groupby('gene').library.nunique()
print(f"Overlap is {grouped[grouped >1].shape[0]} genes")

In [None]:
# Independent copies of the result rows for each library.
lib10=results_df[results_df.library == 'library_10_2'].copy()
lib14=results_df[results_df.library == 'library_14_2'].copy()

In [None]:
print(f"Library 10_2 has {lib10.gene.nunique()} genes")
print(f"Library 14_2 has {lib14.gene.nunique()} genes")

## Analysis Plan

### PCA: 

- Let's look only at barcodes with at least 1000 reads in every inoculum sample

    - based on raw counts
    - mean normalized relative abundances
    - based on relative abundances
    - based on vst transformed counts
    - based on clr transformed counts
    - based on log2FC for each barcode
    - based on z-score for each gene

## Raw Counts

In [None]:
# Build the barcode x sample raw-count matrix for library_10_2.
lib10_cnt = cnt_df[cnt_df.library == 'library_10_2'].copy()
lib10_cnt = lib10_cnt[['barcode', 'sampleID', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
# Extend sampleID with dnaid and experiment (presumably because the bare
# sampleID is not unique across runs — verify).
lib10_cnt['sampleID'] = lib10_cnt['sampleID']+ "_" + lib10_cnt['dnaid'] + '_' + lib10_cnt['experiment']
# Per-sample metadata, indexed by the extended sampleID.
lib10_sdata = lib10_cnt[['sampleID', 'mouse', 'day', 'tissue', 'dnaid', 'experiment']].set_index('sampleID').drop_duplicates()
lib10_cnt = lib10_cnt.pivot(index='barcode', columns='sampleID', values='cnt')
# Barcodes without a row for a given sample get a count of 0.
lib10_cnt = lib10_cnt.fillna(0)
# Keep only barcodes with >= 1000 reads in every inoculum sample.
columns_to_filter = [f for f in lib10_cnt.columns if 'inoculum' in f]
if not columns_to_filter:
    # With no inoculum column the .all() below is vacuously True and the
    # filter would silently keep every barcode.
    raise ValueError("No inoculum samples found for library_10_2; cannot apply the inoculum count filter")
lib10_cnt = lib10_cnt[(lib10_cnt[columns_to_filter] >= 1000).all(axis=1)]

## Scaled Raw Counts

In [None]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Relative abundance: every sample (column) is divided by its own total.
lib10_relab = lib10_cnt.div(lib10_cnt.sum(axis=0), axis='columns')
# The scaler works column-wise, so transpose to make each barcode a column,
# scale, and transpose back to barcode x sample.
lib10_cnt_robust = pd.DataFrame(
    scaler.fit_transform(lib10_relab.T).T,
    index=lib10_cnt.index,
    columns=lib10_cnt.columns,
)
lib10_cnt_robust.head()
#pDf_robust, pc1_robust, pc2_robust = find_pc1_pc2(lib10_cnt_robust, lib10_sdata)
#plotPCA(pDf_robust, pc1_robust, pc2_robust, colorby='experiment', nameby='mouse', col=list(sns.color_palette()));

In [None]:
# Same scaled matrix without the inoculum samples (columns whose name contains 'inoculum').
lib10_cnt_robust_noinoc = lib10_cnt_robust[[c for c in lib10_cnt_robust.columns if 'inoculum' not in c]]
#pDf_robust2, pc1_robust2, pc2_robust2 = find_pc1_pc2(lib10_cnt_robust_noinoc, lib10_sdata)
#plotPCA(pDf_robust2, pc1_robust2, pc2_robust2, colorby='day', nameby='mouse', col=list(sns.color_palette()));

## Proportions

- Proportions by themselves are not very informative 
- With RobustScaler can see separation between inoculum and the rest of the samples

In [None]:

#pDf_relab, pc1_relab, pc2_relab = find_pc1_pc2(lib10_relab, lib10_sdata)
#plotPCA(pDf_relab, pc1_relab, pc2_relab, colorby='experiment', nameby='mouse', col=list(sns.color_palette()));

## CLR Transformed Data
- CLR and VST transformed data show very similar results
- Potentially identify mice that are outliers and should be removed? 
- Everything else clusters together

In [None]:
import numpy as np
from skbio.stats.composition import clr
# Centred log-ratio transform applied to each column (sample) separately; the
# +1 pseudocount keeps the zero counts introduced by fillna(0) out of the log.
lib10_clr = lib10_cnt.apply(lambda x: clr(x+1))

#pDf_clr, pc1_clr, pc2_clr = find_pc1_pc2(lib10_clr, lib10_sdata)
#plotPCA(pDf_clr, pc1_clr, pc2_clr, colorby='day', nameby='mouse', col=list(sns.color_palette()));

## VST Normalized Data

In [None]:
# Export sample metadata (sdf) and the count matrix (edf), with the count
# columns put in the same order as the metadata rows.
sdf = lib10_sdata
edf = lib10_cnt[list(sdf.index)]
sdf.to_csv(Path(root)/results/'lib10_sdf.csv')
edf.to_csv(Path(root)/results/'lib10_edf.csv')
# Run DESeq2 outside this notebook on the two files written above.
# NOTE(review): no visible cell writes lib10_vsd.csv, so it must already exist
# before the next line runs.
lib10_vst = pd.read_csv(Path(root)/results/'lib10_vsd.csv').rename({'Unnamed: 0':'barcode'}, axis=1).set_index('barcode')
#pDf_vst, pc1_vst, pc2_vst = find_pc1_pc2(lib10_vst, lib10_sdata)

#plotPCA(pDf_vst, pc1_vst, pc2_vst, colorby='day', nameby='mouse', col=list(sns.color_palette()));

In [None]:
# Mice treated as outliers.
outliers = ['am732', 'am730', 'am484']
# Drop every sample whose ID contains one of the outlier mouse names; the
# filter is driven by the list above instead of repeating the names by hand.
lib10_clr_nooutliers = lib10_clr[[c for c in lib10_clr.columns if not any(o in c for o in outliers)]]
#pDf_clrNO, pc1_clrNO, pc2_clrNO = find_pc1_pc2(lib10_clr_nooutliers, lib10_sdata)
#plotPCA(pDf_clrNO, pc1_clrNO, pc2_clrNO, colorby='day', nameby='mouse', col=list(sns.color_palette()));

In [None]:
import matplotlib.pyplot as plt

# VST value against log2(count + 1) for one sample, one black dot per barcode.
# NOTE(review): this cell uses the *_d2_* sample while the histogram cells
# below use *_d1_* — confirm which one is intended.
plt.plot(lib10_vst.ad926_d2_dnaid2017_TV4592A, np.log2(lib10_cnt.ad926_d2_dnaid2017_TV4592A+1), '.k')

In [None]:
lib10_relab.ad926_d1_dnaid2017_TV4592A.hist(bins=50)

In [None]:
lib10_clr.ad926_d1_dnaid2017_TV4592A.hist(bins=50)

In [None]:
lib10_vst.ad926_d1_dnaid2017_TV4592A.hist(bins=50)

In [None]:
def find_pc1_pc2(df, meta):
    """Project the columns of ``df`` onto their first two principal components.

    ``df`` is transposed before fitting, so each of its columns is treated as
    one observation and each row as one feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature x observation matrix.
    meta : pandas.DataFrame
        Metadata joined to the scores on the index (inner merge).

    Returns
    -------
    tuple
        ``(scores_with_meta, pc1_var, pc2_var)``: the PC1/PC2 scores merged
        with ``meta``, and the percentage of variance explained by each
        component, rounded to two decimals.
    """
    samples = df.T
    model = PCA(n_components=2)
    scores = pd.DataFrame(
        model.fit_transform(samples),
        columns=['PC1', 'PC2'],
        index=samples.index,
    )
    pc1_var, pc2_var = (round(ratio * 100, 2) for ratio in model.explained_variance_ratio_[:2])
    annotated = scores.merge(meta, left_index=True, right_index=True)
    return annotated, pc1_var, pc2_var


def plotPCA(pDf, pc1_var, pc2_var, colorby, col, nameby="", el=False):
    """Scatter plot of PC1 vs PC2, one colour per level of ``colorby``.

    Parameters
    ----------
    pDf : pandas.DataFrame
        Must contain ``PC1``, ``PC2``, ``colorby`` and, if given, ``nameby``.
    pc1_var, pc2_var : float
        Percent variance explained, shown in the axis labels.
    colorby : str
        Column whose unique values define the groups.
    col : sequence
        Colours, at least one per group; paired with the groups in order.
    nameby : str, optional
        Column used to label every point. Empty string disables labels.
    el : bool, optional
        If true, call ``plot_point_cov`` for each group. That helper is not
        defined in this notebook; presumably it comes from the star import.

    Returns
    -------
    matplotlib.figure.Figure
    """
    sns.set_style("ticks")
    sns.set_context("notebook", font_scale=2.2)
    group = pDf[colorby].unique()
    assert len(group) <= len(col), "fewer colours than groups to plot"
    fig, ax = plt.subplots(figsize=(25, 15))
    for g, c in zip(group, col):
        df = pDf[pDf[colorby] == g]
        # 1-D arrays, so each annotate() call below receives plain scalars.
        x, y = df["PC1"].to_numpy(), df["PC2"].to_numpy()
        # color= rather than c=: a bare RGB(A) tuple passed as c= is ambiguous
        # with per-point values when a group has exactly 3 or 4 points.
        ax.scatter(x, y, color=c, s=150, label=g)
        if el:
            pts = np.asarray([[float(a), float(b)] for a, b in zip(x, y)])
            plot_point_cov(pts, nstd=2, alpha=0.1, color=c)
        if nameby:
            labels = df[nameby]
            for label, pc1, pc2 in zip(labels, x, y):
                ax.annotate(label, xy=(pc1, pc2), xytext=(-5, 7), textcoords="offset points", fontsize=14)
    # Axis labels and legend are set once, after all groups are drawn.
    ax.set_xlabel('Principal Component 1, {} %'.format(pc1_var))
    ax.set_ylabel('Principal Component 2, {} %'.format(pc2_var))
    if len(group):
        ax.legend(frameon=True)
    return fig


In [None]:
lib10_vst.T.head()

In [None]:
# p9.options.figure_size=(10,15)
# g3 =(p9.ggplot(data=fdf,
#            mapping=p9.aes(x='PC1', y='PC2', color='strain', ))
#     + p9.geom_point(size=3)
#      + p9.facet_wrap("~genotype", ncol=2)
#     + p9.theme_bw()
#     + p9.theme(text=p9.element_text(size=14))
# )

## Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, ward

In [None]:
# Ward linkage over the samples (rows of the transposed CLR matrix).
# NOTE(review): linkage_array is not used by any later visible cell.
linkage_array = ward(lib10_clr.T)

In [None]:
# Sample x barcode view of the CLR matrix with its row and column labels.
# NOTE(review): rows/cols are not used later, and `test` is rebound to a
# cnt_df subset near the end of the notebook.
test = lib10_clr.T
rows = list(test.index)
cols = list(test.columns)

In [None]:
lib10_clr.corr()

In [None]:
?dashbio.Clustergram

In [None]:
lib10_clr.corr()

In [None]:
# Sample-by-sample correlation of the CLR values, computed once and reused
# for the data and for both label lists.
lib10_clr_corr = lib10_clr.corr()
clustergram= dashbio.Clustergram(
    data=lib10_clr_corr.values,
    return_computed_traces=True,
    row_labels=list(lib10_clr_corr.index),
    column_labels=list(lib10_clr_corr.columns),
#     color_threshold={
#         'row': 250,
#         'col': 700
#     },
    #link_fun = scipy.cluster.hierarchy.ward, 
    height=1000,
    width=1200,
    hidden_labels='column',
    standardize ='none',
    generate_curves_dict=True,
)


In [None]:
sns.set(font_scale=.8)
# One colour per mouse. The palette is sized from the data: with a fixed size,
# zip() would silently leave any extra mouse without a colour.
mouse_ids = lib10_sdata.mouse.unique()
lut = dict(zip(mouse_ids, sns.color_palette("hls", len(mouse_ids))))
row_colors = lib10_sdata.mouse.map(lut)

# Samples (rows) clustered on log2(count + 1), average linkage, correlation distance.
sns.clustermap(np.log2(lib10_cnt +1).T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")

In [None]:
sns.set(font_scale=.8)
# One colour per mouse. The palette is sized from the data: with a fixed size,
# zip() would silently leave any extra mouse without a colour.
mouse_ids = lib10_sdata.mouse.unique()
lut = dict(zip(mouse_ids, sns.color_palette("hls", len(mouse_ids))))
row_colors = lib10_sdata.mouse.map(lut)

# Samples (rows) clustered on the VST values, average linkage, correlation distance.
sns.clustermap(lib10_vst.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")

In [None]:
sns.set(font_scale=.8)
# One colour per experiment, with the palette sized from the data.
experiment_ids = lib10_sdata.experiment.unique()
lut = dict(zip(experiment_ids, sns.color_palette("hls", len(experiment_ids))))
row_colors = lib10_sdata.experiment.map(lut)

# Samples (rows) clustered on the CLR values, average linkage, correlation distance.
sns.clustermap(lib10_clr.T, method='average', metric='correlation',
               figsize=(20, 20), row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap='vlag')

In [None]:
sns.set(font_scale=.8)
# One colour per day, with the palette sized from the data.
day_ids = lib10_sdata.day.unique()
lut = dict(zip(day_ids, sns.color_palette("hls", len(day_ids))))
row_colors = lib10_sdata.day.map(lut)

# Samples (rows) clustered on the CLR values, average linkage, correlation distance.
sns.clustermap(lib10_clr.T, method='average', metric='correlation', figsize=(20, 20), row_colors=row_colors,
               cmap='vlag', dendrogram_ratio=(0.5,0.2))

In [None]:
# sns.set(font_scale=.8)
# lut = dict(zip(lib10_sdata.day.unique(), sns.color_palette("hls", 5)))
# row_colors = lib10_sdata.day.map(lut)

# sns.clustermap(lib10_clr.T, method='ward', figsize=(20, 20), row_colors=row_colors, dendrogram_ratio=(0.5,0.2))

- Check whether the outlier mice have stronger bottlenecks. That would be surprising, because these mice already differ from day 1.

## Log2FC for each barcode

- Can't get any meaningful signal here
- Re-run analysis without shrinkage

In [None]:
# Result rows for library_10_2, reduced to the identifier columns plus every
# column whose name contains 'fitness_mean'; rows with any missing value are dropped.
lib10_res = results_df[results_df.library == 'library_10_2'].copy()
lib10_res = lib10_res[['gene', 'experiment', 'dnaid'] + [c for c in lib10_res.columns if 'fitness_mean' in c]].dropna()


# Long format: one row per gene / experiment / dnaid / fitness column.
lib10_res = lib10_res.melt(id_vars=['gene', 'experiment', 'dnaid'], var_name='day', value_name='fitness')
# Genes with |fitness| > 1 in at least one row.
lib10_res_samples = lib10_res[abs(lib10_res.fitness) > 1].gene.values
# The day label is the part of the fitness column name before the first underscore.
lib10_res['day'] = lib10_res['day'].str.split("_", expand=True)[0]
# Genes with more than 3 rows for some (gene, day) pair are excluded below.
weird_cases = lib10_res.groupby(['gene', 'day']).experiment.count().reset_index()
weird_cases = weird_cases[weird_cases.experiment > 3].gene.values
lib10_res = lib10_res[lib10_res.gene.isin(lib10_res_samples)]
lib10_res = lib10_res[~lib10_res.gene.isin(weird_cases)].drop_duplicates()

# Gene x sample fitness matrix; the final dropna keeps only genes that have a
# value in every sample column.
lib10_res['sampleID'] = lib10_res.experiment +"_" + lib10_res.dnaid + '_' + lib10_res.day
lib10_res = lib10_res.pivot(index='gene', columns='sampleID', values='fitness').dropna()

In [None]:
sns.set(font_scale=.8)
# Columns of lib10_res are "<experiment>_<dnaid>_<day>". The row_colors left
# over from the earlier clustermap cells is indexed by the count-table sample
# IDs ("<sampleID>_<dnaid>_<experiment>") and cannot be aligned with them, so
# colours are built here from the experiment part of each label.
res_experiment = lib10_res.columns.to_series().str.rsplit('_', n=2).str[0]
res_lut = dict(zip(res_experiment.unique(), sns.color_palette("hls", res_experiment.nunique())))
res_row_colors = res_experiment.map(res_lut)
sns.clustermap(lib10_res.T, method='ward',
               figsize=(20, 20), row_colors=res_row_colors, dendrogram_ratio=(0.5,0.2))

## Control barcodes

In [None]:
controls = pd.read_table(control_file, index_col=0, names = ['barcode', 'phenotype', 'conc'])
# index_col=0 makes 'barcode' the index rather than a column, so the barcodes
# are read from the index (controls.barcode is not available as an attribute).
controls_bc = controls.index.values
# CLR values of the control barcodes. Later cells use lib10_cntrl_clr, so it
# has to be defined. Only barcodes still present in lib10_clr are selected,
# because .loc raises KeyError for labels removed by the inoculum filter.
lib10_cntrl_clr = lib10_clr.loc[lib10_clr.index.intersection(controls_bc)]
#lib10_cntrl_vst = lib10_vst.loc[controls_bc]

In [None]:
# Barcodes of the wild-type controls ('barcode' is the index of `controls`).
wt_cntrs = controls[controls.phenotype == 'wt'].reset_index()['barcode'].values
# Select only the wt controls that are present in the matrix; .loc with a
# missing label would raise KeyError.
lib10_wt_cntrl_clr = lib10_cntrl_clr.loc[lib10_cntrl_clr.index.intersection(wt_cntrs)]

In [None]:
# import matplotlib as mpl
# sns.set(font_scale=.8)
# lut = dict(zip(lib10_sdata.mouse.unique(), sns.color_palette("hls", 19)))
# row_colors = lib10_sdata.mouse.map(lut)
# #controls = controls.set_index('barcode')
# lut2 = dict(zip(controls.phenotype.unique(), sns.color_palette("hls", 5)))
# col_colors = controls.phenotype.map(lut2)

# g = sns.clustermap(lib10_cntrl_vst.T, method='average', metric='correlation', figsize=(20, 20), 
#                row_colors=row_colors,col_cluster=False, col_colors=col_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")


# # Creating Legend
# #function_legend = [mpl.patches.Patch(color=c, label=l) for l,c in function_to_color.items()]
# pg_legend = [mpl.patches.Patch(color=c, label=l) for l,c in lut2.items()]

# # Displaying function legend
# l2 = g.ax_heatmap.legend(handles=pg_legend, bbox_to_anchor=(-0.5,1.2), frameon=True)

In [None]:
import matplotlib as mpl
sns.set(font_scale=.8)
# Row colours: one per mouse. Palettes are sized from the data so that no
# level is left without a colour.
mouse_ids = lib10_sdata.mouse.unique()
lut = dict(zip(mouse_ids, sns.color_palette("hls", len(mouse_ids))))
row_colors = lib10_sdata.mouse.map(lut)
#controls = controls.set_index('barcode')
# Column colours: one per control phenotype.
phenotypes = controls.phenotype.unique()
lut2 = dict(zip(phenotypes, sns.color_palette("hls", len(phenotypes))))
col_colors = controls.phenotype.map(lut2)

# Samples (rows) are clustered; control barcodes (columns) keep their order.
g = sns.clustermap(lib10_cntrl_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors,col_cluster=False, col_colors=col_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")


# Creating Legend
#function_legend = [mpl.patches.Patch(color=c, label=l) for l,c in function_to_color.items()]
pg_legend = [mpl.patches.Patch(color=c, label=l) for l,c in lut2.items()]

# Displaying the phenotype legend
l2 = g.ax_heatmap.legend(handles=pg_legend, bbox_to_anchor=(-0.5,1.2), frameon=True)

In [None]:
import matplotlib as mpl
sns.set(font_scale=.8)
# Row colours: one per mouse, with the palette sized from the data.
mouse_ids = lib10_sdata.mouse.unique()
lut = dict(zip(mouse_ids, sns.color_palette("hls", len(mouse_ids))))
row_colors = lib10_sdata.mouse.map(lut)
#controls = controls.set_index('barcode')
# Column colours are defined here so the cell does not rely on lut2/col_colors
# left over from another cell.
phenotypes = controls.phenotype.unique()
lut2 = dict(zip(phenotypes, sns.color_palette("hls", len(phenotypes))))
col_colors = controls.phenotype.map(lut2)

# wt controls only: samples (rows) are clustered, barcodes (columns) keep their order.
g = sns.clustermap(lib10_wt_cntrl_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors,col_cluster=False, col_colors=col_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")


# Creating Legend
#function_legend = [mpl.patches.Patch(color=c, label=l) for l,c in function_to_color.items()]
pg_legend = [mpl.patches.Patch(color=c, label=l) for l,c in lut2.items()]

# Displaying the phenotype legend
l2 = g.ax_heatmap.legend(handles=pg_legend, bbox_to_anchor=(-0.5,1.2), frameon=True)

In [None]:
import matplotlib as mpl
sns.set(font_scale=.8)
# Row colours: one per day. Palettes are sized from the data so that no level
# is left without a colour.
day_ids = lib10_sdata.day.unique()
lut = dict(zip(day_ids, sns.color_palette("hls", len(day_ids))))
row_colors = lib10_sdata.day.map(lut)
#controls = controls.set_index('barcode')
# Column colours: one per control phenotype.
phenotypes = controls.phenotype.unique()
lut2 = dict(zip(phenotypes, sns.color_palette("hls", len(phenotypes))))
col_colors = controls.phenotype.map(lut2)

# Samples (rows) are clustered; control barcodes (columns) keep their order.
g = sns.clustermap(lib10_cntrl_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors,col_cluster=False, col_colors=col_colors, dendrogram_ratio=(0.5,0.2), cmap ="vlag")


# Creating Legend
#function_legend = [mpl.patches.Patch(color=c, label=l) for l,c in function_to_color.items()]
pg_legend = [mpl.patches.Patch(color=c, label=l) for l,c in lut2.items()]

# Displaying the phenotype legend
l2 = g.ax_heatmap.legend(handles=pg_legend, bbox_to_anchor=(-0.5,1.2), frameon=True)

In [None]:
# Count rows of the wt control barcodes in library_10_2.
lib10_wt = (cnt_df[(cnt_df.phenotype == 'wt')&(cnt_df.library == 'library_10_2')]
            .copy()[['barcode','cnt', 'conc', 'sampleName', 'experiment', 'dnaid', 'sampleID', 'day', 'mouse']])

# NOTE(review): log2 gives -inf for any cnt of 0 — confirm counts are strictly positive.
lib10_wt['logcnt'] = np.log2(lib10_wt['cnt'])
# corr_df = lib14_wt.groupby(['experiment','sampleID'])[['conc', 'logcnt']].corr().reset_index()
# corr_df = corr_df[corr_df['level_2'] == 'conc'].drop(['level_2', 'conc'], axis=1)
# corr_df.columns = ['experiment','sampleID', 'R']
# corr_df['Rlab'] = corr_df.R.apply(lambda x: f'R = {round(x, 2)}')
# lib14_wt = lib14_wt.merge(corr_df, on=['experiment', 'sampleID'])

# Experiments available for this library (used to pick the plots below).
lib10_wt.experiment.unique()

In [None]:
# wt control barcodes of experiment TV4592A: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib10_wt[lib10_wt.experiment =='TV4592A']
# NOTE(review): the figure size is derived from all of lib10_wt, not only from
# the rows of this experiment — confirm that is intended.
x = lib10_wt.day.nunique()
y = lib10_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*3)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
# wt control barcodes of experiment TV5563A: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib10_wt[lib10_wt.experiment =='TV5563A']
# NOTE(review): the figure size is derived from all of lib10_wt, not only from
# the rows of this experiment — confirm that is intended.
x = lib10_wt.day.nunique()
y = lib10_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*3)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
# wt control barcodes of experiment TV5585A: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib10_wt[lib10_wt.experiment =='TV5585A']
# NOTE(review): the figure size is derived from all of lib10_wt, and the height
# factor here is 2.5 where the sibling cells use 3 — confirm both are intended.
x = lib10_wt.day.nunique()
y = lib10_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*2.5)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

# Library 14_2

In [None]:
# Build the barcode x sample raw-count matrix for library_14_2.
lib14_cnt = cnt_df[cnt_df.library == 'library_14_2'].copy()
lib14_cnt = lib14_cnt[['barcode', 'sampleID', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'cnt']].drop_duplicates()
# Extend sampleID with dnaid and experiment (presumably because the bare
# sampleID is not unique across runs — verify).
lib14_cnt['sampleID'] = lib14_cnt['sampleID']+ "_" + lib14_cnt['dnaid'] + '_' + lib14_cnt['experiment']
# Per-sample metadata, indexed by the extended sampleID.
lib14_sdata = lib14_cnt[['sampleID', 'mouse', 'day', 'tissue', 'dnaid', 'experiment']].set_index('sampleID').drop_duplicates()
lib14_cnt = lib14_cnt.pivot(index='barcode', columns='sampleID', values='cnt')
# Barcodes without a row for a given sample get a count of 0.
lib14_cnt = lib14_cnt.fillna(0)
# Keep only barcodes with >= 1000 reads in every inoculum sample.
columns_to_filter = [f for f in lib14_cnt.columns if 'inoculum' in f]
if not columns_to_filter:
    # With no inoculum column the .all() below is vacuously True and the
    # filter would silently keep every barcode.
    raise ValueError("No inoculum samples found for library_14_2; cannot apply the inoculum count filter")
lib14_cnt = lib14_cnt[(lib14_cnt[columns_to_filter] >= 1000).all(axis=1)]

## VST Normalized Data

In [None]:
# Export sample metadata (sdf14) and the count matrix (edf14), with the count
# columns put in the same order as the metadata rows.
sdf14 = lib14_sdata
edf14 = lib14_cnt[list(sdf14.index)]
sdf14.to_csv(Path(root)/results/'lib14_sdf.csv')
edf14.to_csv(Path(root)/results/'lib14_edf.csv')
# Run DESeq2 outside this notebook on the two files written above.
# NOTE(review): no visible cell writes lib14_vsd.csv, so it must already exist
# before the next line runs.
lib14_vst = pd.read_csv(Path(root)/results/'lib14_vsd.csv').rename({'Unnamed: 0':'barcode'}, axis=1).set_index('barcode')
# PCA of the VST values, coloured by experiment and labelled by day.
pDf_vst, pc1_vst, pc2_vst = find_pc1_pc2(lib14_vst, lib14_sdata)

plotPCA(pDf_vst, pc1_vst, pc2_vst, colorby='experiment', nameby='day', col=list(sns.color_palette()));

In [None]:
lib10_sdata.mouse.nunique()

## CLR Data Transformation

In [None]:
import numpy as np
from skbio.stats.composition import clr
# Centred log-ratio transform applied to each column (sample) separately; the
# +1 pseudocount keeps the zero counts introduced by fillna(0) out of the log.
lib14_clr = lib14_cnt.apply(lambda x: clr(x+1))

# PCA of the CLR values, coloured by experiment and labelled by mouse.
pDf_clr, pc1_clr, pc2_clr = find_pc1_pc2(lib14_clr, lib14_sdata)

plotPCA(pDf_clr, pc1_clr, pc2_clr, colorby='experiment', nameby='mouse', col=list(sns.color_palette()));

In [None]:
sns.set(font_scale=.8)
# One colour per mouse. The palette is sized from the data: with a fixed size,
# zip() would silently leave any extra mouse without a colour.
mouse_ids = lib14_sdata.mouse.unique()
lut = dict(zip(mouse_ids, sns.color_palette("hls", len(mouse_ids))))
row_colors = lib14_sdata.mouse.map(lut)

# Samples (rows) clustered on the CLR values, average linkage, correlation distance.
sns.clustermap(lib14_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap='vlag')

In [None]:
sns.set(font_scale=.8)
# One colour per day, with the palette sized from the data.
day_ids = lib14_sdata.day.unique()
lut = dict(zip(day_ids, sns.color_palette("hls", len(day_ids))))
row_colors = lib14_sdata.day.map(lut)

# Samples (rows) clustered on the CLR values, average linkage, correlation distance.
sns.clustermap(lib14_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap='vlag')

In [None]:
sns.set(font_scale=.8)
# One colour per experiment, with the palette sized from the data.
experiment_ids = lib14_sdata.experiment.unique()
lut = dict(zip(experiment_ids, sns.color_palette("hls", len(experiment_ids))))
row_colors = lib14_sdata.experiment.map(lut)

# Samples (rows) clustered on the CLR values, average linkage, correlation distance.
sns.clustermap(lib14_clr.T, method='average', metric='correlation', figsize=(20, 20),
               row_colors=row_colors, dendrogram_ratio=(0.5,0.2), cmap='vlag')

In [None]:
# Count rows of the wt control barcodes in library_14_2.
lib14_wt = (cnt_df[(cnt_df.phenotype == 'wt')&(cnt_df.library == 'library_14_2')]
            .copy()[['barcode','cnt', 'conc', 'sampleName', 'experiment', 'dnaid', 'sampleID', 'day', 'mouse']])

# NOTE(review): log2 gives -inf for any cnt of 0 — confirm counts are strictly positive.
lib14_wt['logcnt'] = np.log2(lib14_wt['cnt'])
# Correlation matrix of conc and logcnt per (experiment, sampleID);
# reset_index exposes the matrix row label as column 'level_2'.
corr_df = lib14_wt.groupby(['experiment','sampleID'])[['conc', 'logcnt']].corr().reset_index()
# Keep the 'conc' row and drop its self-correlation: the remaining 'logcnt'
# column is the conc-vs-logcnt correlation.
corr_df = corr_df[corr_df['level_2'] == 'conc'].drop(['level_2', 'conc'], axis=1)
corr_df.columns = ['experiment','sampleID', 'R']
# Text label for the plots, e.g. "R = 0.97".
corr_df['Rlab'] = corr_df.R.apply(lambda x: f'R = {round(x, 2)}')
lib14_wt = lib14_wt.merge(corr_df, on=['experiment', 'sampleID'])

# Experiments available for this library (used to pick the plots below).
lib14_wt.experiment.unique()

In [None]:
# wt control barcodes of experiment TV4592B: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib14_wt[lib14_wt.experiment =='TV4592B']
# NOTE(review): the figure size is derived from all of lib14_wt, not only from
# the rows of this experiment — confirm that is intended.
x = lib14_wt.day.nunique()
y = lib14_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*3)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
# wt control barcodes of experiment TV5490C: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib14_wt[lib14_wt.experiment =='TV5490C']
# NOTE(review): the figure size is derived from all of lib14_wt, not only from
# the rows of this experiment — confirm that is intended.
x = lib14_wt.day.nunique()
y = lib14_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*3)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
# wt control barcodes of experiment TV5536B: count against conc (x-axis labelled
# "Expected Abundance"), both on log10 axes, with a linear fit; one panel per
# mouse (rows) and day (columns).
data = lib14_wt[lib14_wt.experiment =='TV5536B']
# NOTE(review): the figure size is derived from all of lib14_wt, not only from
# the rows of this experiment — confirm that is intended.
x = lib14_wt.day.nunique()
y = lib14_wt.mouse.nunique()
p9.options.figure_size = (x*3, y*3)
g = (p9.ggplot(data, p9.aes(x='conc', y='cnt'))
  + p9.geom_point()
  + p9.geom_smooth(method="lm")
  + p9.theme_classic()
  + p9.theme(text=p9.element_text(size=14),
             axis_text_x=p9.element_text(rotation=90, hjust=1))
    #+ p9.geom_text(p9.aes(label='Rlab', x=0.0001, y=.1))
  + p9.ylab("Count")
  + p9.xlab("Expected Abundance")
  + p9.scale_y_log10()
  + p9.scale_x_log10()
  + p9.facet_grid('mouse~day'))
g

In [None]:
# All wt control rows of library_14_2 at once: cnt against conc on linear axes,
# one panel per day (rows) and mouse (columns).
(p9.ggplot(p9.aes(x='conc', y='cnt', ),data=lib14_wt)
+ p9.geom_point()
+p9.facet_grid('day ~ mouse'))

In [None]:
lib14_cnt.sample(5)

### Steps
- Get genes that are present in all inoculum samples. 
    - Number of samples is 10, choose genes that have sampleName.nunique == 10
- For each gene, if # of barcodes > 1, 
    - pick barcode that is present in all samples
    - pick a barcode with max count

In [None]:
# Steps

# Count rows of library_14_2 restricted to day 'd0'.
# NOTE(review): `test` was bound to lib10_clr.T earlier; it is rebound here.
test = cnt_df[cnt_df.library == 'library_14_2']
test = test[test.day == 'd0']
# Number of distinct d0 samples.
test.sampleName.nunique()

In [None]:
# Number of distinct d0 samples, taken from the data instead of the
# hard-coded 10 so the cell still works if the sample set changes.
n_d0_samples = test.sampleName.nunique()
# Number of distinct samples each gene (ShortName) is seen in.
t2 = test.groupby(['ShortName']).agg({'sampleName':['nunique']}).reset_index()
t2.columns = ['gene', 'num_samples']
# Keep only the genes seen in every d0 sample.
t2 = t2[t2.num_samples == n_d0_samples].gene.values
test = test[test.ShortName.isin(t2)]

In [None]:
# Number of distinct barcodes per gene, then split the gene names into those
# with several barcodes (m_bc_per_gene) and those with exactly one.
one_bc_per_gene = test.groupby('ShortName').barcode.nunique().reset_index()
m_bc_per_gene = one_bc_per_gene[one_bc_per_gene.barcode > 1].ShortName.values
one_bc_per_gene = one_bc_per_gene[one_bc_per_gene.barcode == 1].ShortName.values

In [None]:
len(m_bc_per_gene)

In [None]:
len(one_bc_per_gene)

In [None]:
test = test[test.ShortName.isin(m_bc_per_gene)]

In [None]:
test.sample(10)[['barcode', 'cnt', 'ShortName', 'sampleName', 'experiment', 'sampleID', 'dnaid']]

In [None]:
# Number of distinct samples each (gene, barcode) pair is seen in; keep the
# barcodes seen in exactly 10 samples.
# NOTE(review): 10 is the hard-coded d0 sample count mentioned in the Steps notes.
x = test.groupby(['ShortName', 'barcode']).sampleName.nunique().reset_index()
x = x[x.sampleName == 10].barcode.values

In [None]:
test = test[test.barcode.isin(x)]

In [None]:
# Median count of every remaining barcode, grouped by gene.
test.groupby(['ShortName', 'barcode']).agg({'cnt':['median']})
# pick the one with the largest median (the aggregation above is the median, not the mean)

In [None]:
test[test.ShortName == 'ytfL']