In [None]:
import pandas as pd
import plotnine as p9
from pathlib import Path
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotnine as p9

import skmisc
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA
from diffexpr.py_deseq import py_DESeq2


from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')

from scipy.stats import norm

# Table of Contents: <a name='top' /> 

## 1. <a href=#outliers>Looking for outliers</a>
## 2. <a href=#timecourse>Tracking barcode timecourse</a>
## 3.  <a href=#DEseq>DEseq analysis</a>



## Loading the data:

- Run the following cells before any of the analyses

In [None]:
# Get all the count/library data

# Loads controls, samples, and directories with data

results_dir = "../../data/processed/results"
controls = pd.read_table('../../data/metadata/controls.txt', header=None, 
                        names = ['DN', 'barcode', 'phenotype', 'conc'])
meta_dir = "../../data/metadata"


# One name per entry found directly inside results_dir (suffix stripped by .stem).
samples = [f.stem for f in Path(results_dir).iterdir()]
samples

In [None]:
def read_count_files(sample, exp, results_dir=results_dir):
    """Read one count table and return it in long (one row per barcode x sample) form.

    The file ``<results_dir>/<sample>/<sample>_counts_<exp>.txt`` is read as a
    space-separated table; its row index is turned into the ``barcode`` column.

    Parameters
    ----------
    sample : name used for both the sub-directory and the file prefix.
    exp : experiment id; part of the file name and stored in the ``exp`` column.
    results_dir : directory holding the per-sample result folders.

    Returns
    -------
    DataFrame with the annotation columns, ``sample``, ``cnts``, ``proportion``
    (count divided by the total count of that sample column) and ``mouse`` /
    ``day`` / ``organ`` taken from the first three '-'-separated parts of the
    sample name.
    """
    counts_path = Path(results_dir) / f'{sample}/{sample}_counts_{exp}.txt'
    wide = pd.read_table(counts_path, sep=" ").assign(exp=exp)

    annotation_cols = ['barcode', 'Position', 'Element', 'Strand', 'Feature', 'ShortName', 'exp']
    long = (wide.reset_index()
            .rename({'index': 'barcode'}, axis=1)
            .melt(id_vars=annotation_cols))

    # Normalise every count by the total of the column (sample) it came from.
    column_totals = long.groupby('variable')['value'].transform('sum')
    long['proportion'] = long['value'] / column_totals

    name_parts = long['variable'].str.split('-', expand=True)
    for position, field in enumerate(['mouse', 'day', 'organ']):
        long[field] = name_parts[position]

    return long.rename({'variable': 'sample', 'value': 'cnts'}, axis=1)


def load_sample(sample, meta_dir = meta_dir, results_dir = results_dir):
    """Load the count tables of every experiment listed for one DNA id.

    Parameters
    ----------
    sample : DNA id (e.g. 'dnaid2023'); selects ``<meta_dir>/<sample>_metadata.txt``
        and is stored in the ``dnaid`` column of the result.
    meta_dir : directory containing the metadata files.
    results_dir : directory containing the count tables; forwarded to
        ``read_count_files``.

    Returns
    -------
    DataFrame: the long-format tables of all experiments concatenated, with a
    ``dnaid`` column added.
    """
    meta = (pd.read_table(Path(meta_dir)/f'{sample}_metadata.txt', header=None,
                        names = ['DN', 'lib', 'exp', 'DN2', 'sample', 'day', 'organ'])
            .drop(['DN', 'DN2'], axis=1))
    exps = meta.exp.unique()
    # Pass results_dir through: previously it was dropped here, so a
    # caller-supplied directory was silently replaced by the default.
    dfs = [read_count_files(sample, exp, results_dir) for exp in exps]
    fdf = pd.concat(dfs).assign(dnaid=sample)
    return fdf


def load_samples(samples, meta_dir=meta_dir, results_dir=results_dir):
    """Load several DNA ids with ``load_sample`` and stack the results into one frame."""
    per_dnaid = []
    for dnaid in samples:
        per_dnaid.append(load_sample(dnaid, meta_dir, results_dir))
    return pd.concat(per_dnaid)


# Load all the count data for specified dnaids. 

fdf = load_samples(['dnaid2023','dnaid2024'])

# Feature -> ShortName lookup, one row per distinct (Feature, ShortName) pair.
gene_info = fdf[['Feature', 'ShortName']].drop_duplicates().set_index('Feature')

In [None]:
fdf.head()

In [None]:
controls.head()

## Looking for Outliers <a name='outliers' />

Go Back to the <a href=#top>Beginning</a> 

Go to the <a href=#DEseq>DESeq Analysis</a>

In [None]:
def calculate_correlation(df, groupby, v1, v2):
    """Correlation between columns ``v1`` and ``v2`` within each group.

    Parameters
    ----------
    df : DataFrame containing ``groupby``, ``v1`` and ``v2``.
    groupby : name of the single grouping column (the code relies on the
        un-named second index level being exposed as 'level_1').
    v1, v2 : names of the two columns to correlate.

    Returns
    -------
    DataFrame with the grouping column and a column named ``v2`` that holds
    corr(v1, v2) for that group. The row labels are those left over from the
    reset pairwise table (not renumbered).
    """
    pairwise = df.groupby(groupby)[[v1, v2]].corr().reset_index()
    # Each group contributes one row per variable; the v1 row carries corr(v1, v2)
    # in its v2 column.
    v1_rows = pairwise['level_1'] == v1
    return pairwise[v1_rows].drop(['level_1', v1], axis=1)




# Attach the control table to the counts and drop the annotation columns.
control_cnts = (
    controls
    .merge(fdf, left_on='barcode', right_on='barcode')
    .drop(['DN', 'Position', 'Element', 'Strand', 'Feature', 'ShortName'], axis=1)
)

# Wild-type controls only, with log10-transformed concentration and counts.
wt_cnts = control_cnts[control_cnts.phenotype == 'wt'].copy()
wt_cnts = wt_cnts.assign(
    logConc=np.log10(wt_cnts['conc']),
    logCnts=np.log10(wt_cnts['cnts'].replace({0:1})),   # zero counts treated as 1
    logCnts_no0=np.log10(wt_cnts['cnts']),              # zero counts left as-is (-> -inf)
)

# Per-sample correlation of counts against the known concentration.
original_corr = calculate_correlation(wt_cnts, 'sample', 'conc', 'cnts')
log_corr_n0 = calculate_correlation(wt_cnts, 'sample', 'logConc', 'logCnts_no0').set_index('sample')
log_corr = (
    calculate_correlation(wt_cnts, 'sample', 'logConc', 'logCnts')
    .set_index('sample')
    .rename(columns={'logCnts': 'R'})
    .assign(R2=lambda corr: corr['R']**2)
)
wt_cnts[(wt_cnts.mouse == 'inoculum') & (wt_cnts.exp == 'TV5490A')]

In [None]:
fdf[(fdf.dnaid == 'dnaid2023')& (fdf.exp == 'TV5490A') & (fdf.mouse == 'inoculum')].cnts.max()

In [None]:
# NOTE(review): highlight_low is only defined in a later cell of this notebook;
# on a fresh kernel this cell fails unless that cell has been run first.
log_corr_n0.style.apply(highlight_low, cutoff=0.85)

In [None]:
wt_cnts_sub = wt_cnts[(wt_cnts.exp=='TV5490A') & (wt_cnts.dnaid == 'dnaid2023')& (wt_cnts.day != 'd0')]
wt_cnts_sub_mice = wt_cnts_sub[wt_cnts_sub.mouse.isin(['al965', 'al966', 'al967', 'al968'])]
wt_cnts_sub.head()

In [None]:
p9.options.figure_size = (10, 15)
(p9.ggplot(wt_cnts_sub_mice, p9.aes(x='conc', y='cnts'))
 + p9.geom_point()
 +p9.geom_smooth(method = "lm")
 + p9.theme_classic()
 +p9.theme(text = p9.element_text( size = 20))
 + p9.ylab("Count")
 + p9.xlab("Conc")
 + p9.scale_y_log10()
 + p9.scale_x_log10()
 + p9.facet_grid('mouse ~day'))

In [None]:
wt_cnts_sub_mice[(wt_cnts_sub_mice.mouse == 'al965') & (wt_cnts_sub_mice.day == 'd1')][['cnts', 'conc']].sort_values('conc')

In [None]:
test = wt_cnts_sub_mice[(wt_cnts_sub_mice.mouse == 'al967') & (wt_cnts_sub_mice.day == 'd2')][['logCnts', 'logConc']].sort_values('logConc')

In [None]:
#without 0
# Uses np.inf rather than math.inf: `math` is only imported in a later cell,
# so the original line raised NameError when the notebook was run top to bottom.
test = test.replace({0:-np.inf})#.dropna()
test.corr()

In [None]:
# with 0
test

In [None]:
import math


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


def get_rsq_and_mse(df):
    """Fit ``logConc ~ logCnts`` by ordinary least squares and score the fit.

    Parameters
    ----------
    df : DataFrame with ``logCnts`` (predictor) and ``logConc`` (response) columns.

    Returns
    -------
    (r_sq, mse) : the model's ``score`` on the fitted data and the mean squared
    error of its predictions on the same data.
    """
    predictor = df.logCnts.values.reshape((-1,1))   # single feature as a column vector
    response = df.logConc.values

    fitted = LinearRegression()
    fitted.fit(predictor, response)

    return (fitted.score(predictor, response),
            mean_squared_error(response, fitted.predict(predictor)))


In [None]:
# Fit the count/concentration regression separately for every (mouse, day) group;
# rows of erdf are labelled "<mouse>-<day>".
grouped = list(wt_cnts.groupby(['mouse', 'day']))
labels = ['-'.join(key) for key, _ in grouped]
lm_res = [get_rsq_and_mse(group) for _, group in grouped]
erdf = pd.DataFrame(lm_res, index=labels, columns=['r2', 'mse'])

In [None]:
erdf[erdf.r2>0.6].shape

In [None]:
erdf[erdf.mse < 0.2].shape

In [None]:
erdf.mse.hist(bins=20)

In [None]:
oc_df.head()

In [None]:
p9.options.figure_size = (5, 5)
(p9.ggplot(oc_df, p9.aes(x='inoculum-d0-inoculum', y='al966-d1-feces'))
 + p9.geom_point()
 +p9.geom_smooth(method = "lm")
 + p9.theme_classic()
 +p9.theme(text = p9.element_text( size = 18))
 + p9.ylab("AL966/Day1")
 + p9.xlab("Inoculum")
 + p9.scale_y_log10()
 + p9.scale_x_log10())
 #+ p9.facet_grid('mouse ~day'))

In [None]:
p9.options.figure_size = (5, 5)
(p9.ggplot(oc_df, p9.aes(x='inoculum-d0-inoculum', y='al967-d2-feces'))
 + p9.geom_point()
 +p9.geom_smooth(method = "lm")
 + p9.theme_classic()
 +p9.theme(text = p9.element_text( size = 18))
 + p9.ylab("AL967/Day2")
 + p9.xlab("Inoculum")
 + p9.scale_y_log10()
 + p9.scale_x_log10())
 #+ p9.facet_grid('mouse ~day'))

In [None]:
?sns.clustermap

In [None]:
# Counts for TV5490A / dnaid2023 with the day-1 samples excluded, one column per sample.
hi_df = (fdf[(fdf.exp == 'TV5490A') & (fdf.dnaid == 'dnaid2023')& (fdf.day != 'd1')]
        .pivot(index='barcode', columns='sample', values='cnts'))
# Keep barcodes with more than 1000 counts in both inocula, then turn every
# sample column into proportions of its own total.
hi_df = hi_df[(hi_df['inoculum-d0-inoculum'] > 1000) & (hi_df['unenriched_inoculum-d0-inoculum'] > 1000)]
hi_df = hi_df/hi_df.sum()

# The colormap was only created in a later cell, so this cell raised NameError
# on a fresh kernel; define it here as well.
mmap = sns.color_palette("Blues", as_cmap=True)

g = sns.clustermap(hi_df.corr(), linewidths=0.5, linecolor='black',figsize=(25,25),vmin=0.6, vmax=1, cmap = mmap,
             cbar_kws={'label': 'Correlation Coefficient'})

In [None]:
oc_df = (fdf[(fdf.exp == 'TV5490A') & (fdf.dnaid == 'dnaid2023') & (fdf.day != 'd1')]
        .pivot(index='barcode', columns='sample', values='proportion'))

mmap = sns.color_palette("Blues", as_cmap=True)

sns.set_style("white")

sns.set_context("notebook", font_scale=3)
g = sns.clustermap(oc_df.corr(), linewidths=0.5, linecolor='black',figsize=(25,25),vmin=0.6, vmax=1, cmap = mmap,
             cbar_kws={'label': 'Correlation Coefficient'})


In [None]:
oc_df = (fdf[(fdf.exp == 'TV5490A') & (fdf.dnaid == 'dnaid2023')]
        .pivot(index='barcode', columns='sample', values='proportion'))

mmap = sns.color_palette("Blues", as_cmap=True)

sns.set_style("white")

sns.set_context("notebook", font_scale=2)
g = sns.clustermap(oc_df.corr(), linewidths=0.5, linecolor='black',figsize=(25,25),vmin=0.6, vmax=1, cmap = mmap,
            cbar_kws={'label': 'Correlation Coefficient'})

In [None]:
def highlight_low(s, cutoff):
    """Styler helper: CSS for each value of ``s``, highlighting those below ``cutoff``.

    Returns a list with 'background-color: #ffca35' where the value is below the
    cutoff and an empty string everywhere else.
    """
    styles = []
    for below_cutoff in s < cutoff:
        styles.append('background-color: #ffca35' if below_cutoff else '')
    return styles

In [None]:

# Per-sample correlations recomputed on wt_cnts_sub only.
original_corr = calculate_correlation(wt_cnts_sub, 'sample', 'conc', 'cnts').set_index('sample')
log_corr = calculate_correlation(wt_cnts_sub, 'sample', 'logConc', 'logCnts').set_index('sample')
# Column means of the sample-by-sample correlation matrix (diagonal included).
oc = pd.DataFrame(oc_df.corr().mean(), columns = ["Mean Overall Correlation"])

# NOTE(review): the concat below uses log_corr_n0 (computed in an earlier cell on
# all of wt_cnts, with zero counts not replaced), not the log_corr recomputed
# just above — confirm this is intended.
final_corr = pd.concat([original_corr,  log_corr_n0, oc], axis=1).dropna()
final_corr.columns = ['Original_R',  'R_log10', 'Overall_R']

In [None]:
(final_corr.style
 .apply(highlight_low, cutoff=0.8, subset = ['Original_R'])
 .apply(highlight_low, cutoff=0.9, subset=['R_log10'])
 .apply(highlight_low, cutoff=0.7, subset=['Overall_R']))

In [None]:
final_corr

In [None]:
# The original line began with `.style` and no object, which is a SyntaxError.
# NOTE(review): assumes the intended frame is `oc` (the only table whose values
# are compared against 0.7 elsewhere) — confirm. `x` is kept as a DataFrame so
# that the `x.merge(...)` call in the next cell works; the styled view is shown
# as this cell's output.
x = oc
x.style.apply(highlight_low, cutoff=0.7)

In [None]:
x.merge(original_corr, left_index=True, right_index=True)

In [None]:
cdf2 = fdf[(fdf.exp == 'TV5490B') & (fdf.dnaid == 'dnaid2023')]
cdf2 = cdf2.pivot(index='barcode', columns='sample', values='proportion')
cdf2.head()

In [None]:
sns.set_style("white")
sns.set_context("notebook", font_scale=2.0)
g = sns.clustermap(cdf2.corr(), linewidths=0.5, linecolor='black',figsize=(15,15),vmin=0.6, vmax=1,
            cbar_kws={'label': 'Correlation Coefficient'})

In [None]:
cdf3 = fdf[(fdf.exp == 'TV5490C') & (fdf.dnaid == 'dnaid2023')]
cdf3 = cdf3.pivot(index='barcode', columns='sample', values='proportion')
cdf3.head()

In [None]:
sns.set_style("white")
sns.set_context("notebook", font_scale=2.0)
g = sns.clustermap(cdf3.corr(), linewidths=0.5, linecolor='black',figsize=(15,15),vmin=0.6, vmax=1,
            cbar_kws={'label': 'Correlation Coefficient'})

In [None]:
#Inoculum

inc = fdf[fdf.organ == 'inoculum']
inc.groupby(['exp', 'mouse']).barcode.nunique()

In [None]:
inc0 = inc[inc.cnts > 0]
inc0.groupby(['exp', 'mouse']).barcode.nunique()

In [None]:
cond1 = fdf[(fdf.mouse =='unenriched_inoculum') & (fdf.cnts > 1000)].barcode.values
cond2 = fdf[(fdf.mouse =='inoculum') & (fdf.cnts > 1000)].barcode.values

In [None]:
actualbc = set(cond1).intersection(set(cond2))
ndf = fdf[fdf.barcode.isin(actualbc)]

In [None]:
plt.figure(figsize=(15,6))
fdf[(fdf.exp == 'TV5490A') & (fdf.mouse == 'inoculum')].cnts.hist(bins=50)

In [None]:
filt_df = (fdf[(fdf.exp == 'TV5490A') & (fdf.dnaid == 'dnaid2023')]
        .pivot(index='barcode', columns='sample', values='cnts'))
filt0 = filt_df[(filt_df['inoculum-d0-inoculum'] >0)& (filt_df['unenriched_inoculum-d0-inoculum'] > 0)]
filt1000 = filt_df[(filt_df['inoculum-d0-inoculum'] >1000)& (filt_df['unenriched_inoculum-d0-inoculum'] > 1000)]

In [None]:
plt.figure(figsize=(8, 6))
filt_df['inoculum-d0-inoculum'].hist(bins=50)
plt.xlabel('Barcode Counts')
plt.ylabel("Frequency")
plt.title(f"Total: {filt_df.shape[0]} barcodes")

In [None]:
plt.figure(figsize=(8, 6))
filt0['inoculum-d0-inoculum'].hist(bins=50)
plt.xlabel('Barcode Counts')
plt.ylabel("Frequency")
plt.title(f"Total: {filt0.shape[0]} barcodes")

In [None]:
plt.figure(figsize=(8, 6))
filt1000['inoculum-d0-inoculum'].hist(bins=50)
plt.xlabel('Barcode Counts')
plt.ylabel("Frequency")
plt.title(f"Total: {filt1000.shape[0]} barcodes")

In [None]:
filt0.min()

In [None]:
pd.DataFrame(fdf.groupby(['exp']).barcode.nunique()).reset_index()

In [None]:
pd.DataFrame(ndf.groupby(['exp']).barcode.nunique()).reset_index()

In [None]:
pd.DataFrame(ndf.groupby(['exp']).barcode.nunique()).reset_index()

In [None]:
import skmisc
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA


"""
Plotting PCA elipses:
__author__:
"""

def plot_point_cov(points, nstd=2, ax=None, **kwargs):
    """
    Draw an `nstd`-sigma ellipse around a 2-D point cloud.

    The ellipse is centred on the column-wise mean of `points` and shaped by
    their covariance; the drawing itself is delegated to `plot_cov_ellipse`.
    Parameters
    ----------
        points : An Nx2 array of the data points.
        nstd : The radius of the ellipse in numbers of standard deviations.
            Defaults to 2 standard deviations.
        ax : The axis that the ellipse will be plotted on. Defaults to the
            current axis.
        Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
        A matplotlib ellipse artist
    """
    center = points.mean(axis=0)
    covariance = np.cov(points, rowvar=False)   # columns are the variables
    return plot_cov_ellipse(covariance, center, nstd=nstd, ax=ax, **kwargs)


def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Add an `nstd`-sigma error ellipse for the covariance matrix `cov` to an axis.

    Parameters
    ----------
        cov : The 2x2 covariance matrix to base the ellipse on
        pos : The location of the center of the ellipse. Expects a 2-element
            sequence of [x0, y0].
        nstd : The radius of the ellipse in numbers of standard deviations.
            Defaults to 2 standard deviations.
        ax : The axis that the ellipse will be plotted on. Defaults to the
            current axis.
        Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
        A matplotlib ellipse artist
    """
    target_ax = plt.gca() if ax is None else ax

    # Eigen-decomposition, reordered so the largest eigenvalue comes first.
    eigvals, eigvecs = np.linalg.eigh(cov)
    descending = eigvals.argsort()[::-1]
    eigvals = eigvals[descending]
    eigvecs = eigvecs[:, descending]

    # Orientation of the leading eigenvector, in degrees.
    lead_x, lead_y = eigvecs[:, 0]
    theta = np.degrees(np.arctan2(lead_y, lead_x))

    # Width and height are full extents (2 * nstd standard deviations), not radii.
    width, height = 2 * nstd * np.sqrt(eigvals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    target_ax.add_artist(ellip)
    return ellip
#____________________________________________________




def plotPCA(pDf, pc1_var, pc2_var, colorby, col, nameby="", el=False):
    """Scatter PC1 against PC2, one colour per value of ``colorby``.

    Parameters
    ----------
    pDf : DataFrame with ``PC1`` and ``PC2`` columns plus the ``colorby`` column
        (and ``nameby`` when given).
    pc1_var, pc2_var : values shown in the axis labels as the percentage for
        each component.
    colorby : column whose distinct values define the groups.
    col : sequence of colours, at least one per group (asserted).
    nameby : optional column whose values are written next to each point.
    el : when True, draw a 2-sigma covariance ellipse around each group.

    Returns
    -------
    The matplotlib figure.
    """
    sns.set_style("ticks")
    sns.set_context("notebook", font_scale=2.2)
    group = pDf[colorby].unique()
    assert len(group) <= len(col)
    fig = plt.figure(figsize=(8, 8))
    for g, c in zip(group, col):
        df = pDf[pDf[colorby] == g]
        # 1-D coordinate arrays: the previous Nx1 arrays forced float() on
        # 1-element arrays, which NumPy has deprecated.
        x, y = df["PC1"].values, df["PC2"].values
        plt.scatter(x, y, c=c, s=150, label=g)
        if el:
            pts = np.column_stack([x, y]).astype(float)
            plot_point_cov(pts, nstd=2, alpha=0.1, color=c)
        if nameby:
            for label, pc1, pc2 in zip(df[nameby], x, y):
                plt.annotate(label, xy=(pc1, pc2), xytext=(-5, 7), textcoords="offset points",fontsize=14)
    # Axis labels and legend only need to be set once, after all groups are drawn.
    plt.xlabel('Principal Component 1, {} %'.format(pc1_var), )
    plt.ylabel('Principal Component 2, {} %'.format(pc2_var), )
    plt.legend(frameon=True)
    return fig

def find_pc1_pc2(df, meta):
    """Project the columns of ``df`` onto their first two principal components.

    ``df`` is transposed before fitting, so each of its columns becomes one
    PCA observation.

    Returns
    -------
    (pDf2, pc1_var, pc2_var) : the PC1/PC2 scores joined to ``meta`` on the
    index, and the explained-variance ratio of each component as a percentage
    rounded to two decimals.
    """
    observations = df.T
    pca = PCA(n_components=2)
    scores = pca.fit_transform(observations)

    pDf = pd.DataFrame(data=scores, columns=['PC1', 'PC2']).set_index(observations.index)
    pc1_var, pc2_var = (round(ratio * 100, 2) for ratio in pca.explained_variance_ratio_[:2])

    return pDf.merge(meta, left_index=True, right_index=True), pc1_var, pc2_var

In [None]:
# `exp1` was never defined at notebook level (it only exists as a local inside
# analyze_experiment), so this cell raised NameError. Build it with the same
# experiment / dnaid filter that is used for oc_df, the frame fed to the PCA.
exp1 = fdf[(fdf.exp == 'TV5490A') & (fdf.dnaid == 'dnaid2023')]
meta = exp1[['sample', 'mouse', 'day', 'organ']].drop_duplicates().set_index('sample')


In [None]:
pDF, pc1, pc2, = find_pc1_pc2(oc_df, meta)

In [None]:

plotPCA(pDF, pc1, pc2, colorby='day', nameby='mouse', col=['blue', 'green', 'red']);

In [None]:
ndf.groupby(['exp', 'mouse']).barcode.nunique()

## Tracking genes and barcodes <a name='timecourse' />

Go Back to the <a href=#top>Beginning</a> 

Go to the <a href=#DEseq>DESeq Analysis</a>

In [None]:
#dcuB
l1 = (fdf.ShortName == 'dcuB') & (fdf.exp == 'TV5490A') & (fdf.mouse == 'inoculum')
l2 = (fdf.ShortName == 'dcuB') & (fdf.exp == 'TV5490B')& (fdf.mouse == 'inoculum')
l3 = (fdf.ShortName == 'dcuB') & (fdf.exp == 'TV5490C')& (fdf.mouse == 'inoculum')

In [None]:
fdf['ShortName'].notnull() & fdf['ShortName'].str.contains('hyb')


In [None]:
test = fdf[fdf['ShortName'].notnull() & fdf['ShortName'].str.contains('hybG')
]

In [None]:
test.barcode.unique()

In [None]:
fdf[(fdf.barcode=='CGGCGACAACTGACACC') & (fdf.day =='d1')]

In [None]:
def get_deseq_dataset(fdf, exp):
    # Placeholder: both arguments are currently ignored and None is returned.
    return None



In [None]:
p9.options.figure_size = (15, 10)
(p9.ggplot(test, p9.aes(x='day', y='proportion', color='mouse',shape='exp', group='mouse'))
 + p9.geom_point(size=6)
 + p9.geom_line()
 + p9.theme_classic()
#  + p9.ylab("Count")
#  + p9.xlab("Conc")
  + p9.scale_y_log10()
 + p9.facet_wrap("~barcode")
)

In [None]:
p9.options.figure_size = (15, 15)
(p9.ggplot(test, p9.aes(x='day', y='proportion', color='barcode', shape='exp'))
 + p9.geom_point(size=6)
 #+ p9.geom_path(p9.aes(color=test['barcode']))
 + p9.theme_classic()
#  + p9.ylab("Count")
#  + p9.xlab("Conc")
  + p9.scale_y_log10()
 + p9.facet_wrap("~mouse")
)

In [None]:
fdf[l2]

In [None]:
fdf[l3]

In [None]:
f[f.barcode =='AAACGTAACATAAGCCA']

In [None]:
f.tail().merge(s[['lib', 'exp']].drop_duplicates(),  left_on='exp', right_on='exp')

## DESeq Analysis <a name='DEseq' />

Go Back to the <a href=#top>Beginning</a>

In [None]:
# fdf is the dataframe with all the data from dnaid2023 and dnaid2024
fdf.head()

# Outline of Analysis

-  Analysis is done for each dnaid and each exp within dnaid independently.

### Step 1: subset to dnaid and exp.

### Step 2: Look at the controls:
    - This could be correlation of wt as before
    - or correlation between samples, 
    - or some combination of both
    
    The result needs to be a list of sample ids to keep, ex. [al964-d4-feces.... ]
    
### Step 3: DeSeq Analysis

    - Create datasets for analysis, specifically edf and sdf. 
    - Pivot, and filter -> make inflexible for now, fix later
    - Keep only samples from previous step
    
    - Calculate fitness
    - Visualize fitness for each gene as 2 overlapping histograms
    
    - Calculate z-scores
    

In [None]:
def subset_experiment(df, query_string):
    """Return the rows of ``df`` selected by a ``DataFrame.query`` expression.

    example query string : '(exp=="TV5490A") & (dnaid == "dnaid2023")'
    """
    selected = df.query(query_string)
    return selected



def calculate_correlation(df, groupby, v1, v2):
    """Correlation between columns ``v1`` and ``v2`` within each ``groupby`` group.

    Returns a frame holding the grouping column and a column named ``v2`` with
    corr(v1, v2) for that group.

    NOTE(review): this repeats, line for line, the definition given in the
    outlier section earlier in this notebook; running this cell simply
    re-binds the same function.
    """
    corr_df  = df.groupby(groupby)[[v1, v2]].corr()
    corr_df = corr_df.reset_index()
    # Keep the v1 row of every per-group 2x2 matrix; its v2 entry is corr(v1, v2).
    corr_df = corr_df[corr_df['level_1'] == v1].drop(['level_1',v1], axis=1)
    return corr_df


def good_mice(df, controls, cutoff):
    """Select the samples whose wild-type control counts track the control concentrations.

    Parameters
    ----------
    df : long count table (must contain ``barcode``, ``sample``, ``cnts`` and the
        annotation columns dropped below).
    controls : control table with ``barcode``, ``phenotype``, ``conc`` and ``DN``.
    cutoff : samples are kept when their correlation is strictly above this value.

    Returns
    -------
    Index of sample names whose per-sample correlation between log10(conc) and
    log10(counts) (zero counts treated as 1) exceeds ``cutoff``.
    """
    merged = (controls
              .merge(df, left_on='barcode', right_on = 'barcode')
              .drop(['DN','Position', 'Element', 'Strand', 'Feature', 'ShortName'], axis=1))

    wt = merged[merged.phenotype == 'wt'].copy()
    wt['logConc'] = np.log10(wt['conc'])
    wt['logCnts'] = np.log10(wt['cnts'].replace({0:1}))

    per_sample_r = calculate_correlation(wt, 'sample', 'logConc', 'logCnts').set_index('sample')
    per_sample_r.columns = ['R']
    passing = per_sample_r.R > cutoff
    return per_sample_r[passing].index

def generate_DE_dataset(df, samples_to_keep, to_filter = 0): # Assumes already subset to 1 experiment, 1 dnaid
    """Build the design table and count matrix handed to the DESeq2 step.

    Parameters
    ----------
    df : long count table for a single experiment / dnaid with ``sample``,
        ``mouse``, ``day``, ``organ``, ``dnaid``, ``barcode`` and ``cnts``.
        Must contain the samples 'inoculum-d0-inoculum' and
        'unenriched_inoculum-d0-inoculum'.
    samples_to_keep : sample names to retain.
    to_filter : barcodes are kept only when their count is >= this value in
        both inoculum samples.

    Returns
    -------
    (sample_data, expr_data) : sample metadata indexed by sample name, and the
    barcode x sample count matrix (``barcode`` as the first column) restricted
    to the same samples in the same order.
    """
    # De-duplicate while `sample` is still a column. Doing it after set_index
    # compared only the metadata values, so two distinct samples with identical
    # mouse/day/organ/dnaid were collapsed into one.
    sample_data = (df[['sample', 'mouse', 'day', 'organ', 'dnaid']]
                   .drop_duplicates()
                   .set_index('sample'))
    sample_data = sample_data.loc[sample_data.index.intersection(samples_to_keep)]
    expr_data = df.pivot(index='barcode', columns='sample', values='cnts')

    in_both_inocula = ((expr_data['inoculum-d0-inoculum'] >= to_filter)
                       & (expr_data['unenriched_inoculum-d0-inoculum'] >= to_filter))
    expr_data = expr_data[in_both_inocula]
    # Column order follows sample_data so design rows and count columns line up.
    expr_data = expr_data[list(sample_data.index)].reset_index()
    return sample_data, expr_data


def calculate_fitness(edf, sdf):
    """Run DESeq2 with the design ``~ day`` and collect each day's contrast against d0.

    Parameters
    ----------
    edf : count matrix with a ``barcode`` column (used as the gene id column).
    sdf : design table with a ``day`` column; 'd0' must be among its values,
        otherwise ``days.remove('d0')`` raises ValueError.

    Returns
    -------
    The per-day result tables concatenated, each with a ``day`` column added.
    NOTE(review): the result columns and index come from py_DESeq2 and are not
    visible here — presumably indexed by barcode; verify.
    """
    dds = py_DESeq2(count_matrix = edf,
                   design_matrix = sdf,
                   design_formula = '~ day',
                   gene_column = 'barcode') # <- telling DESeq2 this should be the gene ID column
    
    dds.run_deseq() 
    # Every day other than the d0 baseline gets its own contrast.
    days = list(sdf['day'].unique())
    days.remove('d0')
    all_results = []
    for d in days:
        # The result is read back from the attribute on dds after each call.
        dds.get_deseq_result(contrast =['day', d, 'd0'])
        res = dds.deseq_result
        res['day'] = d
        all_results.append(res)
    return pd.concat(all_results)



def calculate_2dist_zscore(u1, s1, u2, s2):
    """Difference of two means divided by the root of the summed squared spreads.

    Computes (u1 - u2) / sqrt(s1**2 + s2**2).
    """
    mean_gap = u1 - u2
    combined_spread = np.sqrt((s1**2) + (s2**2))
    return mean_gap / combined_spread


def calculte_comparisons(fitness, df, controls, cntrl_type = 'wt'):
    """Compare every gene (and every unannotated barcode) with the control barcodes, per day.

    Parameters
    ----------
    fitness : table indexed by barcode with ``day``, ``log2FoldChange``,
        ``lfcSE`` and ``baseMean`` columns.
    df : long count table with ``Feature`` and ``barcode`` columns; barcodes
        whose Feature is '-' are scored individually instead of per gene.
    controls : control table with ``phenotype`` and ``barcode`` columns.
    cntrl_type : phenotype value that selects the control barcodes.

    Returns
    -------
    DataFrame indexed by gene / barcode with, for every day, the columns
    ``<day>_zscore``, ``<day>_num_bc``, ``<day>_meanExpr``, ``<day>_std`` and
    ``<day>_ci``.
    """
    genes = set(df.Feature.values)
    # discard, not remove: a subset without any '-' feature used to raise KeyError.
    genes.discard('-')
    other_barcodes = set(df[df.Feature == '-'].barcode.values)
    control_barcodes = set(controls[controls.phenotype==cntrl_type].barcode.values)
    # Barcodes of every feature, looked up once instead of re-filtering df per gene and day.
    barcodes_by_gene = df.groupby('Feature')['barcode'].unique().to_dict()

    all_comps = []
    for day in fitness.day.unique():
        # Subset once per day. Membership masks on this subset also avoid the
        # KeyError that label lookups raised when a barcode had results for a
        # different day only.
        day_fits = fitness[fitness.day == day]

        control_fits = day_fits[day_fits.index.isin(control_barcodes)]
        control_mu = control_fits.log2FoldChange.mean()
        # Spread of the mean: sqrt(sum of squared lfcSE) / number of barcodes.
        control_sigma = np.sqrt(control_fits.lfcSE.pow(2).sum())/control_fits.shape[0]

        gene_comps = {}
        for gene in genes:
            gene_fits = day_fits[day_fits.index.isin(barcodes_by_gene.get(gene, ()))]
            if gene_fits.shape[0] == 0:
                continue
            gene_mu = gene_fits.log2FoldChange.mean()
            gene_sigma = np.sqrt(gene_fits.lfcSE.pow(2).sum())/gene_fits.shape[0]
            zscore = calculate_2dist_zscore(gene_mu, gene_sigma, control_mu, control_sigma)
            ci = 2**gene_mu/2**control_mu
            num_bc = gene_fits.shape[0]
            meanExp = gene_fits.baseMean.mean()
            std = np.std(gene_fits.baseMean)
            gene_comps[gene] = [zscore, num_bc, meanExp, std, ci]

        for barcode in other_barcodes:
            other_fit = day_fits[day_fits.index == barcode]
            if other_fit.empty:
                continue
            lfc = other_fit.log2FoldChange.values[0]
            zscore = calculate_2dist_zscore(lfc, other_fit.lfcSE.values[0], control_mu, control_sigma)
            ci = 2**lfc/2**control_mu
            # A single barcode: count of 1 and a std of 0.
            gene_comps[barcode] = [zscore, 1, other_fit.baseMean.values[0], 0, ci]

        comp_df = pd.DataFrame(gene_comps, index =[day+'_zscore', day+'_num_bc', day+"_meanExpr", day+"_std", day+"_ci"]).T

        all_comps.append(comp_df)
    return pd.concat(all_comps, axis=1)


def comp_stats(comp):
    """Two-sided normal p-values and BH-adjusted p-values for a column of z-scores.

    Parameters
    ----------
    comp : named Series of z-scores.

    Returns
    -------
    DataFrame with the original column (name unchanged) plus ``<name>_pval``
    and ``<name>_padj``.
    """
    pvalues = 2*norm.cdf(-np.abs(comp), 0, 1)
    # `stats` is the R stats package loaded through rpy2 at the top of the notebook.
    p_adjust = list(stats.p_adjust(FloatVector(pvalues), method = 'BH'))
    s = pd.DataFrame(comp)
    s[comp.name+'_pval'] = pvalues
    s[comp.name+'_padj'] = p_adjust
    # A trailing `s.rename({name: name + "_zscore"})` was removed: without
    # `columns=` it targeted the row index and changed nothing. The column keeps
    # its incoming name, which is what the cells reading `d1_zscore` /
    # `d1_zscore_padj` rely on.
    return s



def analyze_experiment(fdf, query, cutoff, controls, to_filter=1000, cntrl_type = 'wt'):
    """Run the whole fitness pipeline for one experiment.

    Steps: subset ``fdf`` with ``query``; keep samples passing ``good_mice`` at
    ``cutoff``; build the DESeq2 inputs (inoculum count threshold ``to_filter``);
    compute fitness; compare every gene with the ``cntrl_type`` controls; attach
    p-values to every z-score column.

    Returns
    -------
    (fitness_annot, comp_to_wt, stats_table) : fitness joined with the barcode's
    Feature / ShortName, the per-day comparison table, and the z-score columns
    with their p-values and adjusted p-values.
    """
    experiment = subset_experiment(fdf, query)
    kept_samples = good_mice(experiment, controls, cutoff)
    design, count_matrix = generate_DE_dataset(experiment, kept_samples, to_filter)
    fitness = calculate_fitness(count_matrix, design)

    annotations = (experiment[['barcode', 'Feature', 'ShortName']]
                   .drop_duplicates()
                   .set_index('barcode'))
    fitness_annot = fitness.merge(annotations, how='left', left_index=True, right_index = True)

    comp_to_wt = calculte_comparisons(fitness, experiment, controls, cntrl_type)
    zscore_columns = [c for c in comp_to_wt.columns if 'zscore' in c]
    stats_table = pd.concat([comp_stats(comp_to_wt[c]) for c in zscore_columns], axis=1)
    return fitness_annot, comp_to_wt, stats_table






In [None]:
def vis_fitness(fitness, controls, gene, day):
    """Overlay log2 fold-change histograms of the wt control barcodes and one gene's barcodes.

    Parameters
    ----------
    fitness : annotated fitness table indexed by barcode (needs ``day``,
        ``ShortName`` and ``log2FoldChange``).
    controls : control table with ``barcode``, ``phenotype`` and ``DN``.
    gene : ShortName of the gene to show.
    day : value of the ``day`` column to plot.

    Returns
    -------
    (cf, gf) : the control rows and the gene rows that were plotted.
    """
    day_fitness = fitness[fitness.day == day]

    wt_controls = controls[controls.phenotype == 'wt'].set_index('barcode')
    cf = wt_controls.merge(day_fitness, how='left', left_index=True, right_index = True)
    cf = cf.drop(['DN'], axis=1).drop_duplicates()
    gf = day_fitness[day_fitness.ShortName == gene]

    sns.set_style("white")
    sns.set_context("notebook", font_scale=1.5)
    plt.figure(figsize=(8,6))
    histograms = ((cf, 25, 'Control Barcodes'), (gf, 15, f'{gene} Barcodes'))
    for frame, n_bins, legend_label in histograms:
        frame.log2FoldChange.hist(bins=n_bins, label = legend_label)
    plt.legend()
    plt.xlabel('log2FoldChange compared to inoculum')
    return cf, gf
    

In [None]:
cf, gf = vis_fitness(f2, controls, 'hilD', 'd2')



## Reproducing Chris's results

This code produces the same results as the ones produced by Chris. This was done on the count dataframe without any filtering. 

In [None]:
fdf[fdf.exp == 'TV5490A'].groupby('mouse').barcode.nunique()

In [None]:
query = '(exp=="TV5490A") & (dnaid == "dnaid2023")'

In [None]:
#fdf, query, cutoff, controls, to_filter=True, cntrl_type = 'wt'

In [None]:
original_fitness, original_comps, original_results = analyze_experiment(fdf, query, 0, controls, 0, 'wt');

In [None]:
new_fitness, new_comps, new_results = analyze_experiment(fdf, query, 0.85, controls, 1, 'wt');

In [None]:
original_results.head()

In [None]:
cf, gf = vis_fitness(original_fitness, controls, 'malT', 'd1')
#original_fitness[original_fitness.Feature == 'SL1344_0058']

In [None]:
cf

In [None]:
cf, gf = vis_fitness(new_fitness, controls, 'hilD', 'd2')
gf

In [None]:
cf, gf, = vis_fitness(new_fitness, controls, 'sul2', 'd1')

In [None]:
gf

In [None]:
new_results.head()

In [None]:
new_comps.columns

In [None]:
#Day 1 Results
d1_res = new_results.merge(new_comps[['d1_num_bc', 'd1_meanExpr', 'd1_std', 'd1_ci']], left_index=True, right_index=True).drop_duplicates()
#d1_res = d1_res[d1_res.d1_zscore_padj <0.05]
d1_res = d1_res[['d1_zscore', 'd1_zscore_padj', 'd1_ci']].merge(gene_info, how='left', left_index=True, right_index=True).sort_index()

In [None]:
#Day 2 Results
d2_res = new_results.merge(new_comps[['d2_num_bc', 'd2_meanExpr', 'd2_std', 'd2_ci']], left_index=True, right_index=True).drop_duplicates()
#d2_res = d2_res[d2_res.d2_zscore_padj <0.05]
d2_res = d2_res[['d2_zscore', 'd2_zscore_padj', 'd2_ci']].merge(gene_info, how='left', left_index=True, right_index=True).sort_index()

In [None]:
d1_res[d1_res.ShortName == 'frdD']

In [None]:
d2_res[d2_res.ShortName == 'hybD']

In [None]:
d2_res

In [None]:


plt.figure(figsize=(15,10))
d1_sig = d1_res[d1_res.d1_zscore_padj < 0.05]

plt.plot(d1_sig.d1_zscore, d1_sig.d1_ci, 'o', color=sns.color_palette()[0], markersize=12)

for x, y, label in zip(d1_sig.d1_zscore, d1_sig.d1_ci, d1_sig.ShortName):
    # Same rule as the day-2 plot: skip missing names as well as SL1344 locus
    # tags. Previously rows without a ShortName were annotated with "nan".
    if not label or str(label).startswith("SL1344") or str(label) == 'nan':
        continue
    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='left')
plt.ylabel("CI (Mean(Gene Fitness)/ Mean(WT Fitness))")
plt.xlabel("Z-Score")
plt.title("Significant Results Day 1 (p_adj < 0.05)")
plt.yscale('log')


In [None]:
plt.figure(figsize=(30,10))
d2_sig = d2_res[d2_res.d2_zscore_padj < 0.05]

plt.plot(d2_sig.d2_zscore, d2_sig.d2_ci, 'o', color=sns.color_palette()[1], markersize=12)



for x,y,l in zip(d2_sig.d2_zscore,d2_sig.d2_ci, d2_sig.ShortName):

    label = l
    
    if not label or str(label).startswith("SL1344") or str(label) == 'nan':
        continue
    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(-8,8), # distance from text to points (x,y)
                 ha='left') 
plt.ylabel("CI (Mean(Gene Fitness)/ Mean(WT Fitness))")
plt.xlabel("Z-Score")
plt.title("Significant Results Day 2 (p_adj < 0.05)")
plt.yscale("log")
#plt.xscale("log")

In [None]:
d2_sig

In [None]:
new_results[new_results.d2_zscore_padj < 0.05].merge(gene_info, how='left', left_index=True, right_index=True).sort_index()

In [None]:
def counts_overtime(fdf, exp, gene):
    """Plot the counts of every barcode of ``gene`` over the days, one panel per barcode.

    Parameters
    ----------
    fdf : long count table.
    exp : experiment id to select.
    gene : ShortName of the gene.

    Returns
    -------
    A plotnine plot (log10 y axis, one line per mouse, dashed horizontal lines at
    the d0 'inoculum' counts), or the string "<gene> not found" when the gene
    has no barcodes in this experiment.
    """
    gene_counts = fdf[(fdf.ShortName == gene) & (fdf.exp == exp)]
    nbc = gene_counts.barcode.nunique()
    if nbc == 0:
        return f"{gene} not found"
    print(nbc/4)

    is_inoculum = (gene_counts.day == 'd0') & (gene_counts.mouse == 'inoculum')
    inoculum = gene_counts[is_inoculum]
    timecourse = gene_counts[gene_counts.day != 'd0']

    # Figure size: fewer than four barcodes -> a single 5-high strip, 4 wide per
    # barcode; otherwise a fixed width of 16 with height growing with nbc.
    if nbc/4 < 1:
        xdim, ydim = 4*nbc, 5
        print(xdim)
    else:
        xdim, ydim = 16, 5*nbc/4
    p9.options.figure_size = (xdim, ydim)

    base = p9.ggplot(timecourse, p9.aes(x='day', y='cnts', color='mouse', group='mouse'))
    inoculum_lines = p9.geom_hline(inoculum, p9.aes(yintercept = 'cnts', color='dnaid'),
                                   linetype="dashed",  size=1)
    g = (base
         + p9.geom_point(size=6)
         + p9.geom_line()
         + p9.theme_classic()
         + p9.ylab("Counts")
         + p9.xlab("Day")
         + p9.scale_y_log10()
         + inoculum_lines
         + p9.facet_wrap("~barcode"))
    return g

In [None]:
g = counts_overtime(fdf, 'TV5490A', 'dcuR')
g

In [None]:

test = fdf[(fdf.ShortName == 'hilD') & (fdf.exp == 'TV5490A')]
inoculum = test[(test.day == 'd0') & (test.mouse == 'inoculum')]
test = test[test.day != 'd0']
inoculum
p9.options.figure_size = (15, 20)
(p9.ggplot(test, p9.aes(x='day', y='cnts', color='mouse', group='mouse'))
 + p9.geom_point(size=6)
 + p9.geom_line()
 + p9.theme_classic()
#  + p9.ylab("Count")
#  + p9.xlab("Conc")
  + p9.scale_y_log10()
 + p9.geom_hline(inoculum, p9.aes(yintercept = 'cnts', color='dnaid'), linetype="dashed",  size=1)
 + p9.facet_wrap("~barcode")
 
)

In [None]:
expr_data = exp1.pivot(index='barcode', columns='sample', values='cnts')

In [None]:
plt.figure(figsize=(8,6))
expr_data['inoculum-d0-inoculum'].hist(bins=50)

plt.vlines(x=1000, ymin=0, ymax=3000, color='black')
plt.vlines(x=1, ymin=0, ymax=3000, color='orange')

In [None]:
expr_data['inoculum-d0-inoculum'].mean()

In [None]:
expr_data[expr_data['inoculum-d0-inoculum'] >1000].shape

In [None]:
test = fdf[(fdf.ShortName == 'hilD') & (fdf.exp == 'TV5490A')]
inoculum = test[(test.day == 'd0') & (test.mouse == 'inoculum')]
test = test[test.day != 'd0']
inoculum
p9.options.figure_size = (15, 20)
(p9.ggplot(test, p9.aes(x='day', y='cnts', color='barcode', group='barcode'))
 + p9.geom_point(size=6)
 + p9.geom_line()
 + p9.theme_classic()
#  + p9.ylab("Count")
#  + p9.xlab("Conc")
  + p9.scale_y_log10()
 + p9.geom_hline(inoculum, p9.aes(yintercept = 'cnts', color='dnaid'), linetype="dashed",  size=1)
 + p9.facet_wrap("~mouse")
 
)

In [None]:
#plt.figure()
#original_comps.d1_num_bc.hist(bins=50)
plt.figure()
original_comps.d1_meanExpr.hist(bins=50)
plt.xlabel('mean "expression"')
plt.ylabel('frequency')

# Filtering 
- Removing barcodes with 0 counts in the inoculum and removing 2 mice that looked like outliers and re-running the results

In [None]:
#Subsetting so that only looking at barcodes with > 0 counts in both enriched and unenriched inoculum

# Needs to be done on by exp. basis
def remove_null_barcodes(fdf, exp, dnaid):
    """Keep only barcodes detected in both inocula of one experiment.

    Rows are first restricted to ``exp`` / ``dnaid``; a barcode is retained when
    it has a count above zero in both the 'unenriched_inoculum' and the
    'inoculum' mouse entries. All rows of retained barcodes are returned.
    """
    experiment = fdf[(fdf.exp == exp) & (fdf.dnaid == dnaid)].copy()
    detected = experiment[experiment.cnts > 0]
    in_unenriched = set(detected[detected.mouse == 'unenriched_inoculum'].barcode.values)
    in_enriched = set(detected[detected.mouse == 'inoculum'].barcode.values)
    in_both = in_unenriched & in_enriched
    return experiment[experiment.barcode.isin(in_both)]

df0 = remove_null_barcodes(fdf, 'TV5490A', 'dnaid2023')
print(df0.groupby(['exp', 'mouse']).barcode.nunique())

df0 = df0[(df0.mouse!='al965') & (df0.mouse!='al967')]

In [None]:
# The call used an older signature (exp, controls, cntrl_type, dnaid=...) and
# raised TypeError against the current analyze_experiment(fdf, query, cutoff,
# controls, to_filter, cntrl_type).
# NOTE(review): df0 is already restricted to detected barcodes and to the
# retained mice, so cutoff=0 and to_filter=0 are used to mirror the unfiltered
# reference run — confirm these are the intended values.
new_fitness, new_comps, new_results = analyze_experiment(
    df0, '(exp=="TV5490A") & (dnaid == "dnaid2023")', 0, controls, 0, 'wt')

In [None]:
p9.options.figure_size = (10, 8)
(p9.ggplot(new_comps, p9.aes(x='d1_meanExpr'))
 + p9.geom_histogram(bins=50, fill='red', alpha=0.8)
 +p9.geom_histogram(original_comps, p9.aes(x='d1_meanExpr'), bins=50,fill='blue', alpha=0.4)
 + p9.theme_classic()
 + p9.ylab("Count")
 + p9.xlab("meanExpr of barcodes")

)

In [None]:
new_comps.plot( x='d1_zscore', y='d2_zscore',kind='scatter')

In [None]:
# new_results['d1_-logPval'] = -1*np.log(new_results['d1_zscore_padj'])
# c1 = abs(new_results['d1_zscore']) > 2
# c2 = new_results['d1_zscore_padj']< 0.05
# new_results['d1_hit'] = c1 & c2

In [None]:
new_results.head()

In [None]:
# The two plotted columns were only ever created in a commented-out cell, so the
# plot failed on new_results as-is. Build them on a plotting copy instead.
# log10 is used so the values match the "-log10(Pval)" axis label (the
# commented-out version used the natural log); hit = |z| > 2 and padj < 0.05.
volcano_df = new_results.assign(**{
    'd1_-logPval': -np.log10(new_results['d1_zscore_padj']),
    'd1_hit': (new_results['d1_zscore'].abs() > 2) & (new_results['d1_zscore_padj'] < 0.05),
})
(p9.ggplot(volcano_df, p9.aes(x = 'd1_zscore', y = 'd1_-logPval', fill='d1_hit')) +
 p9.geom_point(size = 2, shape = 'o', colour = "grey")+
 p9.labs(x = 'Z-Score', y = '-log10(Pval)')+
 p9.scale_x_continuous(limits = [-12, 6], breaks = range(-12, 6, 2)) + 
 p9.scale_y_continuous(limits = [0, 25], breaks = range(0, 26, 5))+
 p9.ggtitle("Title")+
 p9.theme(
    plot_title = p9.element_text(family = "Arial", size = 11, hjust = 0), # Title size and font.
    
    axis_text = p9.element_text(family = "Arial", size = 10), # Size and font of x and y values.
    axis_title = p9.element_text(family = "Arial", size = 10), # Size and font of x and y axes.
    panel_border = p9.element_rect(colour = "black", fill = np.nan, size = 1), # Black border around the plot area.
    axis_ticks = p9.element_line(colour = "black", size = 1), # Style of x and y ticks.
    legend_position = "none"
  )
)
   #p9.geom_label(p9.aes(label=label), label_size=0.01, nudge_x=2,))#, adjust_text={'expand_points': (1.5, 1.5), 'arrowprops': {'arrowstyle': '-'}}))
  #p9.scale_fill_manual(breaks = ["blue", "red"], values = ["deepskyblue3", "firebrick1"]))



In [None]:
def subset_results(result_df, gene_info = gene_info, cond='d1', cutoff = 0.05, other_condition=False):
    """Subset ``result_df`` and left-join ``gene_info`` onto the subset by index.

    Parameters
    ----------
    result_df
        Results table to subset.
    gene_info
        Table merged onto the subset (``how='left'``, index on index). The default
        is the notebook-level ``gene_info`` as bound when this cell is executed.
    cond
        With ``other_condition=False``: a label such as ``'d1'``. Rows are kept where
        the ``f'{cond}_zscore_padj'`` column is below ``cutoff``, and only columns
        whose name contains ``cond`` as a substring are retained (so ``'d1'`` would
        also match a column such as ``'d10_...'``).
        With ``other_condition=True``: used directly as the indexer in
        ``result_df[cond]``.
    cutoff
        Threshold applied to ``f'{cond}_zscore_padj'``; ignored when
        ``other_condition`` is true.
    other_condition
        Selects between the two behaviours described under ``cond``.

    Returns
    -------
    The selected part of ``result_df`` merged with ``gene_info``.
    """
    if other_condition:
        selected = result_df[cond]
    else:
        passes_cutoff = result_df[f'{cond}_zscore_padj'] < cutoff
        cond_columns = [col for col in result_df if cond in col]
        selected = result_df[passes_cutoff][cond_columns]
    return selected.merge(gene_info, how='left', left_index=True, right_index=True)

In [None]:
# Compare significant results: Day 1
# original_results rows selected by subset_results for 'd1', ordered by index.
or_d1 = subset_results(original_results, cond='d1')
or_d1 = or_d1.sort_index()
or_d1

In [None]:
# new_results rows selected by subset_results for 'd1', ordered by index.
new_d1 = subset_results(new_results, cond='d1')
new_d1 = new_d1.sort_index()
new_d1

In [None]:
# Compare significant results: Day 2
# original_results rows selected by subset_results for 'd2', ordered by index.
or_d2 = subset_results(original_results, cond='d2')
or_d2 = or_d2.sort_index()
or_d2

In [None]:
# new_results rows selected by subset_results for 'd2', ordered by index.
new_d2 = subset_results(new_results, cond='d2')
new_d2 = new_d2.sort_index()
new_d2

In [None]:
# Different Library TV5490B

# Print the mice present for this library. A bare expression in the middle of a
# cell is not displayed (only a cell's last expression is echoed), so print it.
print(fdf[fdf.exp == 'TV5490B'].mouse.unique())
# Need to drop al975


# Build the TV5490B / dnaid2023 frame via remove_null_barcodes and report the
# number of distinct barcodes per (exp, mouse) pair before the exclusion.
dfB = remove_null_barcodes(fdf, 'TV5490B', 'dnaid2023')
print(dfB.groupby(['exp', 'mouse']).barcode.nunique())

# Exclude mouse al975.
dfB = dfB[(dfB.mouse!='al975')]


In [None]:
# Run analyze_experiment for TV5490B on the full fdf (no mouse exclusion applied in this cell);
# results are kept under the or_*B names.
or_fitnessB, or_compsB, or_resultsB = analyze_experiment(fdf, 'TV5490B', controls, 'wt', dnaid='dnaid2023')

In [None]:
# Run the same analyze_experiment call on dfB; results are kept under the new_*B names.
new_fitnessB, new_compsB, new_resultsB = analyze_experiment(dfB, 'TV5490B', controls, 'wt', dnaid='dnaid2023')

In [None]:
# Preview the first rows of or_resultsB.
or_resultsB.head()

In [None]:
# Compare significant results: Day 1
# or_resultsB rows selected by subset_results for 'd1', ordered by index.
# (A previous sort_values(by='d1_zscore_padj') was dropped: sort_index() re-orders
# the frame immediately afterwards, so that sort had no effect on the result.)
or_dB1 = subset_results(or_resultsB, cond='d1').sort_index()
or_dB1

In [None]:
# Compare significant results: Day 1
# new_resultsB rows selected by subset_results for 'd1', ordered by index.
new_dB1 = subset_results(new_resultsB, cond='d1')
new_dB1 = new_dB1.sort_index()
new_dB1

In [None]:
# Compare significant results: Day 2
# or_resultsB rows selected by subset_results for 'd2', ordered by index.
or_dB2 = subset_results(or_resultsB, cond='d2')
or_dB2 = or_dB2.sort_index()
or_dB2

In [None]:
# Compare significant results: Day 2
# new_resultsB rows selected by subset_results for 'd2', ordered by index.
new_dB2 = subset_results(new_resultsB, cond='d2')
new_dB2 = new_dB2.sort_index()
new_dB2

In [None]:
# Histogram of new_comps['d1_num_bc'] on its own figure.
plt.figure()
new_comps['d1_num_bc'].hist(bins=50)
# Histogram of new_comps['d1_meanExpr'] on a second figure.
plt.figure()
new_comps['d1_meanExpr'].hist(bins=50)

In [None]:
# Rows of test3 with d2_zscore_padj below 0.1, sorted ascending by that column,
# with gene_info left-joined on the index.
(
    test3
    .loc[lambda res: res.d2_zscore_padj < 0.1]
    .sort_values(by='d2_zscore_padj')
    .merge(gene_info, how='left', left_index=True, right_index=True)
)

In [None]:
# List the distinct values of the 'sample' column in df0_good_mice.
df0_good_mice['sample'].unique()

In [None]:
# Run analyze_experiment for TV5490A twice with identical settings: once on df0
# (results in fit0/comps0/res0) and once on df0_good_mice (results in fit_gm/comps_gm/res_gm).
fit0, comps0,  res0 = analyze_experiment(df0, 'TV5490A', controls, 'wt', dnaid='dnaid2023')
fit_gm, comps_gm,  res_gm = analyze_experiment(df0_good_mice, 'TV5490A', controls, 'wt', dnaid='dnaid2023')

In [None]:
# Rows of res_gm with d1_zscore_padj below 0.05, sorted ascending by that column,
# with gene_info left-joined on the index.
(
    res_gm
    .loc[lambda res: res.d1_zscore_padj < 0.05]
    .sort_values(by='d1_zscore_padj')
    .merge(gene_info, how='left', left_index=True, right_index=True)
)

In [None]:
# x = fdf[(fdf.exp=='TV5490A') & (fdf.dnaid == 'dnaid2023')]
# x['sampleid'] = x['sample'] + x['dnaid']
# x = x[['sampleid', 'mouse', 'day', 'organ', 'dnaid']].set_index('sampleid').drop_duplicates()

# y = x.day.value_counts()
# y.name = 'num_mice'
# NOTE(review): the only definition of `y` visible here is the commented-out code above,
# so this merge relies on a `y` left in the kernel from elsewhere -- confirm the cell
# survives Restart & Run All.
# Left-join `y` onto fit0, matching fit0's 'day' column against y's index.
fit0.merge(y, how='left', left_on='day', right_index=True)

In [None]:
# Rows of res0 whose d1_padj / d2_padj value is below 0.05.
# NOTE(review): other cells filter analyze_experiment results on '<day>_zscore_padj';
# confirm res0 also carries plain '<day>_padj' columns.
d1_sig = res0.loc[res0.d1_padj < 0.05]
d2_sig = res0.loc[res0.d2_padj < 0.05]
d1_sig

In [None]:
# Day 1 or Day 2 Significant results
# NOTE(review): `r0` is not defined anywhere in the visible part of the notebook, and this
# cell overwrites the d1_sig / d2_sig built from res0 in the previous cell -- confirm whether
# `r0` is a stale name for res0 or a separate result set.
d1_sig = r0.loc[r0.d1_padj < 0.05]
d2_sig = r0.loc[r0.d2_padj < 0.05]
d1_sig

In [None]:
# Annotate res0 with gene_info (left join, index on index).
r0_with_genes = pd.merge(res0, gene_info, how='left', left_index=True, right_index=True)

In [None]:
# Rows of r0_with_genes whose ShortName is 'hilC'.
r0_with_genes.loc[r0_with_genes['ShortName'] == 'hilC']

In [None]:
# Annotate repeat_res with gene_info (left join, index on index).
repeat_res_with_genes = pd.merge(repeat_res, gene_info, how='left', left_index=True, right_index=True)

In [None]:
# Rows of repeat_res_with_genes whose ShortName is 'hilC'.
repeat_res_with_genes.loc[repeat_res_with_genes['ShortName'] == 'hilC']

In [None]:
# Distinct barcodes in df0 with ShortName 'hilD' on day 'd1'.
# NOTE(review): the variable is called `hila` but the filter selects 'hilD' -- confirm
# which gene is intended.
hila = df0.loc[(df0.ShortName == 'hilD') & (df0.day == 'd1'), 'barcode'].unique()

In [None]:
# Display the barcode array collected in `hila`.
hila

In [None]:
# Preview the first rows of fit0.
fit0.head()

In [None]:
# Histogram (100 bins) of 'proportion' for fdf rows where mouse is 'unenriched_inoculum'.
fdf.loc[fdf.mouse == 'unenriched_inoculum', 'proportion'].hist(bins=100)

In [None]:
# Histogram (100 bins) of 'cnts' for df0 rows where mouse is 'inoculum'.
df0.loc[df0.mouse == 'inoculum', 'cnts'].hist(bins=100)

In [None]:
# df0 rows for the barcodes in `hila`, restricted to day 'd1' and exp 'TV5490A'.
df0.loc[df0.barcode.isin(hila) & (df0.day == 'd1') & (df0.exp == 'TV5490A')]

In [None]:
# All df0 rows for one specific barcode.
df0.loc[df0['barcode'] == 'CGGAGAACTCGTCATGG']

In [None]:
# Show d2_sig sorted ascending by 'd2_padj'.
d2_sig.sort_values(by='d2_padj')

In [None]:
# Rows of new_results with d1_padj below 0.05, sorted ascending by d1_padj.
# NOTE(review): subset_results filters new_results on 'd1_zscore_padj'; confirm a plain
# 'd1_padj' column exists as well.
(
    new_results
    .loc[lambda res: res.d1_padj < 0.05]
    .sort_values(by='d1_padj')
)

In [None]:
# Preview the first rows of gene_info.
gene_info.head()

In [None]:
# d1_sig with gene_info left-joined on the index, sorted ascending by d1_padj.
(
    d1_sig
    .merge(gene_info, how='left', left_index=True, right_index=True)
    .sort_values(by='d1_padj')
)

In [None]:
# Rows of new_results with d1_padj below 0.05, sorted ascending by d1_padj,
# then gene_info left-joined on the index.
(
    new_results
    .loc[lambda res: res.d1_padj < 0.05]
    .sort_values(by='d1_padj')
    .merge(gene_info, how='left', left_index=True, right_index=True)
)

In [None]:
# NOTE(review): this rebinds fit0 (assigned earlier in the notebook from analyze_experiment)
# to the first element of no0_results, so what later cells see as fit0 depends on run order.
fit0 = no0_results[0]
fit0.head()

In [None]:
# One row per distinct (barcode, Feature, ShortName, Position) combination in fdf,
# indexed by barcode.
barcode_info = (
    fdf.loc[:, ['barcode', 'Feature', 'ShortName', 'Position']]
    .drop_duplicates()
    .set_index('barcode')
)
barcode_info.shape

In [None]:
# Left-join barcode_info onto fit0 by index; indicator=True adds a '_merge' column
# recording where each row came from, and its distinct values are displayed.
x = pd.merge(
    fit0,
    barcode_info,
    how='left',
    left_index=True,
    right_index=True,
    indicator=True,
)
x['_merge'].unique()


In [None]:
# Shape of fit0, for comparison with the merged frame `x`.
fit0.shape

In [None]:
# Shape of the merged frame `x`.
x.shape

In [None]:
# Keep only day 'd1' rows of x (rebinding x), then show the rows whose ShortName is 'rfaI'.
x = x.loc[x['day'] == 'd1']
x.loc[x['ShortName'] == 'rfaI']

In [None]:
# Rows of x with padj below 0.05.
x.loc[x['padj'] < 0.05]

In [None]:
# Histogram (25 bins) of log2FoldChange for the fit0 rows whose index label
# also appears in controls.barcode.
fit0.loc[
    fit0.index.intersection(controls['barcode'].values),
    'log2FoldChange',
].hist(bins=25)