In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px

# Seaborn context is applied first; the matplotlib overrides below go on top of it.
sns.set_context("notebook", font_scale=1.1)

# Display up to 100 rows and 100 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Matplotlib defaults for every figure produced in this notebook.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
})
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

# Load Ground Truth data

In [None]:
# List the contents of the benchmark results directory (shell `ls` via IPython magic).
%ls /nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/02_22_result_benchmarks/

In [None]:
# Ground-truth table: keep only the first six CSV columns and add the
# log2-transformed ground-truth CI as `log_gt_CI`.
outDir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/02_22_result_benchmarks")
gt_file = outDir.joinpath("15-02-2022-ground_truth.csv")
gtDf = (
    pd.read_csv(gt_file)
    .iloc[:, :6]
    .assign(log_gt_CI=lambda d: np.log2(d.gt_CI))
)

# Load current results

In [None]:
# Location of the z-score results file for all libraries.
resDir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/08_21/results/nguyenb")
res_file = resDir.joinpath('24-11-2021-all-libraries-zscores.csv')

In [None]:
# Read the results (first CSV column is the index), rename `ci` to `CI`,
# and add the log2-transformed CI as `log_CI`.
resDf = (
    pd.read_csv(res_file, index_col=0)
    .rename(columns={'ci': 'CI'})
    .assign(log_CI=lambda d: np.log2(d.CI))
)

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
resDf.sample(5, random_state=42)

In [None]:
# Try for all: inner-join the screen results with the ground truth on gene and day.
comp = pd.merge(resDf, gtDf, on=['gene', 'day'], how='inner')
comp.head()


In [None]:
# 1. correlation between CI for each library / day
# Column names of the ground-truth and the experimental (screen) log2 CI.
gt_CI, exp_CI = "log_gt_CI", "log_CI"


In [None]:
def get_ci_corr(comp, gt_CI, exp_CI, method):
    """Correlate ground-truth and screen values per library and day.

    Parameters
    ----------
    comp : pandas.DataFrame
        Must contain the columns 'library' and 'day' plus the two columns
        named by ``gt_CI`` and ``exp_CI``.
    gt_CI : str
        Name of the ground-truth value column.
    exp_CI : str
        Name of the screen value column.
    method : str
        Label stored in the 'method' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per (library, day) with the columns
        ['library', 'day', 'R2', 'method', 'r2lib'].
        'R2' is the coefficient returned by pandas' ``corr()`` for the two
        columns within that group (it is not squared; the column name is
        kept for existing callers). 'r2lib' is the mean of 'R2' over all
        days of the same library.
    """
    corr_mat = comp.groupby(['library', 'day'])[[gt_CI, exp_CI]].corr()
    # Pick the gt-vs-screen coefficient by label. A positional `iloc[:, 1]`
    # depends on the column order produced by `unstack()` and would silently
    # return the diagonal (always 1.0) if that order differs.
    pairwise = corr_mat.unstack()[(gt_CI, exp_CI)]
    corr_df = pairwise.reset_index()
    corr_df.columns = ['library', 'day', 'R2']
    corr_df = corr_df.assign(method=method)
    # Broadcast the per-library mean back onto every (library, day) row.
    corr_df['r2lib'] = corr_df.groupby('library')['R2'].transform('mean')
    return corr_df

In [None]:
# Per-library/day correlation of the z-score pipeline's log2 CI with the ground truth.
zscore_corr = get_ci_corr(
    comp,
    gt_CI="log_gt_CI",
    exp_CI="log_CI",
    method='zscore',
)

In [None]:
def compare_to_gt(compDf, gt_padj="gt_padj", exp_padj='padj', alpha=0.05):
    """Classify each row as TP/TN/FP/FN against the ground truth.

    A row is a ground-truth hit when ``compDf[gt_padj] < alpha`` and a
    screen hit when ``compDf[exp_padj] < alpha``. Missing (NaN) p-values
    compare as False and are therefore counted as non-hits.

    Parameters
    ----------
    compDf : pandas.DataFrame
        Table holding both adjusted p-value columns. It is not modified;
        the annotated table is returned as a new frame.
    gt_padj : str
        Column with the ground-truth adjusted p-values.
    exp_padj : str
        Column with the screen adjusted p-values.
    alpha : float
        Significance cut-off applied to both columns (default 0.05).

    Returns
    -------
    compDf : pandas.DataFrame
        Copy of the input with the boolean columns
        'gt_hits', 'screen_hits', 'TP', 'TN', 'FP', 'FN' added.
    confMat : pandas.DataFrame
        2x2 count matrix, rows 'Real Pos'/'Real Neg',
        columns 'Pred Pos'/'Pred Neg'.
    prec : float
        TP / (TP + FP); NaN when nothing was predicted positive.
    recall : float
        TP / (TP + FN); NaN when there are no real positives.
    """
    gt_hits = compDf[gt_padj] < alpha
    screen_hits = compDf[exp_padj] < alpha
    # `assign` builds a new frame, so the caller's table (often a groupby
    # slice) is never written to.
    compDf = compDf.assign(
        gt_hits=gt_hits,
        screen_hits=screen_hits,
        TP=gt_hits & screen_hits,
        TN=~gt_hits & ~screen_hits,
        FP=~gt_hits & screen_hits,
        FN=gt_hits & ~screen_hits,
    )
    # Order TP, FN, FP, TN fills the 2x2 matrix row by row:
    # [[TP, FN], [FP, TN]].
    counts = compDf[['TP', 'FN', 'FP', 'TN']].sum()
    confMat = pd.DataFrame(
        counts.values.reshape((2, 2)),
        index=['Real Pos', 'Real Neg'],
        columns=['Pred Pos', 'Pred Neg'],
    )
    true_pos = confMat.loc['Real Pos', 'Pred Pos']
    n_pred_pos = confMat['Pred Pos'].sum()
    n_real_pos = confMat.loc['Real Pos'].sum()
    # Explicit NaN instead of a 0/0 division warning.
    prec = true_pos / n_pred_pos if n_pred_pos else float('nan')
    recall = true_pos / n_real_pos if n_real_pos else float('nan')
    return compDf, confMat, prec, recall

In [None]:
# Pair each library (in order of first appearance in resDf) with a colour
# from plotly's qualitative "Safe" palette. `zip` stops at the shorter of
# the two, so libraries beyond the palette length get no entry.
clrs = px.colors.qualitative.Safe
libraries = resDf.library.unique()
library_clrs = dict(zip(libraries, clrs))

In [None]:
# One bar per library: mean correlation (r2lib) of the z-score results,
# with bars ordered by increasing r2lib.
per_lib_corr = zscore_corr[['library', 'r2lib', 'method']].drop_duplicates()
fig = px.bar(
    per_lib_corr,
    x='library',
    y='r2lib',
    color='library',
    hover_data=['method'],
    labels={'library':'Library', 'r2lib': 'R2'},
    title="R2 between RBSeq CIs and experimental CIs",
    template='simple_white',
    color_discrete_map=library_clrs,
    category_orders={'library': per_lib_corr.sort_values('r2lib').library.values},
)

# Centre the title and enlarge the font.
fig.update_layout(
    font_size=14,
    title=dict(y=0.9, x=0.5, xanchor='center', yanchor='top'),
)
# Library names are already shown in the legend, so hide the x tick labels.
fig.update_xaxes(showticklabels=False)
fig

In [None]:
def get_stats(comp, method, gt_padj, exp_padj):
    """Compute precision and recall against the ground truth for each library.

    For every library in ``comp`` the library name and its confusion matrix
    are printed, and precision/recall are collected.

    Parameters
    ----------
    comp : pandas.DataFrame
        Merged screen/ground-truth table with a 'library' column and the
        two adjusted p-value columns.
    method : str
        Label stored in the 'method' column of the result.
    gt_padj : str
        Column with the ground-truth adjusted p-values.
    exp_padj : str
        Column with the screen adjusted p-values.

    Returns
    -------
    pandas.DataFrame
        One row per library with the columns
        ['library', 'precision', 'recall', 'method'].
    """
    pr = []
    for lib, g in comp.groupby('library'):
        print(lib)
        # Hand over a copy: compare_to_gt adds columns to its input, and
        # writing into a groupby slice triggers SettingWithCopyWarning.
        _, mat, prec, recall = compare_to_gt(g.copy(), gt_padj, exp_padj)
        pr.append([lib, prec, recall])
        print(mat)
    pr_df = pd.DataFrame(pr, columns=['library', 'precision', 'recall']).assign(method=method)
    return pr_df


In [None]:
# Only the z-score comparison (`comp`) exists at this point. The MAGeCK
# comparisons (`comp2`, `comp3`) are built further down, so calling
# get_stats on them here raises NameError on a fresh kernel. All three
# precision/recall tables are computed together once those frames exist.
prec_zscore = get_stats(comp, 'zscore', "gt_padj", "padj")

In [None]:
# Per-library precision and recall of the z-score results against the ground truth.
prec_zscore

In [None]:
# px.scatter(comp, x='log_gt_CI', y='log_CI', trendline="ols", color='day', height=800, 
#           hover_data=['gene', 'library'])

# Load MAGeCK results

In [None]:
# Batch-corrected MAGeCK output: keep the needed columns, take the smaller of
# the negative- and positive-selection FDR as `fdr`, and rename `id`/`contrast`
# to `gene`/`day` so the table can be merged with the ground truth.
maDir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/02_22_mageck")
mres = maDir.joinpath('16-02-2022-batch-corrected-9-libraries.csv')
maDf = (
    pd.read_csv(mres)[['id', 'neg|fdr', 'neg|lfc', 'pos|fdr', 'contrast', 'library']]
    .assign(fdr=lambda d: d[['neg|fdr', 'pos|fdr']].min(axis=1))
    .rename(columns={'id':'gene', 'contrast':'day'})
)

In [None]:
# Inner-join the batch-corrected MAGeCK results with the ground truth, then
# correlate MAGeCK's negative-selection LFC with the ground-truth log2 CI.
comp2 = pd.merge(maDf, gtDf, on=['gene', 'day'], how='inner')
mageck_corr = get_ci_corr(
    comp2,
    gt_CI="log_gt_CI",
    exp_CI="neg|lfc",
    method='mageck',
)


In [None]:
# One bar per library: mean correlation (r2lib) between MAGeCK's
# negative-selection LFC and the ground-truth log2 CI, ordered by r2lib.
per_lib_corr_mageck = mageck_corr[['library', 'r2lib', 'method']].drop_duplicates()
fig = px.bar(per_lib_corr_mageck, template='simple_white',
       color_discrete_map= library_clrs,
       category_orders={'library': per_lib_corr_mageck.sort_values('r2lib').library.values},
       x='library', y='r2lib', color='library', 
      labels={'library':'Library', 'r2lib': 'R2'},
      # The title names the MAGeCK quantity actually plotted; it previously
      # repeated the z-score figure's title word for word.
      title="R2 between MAGeCK LFCs and experimental CIs", hover_data=['method'])

# Centre the title and enlarge the font.
fig.update_layout(
    
    font_size=14,
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)
# Library names are already shown in the legend, so hide the x tick labels.
fig.update_xaxes(showticklabels=False)
fig

In [None]:
#px.line(corr_df_mageck, x='day', y='R2', color='library', markers=True)

In [None]:
# MAGeCK output without batch correction, prepared the same way as the
# batch-corrected table: column subset, `fdr` = min(neg|fdr, pos|fdr),
# and `id`/`contrast` renamed to `gene`/`day`.
mresnoBatch = maDir.joinpath('16-02-2022-not-batch-corrected-9-libraries.csv')
maDf2 = (
    pd.read_csv(mresnoBatch)[['id', 'neg|fdr', 'neg|lfc', 'pos|fdr', 'contrast', 'library']]
    .assign(fdr=lambda d: d[['neg|fdr', 'pos|fdr']].min(axis=1))
    .rename(columns={'id':'gene', 'contrast':'day'})
)

In [None]:
# Join the non-batch-corrected MAGeCK results with the ground truth and
# compute, per library and day, the correlation between the ground-truth
# log2 CI and MAGeCK's negative-selection LFC.
comp3 = maDf2.merge(gtDf, on=['gene', 'day'])
exp_CI = 'neg|lfc'
corr_df_mageck2 = (
    comp3.groupby(['library', 'day'])[[gt_CI, exp_CI]]
    .corr()
    # Select the gt-vs-screen coefficient by label; a positional
    # `iloc[:, 1]` depends on the column order that unstack() produces.
    .unstack()[(gt_CI, exp_CI)]
    .reset_index()
)
corr_df_mageck2.columns = ['library', 'day', 'R2']

In [None]:
# Correlation per day, one line per library (non-batch-corrected MAGeCK).
px.line(
    corr_df_mageck2,
    x='day',
    y='R2',
    color='library',
    markers=True,
)

In [None]:
# Precision/recall per library for each of the three methods, stacked into
# one table (one row per library and method).
prec_zscore = get_stats(comp, method='zscore', gt_padj="gt_padj", exp_padj="padj")
prec_mageck = get_stats(comp2, method='mageck', gt_padj="gt_padj", exp_padj="fdr")
prec_magnobatch = get_stats(comp3, method='mageck-noBatch', gt_padj="gt_padj", exp_padj="fdr")
precision_all = pd.concat(
    [prec_zscore, prec_mageck, prec_magnobatch],
    axis=0,
)

In [None]:
# Precision per method, bars grouped by library and ordered by precision.
px.bar(
    precision_all,
    x='method',
    y='precision',
    color='library',
    barmode='group',
    template='simple_white',
    color_discrete_map=library_clrs,
    category_orders={'library': precision_all.sort_values('precision').library.values},
)

In [None]:
# Recall per method, bars grouped by library and ordered by recall.
px.bar(
    precision_all,
    x='method',
    y='recall',
    color='library',
    barmode='group',
    template='simple_white',
    color_discrete_map=library_clrs,
    category_orders={'library': precision_all.sort_values('recall').library.values},
)

In [None]:
# Reshape to long form: one row per (library, method, metric) with the value
# in `prop`. The table is rebuilt from the per-method frames instead of from
# `precision_all` itself, so re-running this cell gives the same result
# rather than melting the already-long table a second time.
precision_all = pd.concat([prec_zscore, prec_mageck, prec_magnobatch]).melt(
    id_vars=['library', 'method'],
    value_vars=['precision', 'recall'],
    var_name='metric',
    value_name='prop',
)

In [None]:

# Distribution of precision and recall across libraries, one facet per
# metric and one box per method.
fig = px.box(
    precision_all,
    x='method',
    y='prop',
    color='method',
    facet_col='metric',
    template="simple_white",
    width=700,
    height=600,
)
fig.update_yaxes(tickvals=[0, 0.25, 0.5, 0.75, 1])

In [None]:
# Precision/recall rows of a single library across all methods.
precision_all.loc[precision_all['library'] == 'library_10_1']