In [None]:
# %load 10_2022_load_config.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

sns.set_context("notebook", font_scale=1.4)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
plt.rcParams["figure.figsize"] = (16, 12)
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.autolayout'] = False
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
pd.set_option('display.float_format', lambda x: '{:,.4f}'.format(x))


config_file = "10_2022_analysis.yaml"
with open(config_file) as file:
    # Parse the YAML config into a Python dictionary.
    # NOTE(review): FullLoader accepts more than plain scalars/lists/dicts; if this
    # config only contains plain data, yaml.safe_load(file) would be the stricter
    # choice — confirm before switching.
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Paths for running on the server. `root` is the base directory; scratchDir and
# figuresDir are resolved against it here, while resultsDir, countsDir and
# sampleDataFile are kept as read from the config and joined with `root` at
# the point of use.
root = Path(configs['root'])
scratchDir = root/configs['scratchDir']
figuresDir = root/configs['figuresDir']
libraries = configs['libraries']
resultsDir = configs['resultsDir']
countsDir = configs['countsDir']
sampleDataFile = configs['sampleDataFile']

alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {'grey': alphabetClrs[8], 
        'light_yellow': clrs[0],
        'darko': clrs[1],
        'maroon':clrs[2],
        'brighto': clrs[3],
        'teal':clrs[4],
        'darkteal':clrs[5]
       }

In [None]:
from scipy.stats import wilcoxon
import pyranges as pr

# Load the count files

In [None]:
# All files in the counts directory matching *_mbarq_merged_counts.csv,
# plus the sample sheet; both paths are resolved against `root`.
countFiles = list((root / countsDir).glob("*_mbarq_merged_counts.csv"))
sampleData = pd.read_csv(root / sampleDataFile)

In [None]:
# The inoculum subset of the sample sheet does not depend on the file being
# processed, so it is selected once instead of on every loop iteration.
inocSd = sampleData[sampleData.mouse == 'inoculum']

df_list = []
for f in countFiles:
    # Library name is the part of the file stem before "_mbarq".
    cdf = pd.read_csv(f).assign(library=f.stem.split("_mbarq")[0])
    # Wide (one column per sample) -> long (one row per barcode and sample).
    cdf = cdf.melt(id_vars=['barcode', 'Name', 'library'], var_name='sampleID', value_name='cnts')
    # Inner merge keeps only the rows whose (library, sampleID) is an inoculum sample.
    cdf = cdf.merge(inocSd, on=['library', 'sampleID'], how='inner')
    # Per gene and library: summary statistics of the inoculum counts.
    cdfSum = (cdf.groupby(['Name', 'library'])
              .agg(med_inoculum_abundance=('cnts', 'median'),
                   cnts_mean=('cnts', 'mean'),
                   cnts_min=('cnts', 'min'),
                   cnts_max=('cnts', 'max'))
              .reset_index())
    df_list.append(cdfSum)

In [None]:
cntsSum = pd.concat(df_list)[['Name', 'library', 'med_inoculum_abundance']]

In [None]:
# log2(x + 1), so a median count of zero maps to 0 instead of -inf.
# NOTE: the column name is misspelled ("abundace"); later cells reference this
# exact spelling, so it must not be corrected here alone.
cntsSum['log_med_inoculum_abundace'] = np.log2(cntsSum.med_inoculum_abundance +1)

In [None]:
cntsSum[cntsSum.Name == 'gudT']

# Process experimental data from Bidong

In [None]:
# First experimental data set: one CI value per row, summarised per gene/day below.
expCIs = pd.read_csv(root/configs['experimentalData'])
# Second data set: only the first six columns are used; it already provides
# gt_CI / gt_pval / gt_padj, which are renamed to the exp_* naming used here.
expCIs2 = pd.read_csv(root/configs['experimentalData2']).iloc[:,:6]
# The log must be taken before the rename below, while the column is still 'gt_CI'.
expCIs2['log_exp_CI'] = np.log2(expCIs2.gt_CI)
expCIs2 = expCIs2.rename({'gt_CI': 'exp_CI', 'gt_pval':'exp_pval', 'gt_padj': 'exp_padj'}, axis=1)
# Drop every row that has a missing value in any column.
expCIs = expCIs.dropna()
# Value subtracted from each CI before the Wilcoxon signed-rank test.
hypMed = 1.0
# Element [1] of scipy's wilcoxon result is the p-value.
pval = expCIs.groupby(['gene', 'day']).apply(lambda x: wilcoxon(x.CI.values-hypMed)[1]).reset_index()
pval.columns = ['gene', 'day', 'exp_pval']
# Median CI and number of observations per gene/day.
med = expCIs.groupby(['gene', 'day']).agg({'CI':['median', 'count']}).reset_index()
med.columns = ['gene', 'day', 'exp_CI', 'num_mice']
expRes = pval.merge(med, on=['gene', 'day'])
expRes['log_exp_CI'] = np.log2(expRes.exp_CI)
# NOTE(review): presumably expCIs2 also carries 'gene' and 'day' columns so that
# the stacked rows line up — confirm against the file.
expRes = pd.concat([expRes, expCIs2])
# Align key names with the results tables ('Name', 'contrast').
expRes = expRes.rename({'gene': 'Name', 'day': 'contrast'}, axis=1)

In [None]:
expRes.head()

# Load Results

## Post-processing: merging different libraries

In [None]:
# One *_rra_results.csv per library; the library name is the part of the
# file stem before "_rra". All libraries are stacked into a single frame.
resFiles = [f for f in (root/resultsDir).glob("*_rra_results.csv")]
results = pd.concat([pd.read_csv(f).assign(library=f.stem.split("_rra")[0]) for f in resFiles])
# Single FDR per row: the smaller of the negative- and positive-selection FDRs.
results['fdr'] = results[['neg_selection_fdr', 'pos_selection_fdr']].min(axis=1)
#results = results[~results.Name.str.contains(":")]

In [None]:
results.sample(5)

## Adding GFF annotation

In [None]:
gff = pr.read_gff3(configs['gff'], as_df=True)
# Gene coordinates. Chromosome is cast to str inside the chain so the column is
# set on a fresh frame rather than on a filtered slice of `gff`
# (avoids pandas' SettingWithCopyWarning).
gene_info = (gff.loc[gff.Feature == 'gene', ['Chromosome', 'Start', 'End', 'Name', 'locus_tag']]
             .assign(Chromosome=lambda d: d.Chromosome.astype('str')))
# protein_id <-> locus_tag mapping taken from the CDS features.
prot_info = gff.loc[gff.Feature == 'CDS', ['protein_id', 'locus_tag']]

## Adding EGGNOG annotation

In [None]:
# Read the eggNOG annotation table and keep twelve columns selected by position.
# NOTE(review): comment='#' skips every line starting with '#'. If the header
# row of this file also starts with '#', pandas will promote the first data row
# to header and that record is lost once the columns are renamed below —
# confirm against the file (header=None would avoid it).
ann = pd.read_table(configs["eggnog"], comment='#').iloc[:, [0, 8, 10,11, 12,13,14,15,16,17,18,19]]
# Names assigned positionally to the selected columns.
ann.columns = ['protein_id', 'Prefered_Name', 'EC', 'KEGG_ko', 'KEGG_Pathway',
       'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',
       'CAZy', 'BiGG_Reaction']
# Outer merges: rows are kept even when a locus_tag or protein_id is present
# in only one of the tables being joined.
ann_info = (prot_info.merge(gene_info, on='locus_tag', how='outer')
            .merge(ann, on='protein_id', how='outer'))

In [None]:
fdf = results.merge(ann_info, on='Name', how='outer')

#fdf = fdf.fillna('N/A')

In [None]:
fdf.sample(10)

In [None]:
fdf[(fdf.Name == 'SL1344_2750') & (fdf.contrast == 'd1')]

In [None]:
# TODO: this annotated table should be produced by mbarq itself, given the GFF and eggNOG files
# (everything except the inoculum-abundance part).

## Providing some summary data

In [None]:
results = results.merge(cntsSum, on=['Name', 'library']).drop('med_inoculum_abundance', axis=1)

In [None]:
results.sample(10)

In [None]:
results['nomWA'] = results['LFC']* results['log_med_inoculum_abundace']

In [None]:
# Per gene and contrast, across libraries: LFC mean/median/std, the number of
# distinct libraries, and the two sums (weights and weighted LFC) used for the
# abundance-weighted mean LFC.
resSum = (
    results
    .groupby(['Name', 'contrast'])
    .agg(
        LFC_mean=('LFC', 'mean'),
        LFC_median=('LFC', 'median'),
        LFC_std=('LFC', 'std'),
        library_count=('library', 'nunique'),
        denomWA=('log_med_inoculum_abundace', 'sum'),
        nomWA=('nomWA', 'sum'),
    )
    .reset_index()
)

In [None]:
resSum['LFC_weighted_mean'] = resSum['nomWA']/resSum['denomWA']

In [None]:
resSum[(~resSum.LFC_std.isna()) & (resSum.contrast == 'd1')].sample(10)

In [None]:
fdf = fdf.merge(resSum, on=['Name', 'contrast'], how='left').drop(['nomWA', 'denomWA'], axis=1)
fdf = fdf.merge(cntsSum, on=['Name', 'library'], how='left').drop('med_inoculum_abundance', axis=1)

In [None]:
fdf.sample(10)

## Fitting linear model to experimental data to correct LFC

In [None]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
comp = results.merge(expRes, on=['Name', 'contrast'])
lm = smf.ols(formula='log_exp_CI ~ LFC+contrast', data=comp).fit()

In [None]:
lm.summary()

In [None]:
results['expected_CI'] = lm.predict(results[['LFC', 'contrast']])
fdf2 = fdf.merge(results[['Name', 'contrast', 'expected_CI', 'library']], on=['Name', 'contrast', 'library'], how='left')

In [None]:
fdf2.LFC.isna().sum()

In [None]:
fdf2.sample(10)

In [None]:
fdf2[(fdf2.Name == 'rfaD') & (fdf2.contrast == 'd1')]

In [None]:
fdf2.drop_duplicates().to_csv(root/resultsDir/'26-10-22-annotated-results.csv', index=False)

In [None]:
fdf2.shape

# Comparing to Experimental Data

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score

# From here on `clrs` is plotly's qualitative "Safe" palette and `libraries`
# is this fixed, sorted list (both names were assigned earlier in the notebook
# and are overwritten here).
clrs = px.colors.qualitative.Safe
libraries = sorted([
    'library_15_1',
    'library_13_2',
    'library_9_1',
    'library_10_1',
    'library_11_2',
    'library_12_1',
    'library_12_2',
    'library_13_1',
    'library_10_2',
    'library_14_2',
])

# Pair each library (in sorted order) with a palette colour; zip stops at the
# shorter of the two sequences.
library_clrs = dict(zip(libraries, clrs))

def get_ci_corr(comp, exp_CI, comp_CI, method):
    """Correlate experimental and computed CIs per library and day.

    Parameters
    ----------
    comp : pandas.DataFrame
        Must contain 'library', 'day' and the two value columns.
    exp_CI, comp_CI : str
        Names of the experimental and computed value columns.
    method : str
        Label stored in the 'method' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per (library, day) with columns
        ['library', 'day', 'R', 'R2', 'method', 'r2lib', 'rlib'], where
        'R' is the pairwise correlation of the two columns within the group and
        'r2lib' / 'rlib' are the per-library means of R2 / R over days.
    """
    # Pick the (exp_CI, comp_CI) entry of each group's 2x2 correlation matrix
    # by label rather than by position, so the result does not depend on how
    # the unstacked columns happen to be ordered.
    corr_df = (comp.groupby(['library', 'day'])[[exp_CI, comp_CI]].corr()
               .xs(exp_CI, level=-1)[comp_CI]
               .rename('R')
               .reset_index())
    corr_df['R2'] = corr_df['R'] ** 2
    corr_df['method'] = method
    # Broadcast the per-library means back onto every (library, day) row.
    by_lib = corr_df.groupby('library')
    corr_df['r2lib'] = by_lib['R2'].transform('mean')
    corr_df['rlib'] = by_lib['R'].transform('mean')
    return corr_df


def compare_to_gt(compDf, exp_padj = "exp_padj", comp_padj = 'fdr', alpha=0.05):
    """Score computed hits against experimental hits.

    A row is a hit when its p-value column is below `alpha`; missing values
    compare as False and therefore count as non-hits.

    Parameters
    ----------
    compDf : pandas.DataFrame
        Table holding both p-value columns. It is not modified.
    exp_padj : str
        Column with the experimental p-values (used as ground truth).
    comp_padj : str
        Column with the computed p-values (used as prediction).
    alpha : float
        Significance cutoff applied to both columns.

    Returns
    -------
    tuple
        (annotated copy of compDf with boolean 'exp_hits' / 'comp_hits' columns,
        confusion matrix, precision, recall, F1, balanced accuracy, ROC AUC).
        Note that the ROC AUC is computed from the thresholded hit calls,
        not from continuous scores.
    """
    # Work on a copy: callers pass groupby slices, and writing the hit columns
    # into those mutates the caller's data and triggers SettingWithCopyWarning.
    compDf = compDf.copy()
    compDf['exp_hits'] = compDf[exp_padj] < alpha
    compDf['comp_hits'] = compDf[comp_padj] < alpha
    y_true = compDf['exp_hits'].values
    y_pred = compDf['comp_hits'].values
    confMat = confusion_matrix(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    recl = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    bacc = balanced_accuracy_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    return compDf, confMat, prec, recl, f1, bacc, roc


def get_stats(comp, method, gt_padj, exp_padj):
    """Collect per-library classification metrics into one table.

    For every library in `comp`, prints the library name and its confusion
    matrix and records precision, recall, F1, balanced accuracy and ROC AUC.

    Note on argument names: `gt_padj` is forwarded to compare_to_gt as the
    ground-truth p-value column and `exp_padj` as the computed p-value column.

    Returns a DataFrame with columns
    ['library', 'precision', 'recall', 'f1', 'bacc', 'roc', 'method'].
    """
    rows = []
    for lib_name, lib_df in comp.groupby('library'):
        print(lib_name)
        _, conf_mat, precision, recall, f1, bacc, roc = compare_to_gt(lib_df, gt_padj, exp_padj)
        rows.append([lib_name, precision, recall, f1, bacc, roc])
        print(conf_mat)
    metric_cols = ['library', 'precision', 'recall', 'f1', 'bacc', 'roc']
    return pd.DataFrame(rows, columns=metric_cols).assign(method=method)


def get_numHits(res, pval_col, method, pval_cutoff=0.05):
    """Count distinct significant genes per library and day.

    Rows of `res` whose `pval_col` is below `pval_cutoff` are kept; the result
    has columns ['library', 'day', 'gene', 'method'], where 'gene' holds the
    number of unique genes in the group and 'method' the given label.
    """
    is_hit = res[pval_col] < pval_cutoff
    hits = res[is_hit].copy()
    genes_per_group = hits.groupby(['library', 'day'])['gene'].nunique()
    return genes_per_group.reset_index().assign(method=method)

def plot_correlations(corr_df):
    """Bar chart of the per-library mean correlation ('rlib').

    Expects the columns 'library', 'rlib' and 'method'. Bars are ordered by
    increasing 'rlib' and coloured with the module-level `library_clrs` map.
    The y axis is fixed to [0, 1], so negative values are not visible.
    Returns the plotly figure.
    """
    lib_r = corr_df[['library', 'rlib', 'method']].drop_duplicates()
    lib_order = lib_r.sort_values('rlib').library.values
    title_layout = {'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'}

    fig = px.bar(
        lib_r,
        x='library',
        y='rlib',
        color='library',
        color_discrete_map=library_clrs,
        category_orders={'library': lib_order},
        labels={'library': 'Library', 'rlib': 'R'},
        hover_data=['method'],
        template='simple_white',
        title="R between RBSeq CIs and experimental CIs",
    )
    fig.update_layout(font_size=14, title=title_layout, yaxis_range=[0, 1])
    # Library names are conveyed by the colour legend, so tick labels are hidden.
    fig.update_xaxes(showticklabels=False)
    return fig

In [None]:
# Column names handed to the comparison helpers.
exp_CI = "log_exp_CI"
# NOTE: despite the variable name, this selects the raw p-value column
# ("exp_pval"), not an adjusted one.
exp_padj = "exp_pval"
comp_CI = "LFC"
comp_padj = "fdr"

# NOTE(review): get_ci_corr groups by a 'day' column, while `comp` was built by
# merging on 'Name'/'contrast' — confirm that `comp` actually carries 'day'.
# The two result tables are tagged with different method labels
# ("mageck-new" vs "mageck").
mageck_corr = get_ci_corr(comp, exp_CI, comp_CI, "mageck-new")
mageck_stats = get_stats(comp, "mageck", exp_padj, comp_padj)
fig = plot_correlations(mageck_corr)

In [None]:
fig

In [None]:
mageck_melt = mageck_stats.melt(id_vars=['library', 'method'])

In [None]:
fig = px.box(mageck_melt, x='variable', y='value', color='variable', template='plotly_white', 
             height=600, width=600)
fig.update_layout(yaxis_range = [0,1.1] )

In [None]:
#comp_short = comp[~comp.library.isin(['library_9_1', 'library_10_1', 'library_13_2'])]
# Computed LFC vs. experimental log CI for day 2, all libraries, with an OLS trendline.
# The `labels` keys must be the plotted column names; the previous keys
# ('log_CI', 'log_gt_CI') matched no column, so the axis titles were never replaced.
fig = px.scatter(comp[comp.day== 'd2'],
        x='LFC', y='log_exp_CI', trendline='ols', width=1000,
           height=1000,  template='simple_white',
                 title='MAGeCK Analysis',
           labels = {'LFC': 'RBSeq CI', 'log_exp_CI': 'Validated CI'}
          )
# Same range on both axes so the two quantities are directly comparable.
fig.update_layout(yaxis_range = [-10,4], xaxis_range = [-10,4] )
fig

In [None]:
# Computed LFC vs. experimental log CI for library_11_2, one facet row per day.
# The `labels` keys must be the plotted column names; the previous keys
# ('log_CI', 'log_gt_CI') matched no column, so the axis titles were never replaced.
fig = px.scatter(comp[comp.library=='library_11_2'],
        x='LFC', y='log_exp_CI', facet_row='day', trendline='ols', width=400,
           height=1000, color='day', template='simple_white',
                 title='MAGeCK Analysis', hover_data=['gene'],
           labels = {'LFC': 'RBSeq CI', 'log_exp_CI': 'Validated CI'}
          )

fig

In [None]:
results[(results.library == 'library_11_2') & (results.day == 'd1')].LFC.mean()

In [None]:
results[(results.library == 'library_11_2') & (results.day == 'd1')].expected_CI.mean()

In [None]:
results[(results.library == 'library_10_1') & (results.day == 'd1')].LFC.hist(bins=100)
results[(results.library == 'library_10_1') & (results.day == 'd1')].expected_CI.hist(bins=100)

In [None]:
results.gene.nunique()

In [None]:
results[(results.LFC > 1) & (results.fdr < 0.05)].gene.nunique()

In [None]:
results[(results.expected_CI > 1) & (results.fdr < 0.05)]

In [None]:
resSum.head()

In [None]:
# Per-gene correlation between LFC and the numeric day taken from the contrast label.
corr_df = results[['Name', 'LFC', 'contrast', 'library']].copy()

# Wide table: one row per (library, contrast), one column per gene, values = LFC.
corr_df = corr_df.pivot(index=['library', 'contrast'], columns='Name', values='LFC')
# Extra column holding the contrast as an integer (index level 1 with 'd' stripped);
# it is appended after all gene columns.
corr_df['contrast'] = [int(d[1].strip('d')) for d in corr_df.index]
# Pairwise correlation between all columns; pairs with fewer than 6 shared
# observations yield NaN.
corr_df = corr_df.corr(min_periods=6)
corr_df.index.name = 'gene2'
# Long format: one row per column pair, NaN correlations removed.
corr_df = corr_df.unstack().reset_index().dropna()
# Keep each column's correlation with 'contrast'. The final row is dropped —
# presumably the 'contrast' vs 'contrast' self-correlation, since that column
# was appended last; verify.
corr_df = corr_df[corr_df.gene2 == 'contrast'].iloc[:-1, ]
# Result: the 'Name' column plus "correlation_dpi".
corr_df = corr_df.drop('gene2', axis=1).rename({0: "correlation_dpi"}, axis=1)


In [None]:
corr_df

In [None]:
import pyranges as pr
gff = pr.read_gff3(configs['gff'], as_df=True)
# Keep only gene features and their coordinates.
gff = gff[gff.Feature == 'gene']
gff = gff[['Chromosome', 'Start', 'End', 'Name']]
# NOTE(review): these merges key on 'gene'/'day', but resSum is built earlier in
# this notebook with its columns set to 'Name'/'contrast', and corr_df is pivoted
# on 'Name'. This cell looks like it predates that naming — confirm it still
# runs on a fresh kernel.
fdf = (results.merge(resSum, on = ['gene', 'day'], how='left')
       .merge(corr_df, on=['gene'], how='left')
      .merge(gff, left_on='gene', right_on='Name', how='left'))

# NOTE(review): ann_info as built earlier also includes Chromosome/Start/End
# (via gene_info), so this merge may produce suffixed duplicates and leave no
# plain 'Chromosome' column for the next line — confirm.
fdf = fdf.merge(ann_info, on="Name", how='outer')
fdf['Chromosome'] = fdf.Chromosome.astype('str')
# Every missing value, in every column, becomes the string 'N/A'.
fdf = fdf.fillna('N/A')

In [None]:
fdf = fdf.sort_values(['Chromosome', 'Start'])

In [None]:
to_draw = fdf[['expected_CI', 'LFC_mean', 'LFC_std', 'library_count', 'day', 'correlation_dpi', 'Chromosome', 'Start', 'Name']].drop_duplicates()

In [None]:
to_draw = to_draw[to_draw.Chromosome == 'FQ312003.1']

In [None]:
to_draw.sample(19)

In [None]:
px.scatter(to_draw, x='Start', y='LFC_mean', size='library_count', color='day',
           height=800, width=1600, hover_data=['Name', 'LFC_mean', 'correlation_dpi'])

In [None]:
fdf.sample(20)

In [None]:
fdf[fdf.KEGG_Pathway.str.contains('ko00380')].Name.unique()

In [None]:
fdf[fdf.Name =='cysE']

In [None]:
fdf[fdf.gene == 'ydiO']

In [None]:
fdf.to_csv(root/resultsDir/'18-10-22-annotated-results.csv', index=False)

In [None]:
def sig_results(df, th=1, fdr_th=0.01):
    """Select rows with a large and significant effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'LFC', 'neg_selection_fdr' and 'pos_selection_fdr'.
    th : float
        A row passes when |LFC| is strictly greater than this value.
    fdr_th : float
        A row passes when either the negative- or the positive-selection FDR
        is strictly below this value (default 0.01).

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that satisfy both conditions.
    """
    strong_effect = df.LFC.abs() > th
    significant = (df.neg_selection_fdr < fdr_th) | (df.pos_selection_fdr < fdr_th)
    return df[strong_effect & significant]

In [None]:
fdf2 = fdf.replace('N/A', np.nan)
fdf_sig = sig_results(fdf2, 1)

In [None]:
fdf_sig

In [None]:
early = fdf_sig[fdf_sig.day.isin(['d1', 'd2'])].gene.unique()

In [None]:
'hflC' in early

In [None]:
for i in early:
    print(i)

In [None]:
late = fdf_sig[(fdf_sig.day.isin(['d3', 'd4'])) & (fdf_sig.LFC < 0)].gene.unique()
late = [g for g in late if g not in early]

In [None]:
for i in late:
    print(i)

In [None]:
fdf[fdf.Name == 'SL1344_3749']

In [None]:

# na=False: rows without a KEGG pathway annotation (NaN after 'N/A' was replaced
# by NaN) count as non-matches. Without it the mask itself contains NaN and
# boolean indexing raises.
test = fdf_sig[fdf_sig.KEGG_Pathway.str.contains('ko02060', na=False)]

In [None]:
test = test[['Name', 'LFC', 'library', 'day', 'fdr', 'EC', 'KEGG_Pathway']].replace('N/A', np.nan).dropna()

In [None]:
test

In [None]:
test[test.Name == 'SL1344_4469'].groupby(['day']).LFC.mean()

In [None]:
sns.barplot(data=test[test.day == 'd1'], x='Name', y='LFC')

In [None]:
sns.barplot(data=test[test.day == 'd3'], x='Name', y='LFC')