# Load Settings and Configs

In [None]:
# %load load_manuscript_data.py
from datetime import date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

sns.set_context("notebook", font_scale=1.4)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
plt.rcParams["figure.figsize"] = (16, 12)
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.autolayout'] = False
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
pd.set_option('display.float_format', lambda x: '{:,.4f}'.format(x))


config_file = "manuscript_config.yaml"
with open(config_file) as file:
    # FullLoader converts the YAML document into native Python objects
    # (here, a nested dictionary of settings)
    configs = yaml.load(file, Loader=yaml.FullLoader)
    
root = Path(configs['root'])
scratchDir = Path(configs['scratchDir'])
figuresDir = Path(configs['figuresDir'])

alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {'grey': alphabetClrs[8], 
        'light_yellow': clrs[0],
        'darko': clrs[1],
        'maroon':clrs[2],
        'brighto': clrs[3],
        'teal':clrs[4],
        'darkteal':clrs[5]
       }

sushi_colors = {'red': '#C0504D',
             'orange': '#F79646',
             'medSea': '#4BACC6', 
             'black': '#000000',
             'dgreen': '#4DAF4A', #'#00B04E',
             'lgreen': '#92D050',
             'dblue': '#366092',
             'lblue': '#95B3D7'}
today = date.today().strftime("%d-%m-%y")

# Nguyen et al 2020

## Load the data

In [None]:
nguyenConfig = configs['nguyen']
countsFile = root/nguyenConfig['countsFile']
resultsFile = root/nguyenConfig['resultsFile']
sampleDataFile = root/nguyenConfig['sampleDataFile']
publishedResultsFile = root/nguyenConfig['publishedResultsFile']
publishedPhenotypesFile = root/nguyenConfig['publishedPhenotypesFile']
mapFile = root/nguyenConfig['mapFile']['filtered']

## Define hits

In [None]:
# Build `compCntrl`: one row per (locus_tag, contrast) holding the mBARq result,
# the published competitive index (CI) and a label saying which analysis
# called the gene a hit.
publishedResults = pd.read_csv(publishedResultsFile, skiprows=1)
results = pd.read_csv(resultsFile)

# What is a hit (mBARq)?
# FDR <= 0.05 in either selection direction AND |LFC| >= 0.6,
# with the LFC rounded to one decimal before thresholding.
results = results.rename({'Name': 'locus_tag'}, axis=1)
results['LFC'] = round(results['LFC'], 1)
results['mbarq_hit'] = ((results['neg_selection_fdr'] <= 0.05) | (results['pos_selection_fdr'] <= 0.05)) & (abs(results.LFC) >= 0.6)

# Attach gene names from the mapping file
annotations = (pd.read_csv(mapFile)[['Name', 'locus_tag']]
               .drop_duplicates())
results = results.merge(annotations, how='left', on='locus_tag')

# Long format: one row per locus and per '*median_CI*' column.
# `var_name` is a scalar; a list is only meaningful for MultiIndex columns.
publishedCI = (publishedResults.melt(id_vars=['locus', 'gene'],
                                    value_vars=[c for c in publishedResults.columns if 'median_CI' in c],
                                    value_name='median_CI',
                                    var_name='contrast')
                               .rename({'locus': 'locus_tag'}, axis=1))

# The contrast label is the part of the column name before the first "_"
publishedCI['contrast'] = publishedCI.contrast.str.split("_", expand=True)[0]
publishedCI['log_median_CI'] = round(np.log2(publishedCI.median_CI), 1)

publishedHits = (publishedResults.melt(id_vars=['locus'], var_name='contrast',
                                     value_vars=[c for c in publishedResults.columns if 'adj_p_value_CI' in c],
                                     value_name='adj_pvalue')
                                 .rename({'locus': 'locus_tag'}, axis=1))
# Derive the contrast from this table's own column names. The previous version
# copied `publishedCI.contrast`, which only matched through index alignment
# (same row count and same column order in both melts).
# NOTE(review): assumes the adj_p_value_CI columns are prefixed with the
# contrast label like the median_CI columns are -- confirm against the file.
publishedHits['contrast'] = publishedHits.contrast.str.split("_", expand=True)[0]
publishedDf = publishedCI.merge(publishedHits, on=['locus_tag', 'contrast'])

# What is a hit (original analysis)?
# Adjusted p-value <= 0.05 AND |log2(median CI)| >= 0.6; encoded as 2 so that
# mbarq_hit (0/1) + published_hit (0/2) yields a distinct code per combination.
publishedDf['published_hit'] = ((publishedDf.adj_pvalue <= 0.05) & (abs(publishedDf.log_median_CI) >= 0.6)).astype(int)*2

compCntrl = results.merge(publishedDf, on=['locus_tag', 'contrast'], how='outer')
# Rows present in only one of the two tables count as "not a hit" for the other
compCntrl['mbarq_hit'] = compCntrl.mbarq_hit.fillna(0)
compCntrl['published_hit'] = compCntrl.published_hit.fillna(0)
compCntrl['hit'] = (compCntrl['mbarq_hit'] + compCntrl['published_hit']).astype(int).astype(str)
# Assign the result back: an in-place replace on `compCntrl.hit` acts on a
# temporary Series and is not propagated under pandas Copy-on-Write.
compCntrl['hit'] = compCntrl['hit'].replace({'0': 'None', '1': 'mBARq only', '2': 'Original analysis only', '3': 'Both methods'})
compCntrl = compCntrl[compCntrl.locus_tag.str.startswith('SL1344')]

## Compare CIs

In [None]:
def compare_CIs(df, contrast):
    """Scatter the mBARq LFC against the published log(CI) for one contrast.

    Parameters
    ----------
    df : DataFrame with the columns 'contrast', 'LFC', 'log_median_CI',
        'hit', 'Name' and 'gene'.
    contrast : value of ``df.contrast`` selecting the rows to plot.

    Returns
    -------
    The plotly figure; points are coloured by the 'hit' label.
    """
    legend_title = 'Significant change in fitness <br> detected by:'

    # Colour and legend order of the four hit categories
    hit_palette = {'None': colors['grey'],
                   'Both methods': sushi_colors['dgreen'],
                   'mBARq only': sushi_colors['dblue'],
                   'Original analysis only': sushi_colors['orange']}
    hit_order = ['None', 'Original analysis only', 'mBARq only', 'Both methods']
    axis_labels = {'log_median_CI': 'log(CI) (original analysis)',
                   'LFC': 'LFC (mBARq analysis)'}

    # The 'hit' column is renamed so that the legend shows the long title
    subset = df.loc[df.contrast == contrast].rename(columns={'hit': legend_title})

    fig = px.scatter(subset,
                     x='LFC',
                     y='log_median_CI',
                     color=legend_title,
                     height=800,
                     width=1000,
                     template='plotly_white',
                     labels=axis_labels,
                     color_discrete_map=hit_palette,
                     hover_data=['Name', 'gene'],
                     category_orders={legend_title: hit_order})

    marker_style = dict(size=20,
                        line=dict(width=1, color='DarkSlateGrey'),
                        opacity=0.9)
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    fig.update_layout(font={'size': 16})
    return fig

In [None]:
sig_df = compCntrl[((compCntrl.hit != 'None') & (compCntrl.contrast == 'd1')) & ((compCntrl.LFC < 0)| (compCntrl.median_CI < 1))]

In [None]:
sig_df[sig_df.Name == 'yjfQ']

In [None]:
# for g in sig_df.Name.values:
#     print(g)

In [None]:
early = compCntrl[compCntrl.contrast.isin(['d1', 'd2'])]

In [None]:
early[early.hit == 'Original analysis only'].locus_tag.nunique()


In [None]:
early[early.hit == 'Both methods'].locus_tag.nunique()

In [None]:
early[early.hit == 'mBARq only'].locus_tag.nunique()

In [None]:
fig = compare_CIs(compCntrl, 'd1')
fig.write_image(figuresDir/f"{today}_Figure2C.png", format ='png', scale=2)
fig

In [None]:
# can specify width, height, and scale to improve resolution

In [None]:
#compare_CIs(compCntrl, 'd2')

In [None]:
#compare_CIs(compCntrl, 'd3')

### Calculate CI correlations

In [None]:
corr_df = compCntrl.groupby('contrast')[['LFC', 'log_median_CI']].corr().iloc[0::2,-1].reset_index()
corr_df.columns = ['contrast', 'LFC', 'R']
corr_df['contrast'] = corr_df.contrast.replace({'d1': 'Day 1 p.i.',
                                               'd2': 'Day 2 p.i.',
                                               'd3': 'Day 3 p.i.',
                                               'd4': 'Day 4 p.i.'})

In [None]:
#corr_df = corr_df[corr_df.contrast != 'd4']
fig = px.bar(corr_df, x="R", y='contrast', color='contrast',
      color_discrete_sequence = ['black']*4,
             labels={"contrast":'', 'R': "Pearson's <i>r</i>"},
      height=350, width=500, text_auto='.2f', template='plotly_white', orientation='h')
fig.update_layout(showlegend=False)
fig.write_image(figuresDir/f"{today}_Figure2D.png", format ='png', scale=2)
fig

### Calculate recall, precision, balanced accuracy 

In [None]:
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score

# Published phenotypes, joined to the mBARq / original-analysis comparison table
phenotypes = (pd.read_csv(root/nguyenConfig["publishedPhenotypesFile"])
              .rename(columns={'locus': 'locus_tag', 'day': 'contrast'})
              .merge(compCntrl, how='left', on=['locus_tag', 'contrast']))

# Reference call: adjusted p-value <= 0.05 and |log2(median)| >= 0.5.
# NOTE(review): the original comment said "same definition of hit as for mbarq",
# but the effect-size cut-off used here is 0.5 whereas the hit definitions
# elsewhere in this notebook use 0.6 -- confirm which one is intended.
is_significant = phenotypes['adjusted p value (C.I.)'] <= 0.05
has_effect = abs(np.log2(phenotypes['median'])) >= 0.5
phenotypes['pheno_hit'] = (is_significant & has_effect).astype(int)

kept_columns = ['Name', 'gene_x', 'contrast', 'pheno_hit', 'mbarq_hit', 'published_hit']
phenotypes = phenotypes[kept_columns].dropna()
phenotypes['mbarq_hit'] = phenotypes.mbarq_hit.astype(int)
# published_hit is encoded as 0/2 upstream; bring it back to 0/1
phenotypes['published_hit'] = (phenotypes.published_hit/2).astype(int)
#phenotypes = phenotypes[phenotypes.contrast != 'd4']

# (precision, recall, balanced accuracy) of each method against pheno_hit
metrics = {
    method: (precision_score(phenotypes.pheno_hit, phenotypes[column]),
             recall_score(phenotypes.pheno_hit, phenotypes[column]),
             balanced_accuracy_score(phenotypes.pheno_hit, phenotypes[column]))
    for method, column in (('mBARq', 'mbarq_hit'),
                           ('Original analysis', 'published_hit'))
}

# Long format (Method, Metric, Score) for plotting
metric_table = pd.DataFrame(metrics, index=['Precision', 'Recall', 'Balanced Accuracy']).T
metric_table = metric_table.reset_index().rename({'index': 'Method'}, axis=1)
metricDf = metric_table.melt(id_vars=['Method'], var_name='Metric', value_name='Score')

In [None]:
phenotypes[['Name', 'contrast']].drop_duplicates().shape

In [None]:
phenotypes.pheno_hit.value_counts()

In [None]:
phenotypes.mbarq_hit.value_counts()

In [None]:
fig = px.bar(metricDf, x='Metric', y='Score', 
       color='Method', barmode='group', text_auto='.2f', 
       height=400, width=400, 
        labels = {'Metric':''},
      template='plotly_white', 
      color_discrete_map = {'mBARq':'black' , 'Original analysis': colors['grey']})
fig.write_image(figuresDir/f"{today}_Figure2E.png", format ='png', scale=2 )
fig

In [None]:
allD1 = compCntrl[compCntrl.contrast == 'd1']

In [None]:
allD1[((allD1.mbarq_hit == True) | (allD1.published_hit == 2)) & (allD1.LFC < 0)][['Name', 'mbarq_hit', 'published_hit']]

# Wetmore et al 2015

## Load the data

In [None]:
def get_bigger_tstat(x):
    """Return the element of *x* with the largest absolute value, sign kept.

    Parameters
    ----------
    x : pandas Series (anything exposing ``.values``) of numeric values.

    Returns
    -------
    The original (signed) value whose magnitude is largest. On ties the first
    occurrence wins. NaNs are ignored; if every value is NaN, NaN is returned.

    Raises
    ------
    ValueError
        If *x* is empty.
    """
    values = x.values
    magnitudes = np.abs(np.asarray(values, dtype=float))
    # Python's max() is order-dependent when NaN is present, so pick the
    # winner with a NaN-aware argmax instead.
    if magnitudes.size and np.isnan(magnitudes).all():
        return values[0]
    return values[np.nanargmax(magnitudes)]

In [None]:
wetmoreConfig = configs['wetmore']
wcountsFile = root/wetmoreConfig['countsFile']
wresultsFile = root/wetmoreConfig['resultsFile']
wsampleDataFile = root/wetmoreConfig['sampleDataFile']
wpublishedResultsFile = root/wetmoreConfig['publishedResultsFile']
wpublishedCountsFile = root/wetmoreConfig['publishedCountsFile']
wmapFile = root/wetmoreConfig['mapFile']['filtered']
wpublishedStatsFile = root/wetmoreConfig['publishedStatsFile']

In [None]:
# Maps the condition names used in the mBARq results and in the published
# tables onto one short display name per condition.
# (The duplicated 'Sodium_Fumarate_dibasic' entry was removed; both copies
# mapped to 'Fumarate'.)
contrast_map = {
    'D-Maltose_monohydrate': 'D-Maltose',
    'a-Ketoglutaric_acid_disodium_salt_hydrate': 'a-Ketoglutaric acid',
    'a-Ketoglutaric': 'a-Ketoglutaric acid',
    'Potassium_acetate': 'Acetate',
    'acetate': 'Acetate',
    'CAS_amino_acids': 'CAS amino acids',
    'CAS': 'CAS amino acids',
    'Tween_20': 'Tween',
    'Sodium_L-Lactate': 'L-Lactate',
    'Sodium_D,L-Lactate': 'D,L-Lactate',
    'Sodium_pyruvate': 'Pyruvate',
    'pyruvate': 'Pyruvate',
    'Putrescine_Dihydrochloride': 'Putrescine',
    'N-Acetyl-D-Glucosamine': 'NAG',
    'L-Glutamic_acid_monopotassium_salt_monohydrate': 'L-Glutamic acid',
    'L-Glutamic': 'L-Glutamic acid',
    'Sodium_Fumarate_dibasic': 'Fumarate',
    'L-Malic_acid_disodium_salt_monohydrate': 'L-Malic acid',
    'Sodium_succinate_dibasic_hexahydrate': 'Succinate',
}

## Clean published results

In [None]:
wsampleData = pd.read_csv(wsampleDataFile)

In [None]:
#wsampleData

In [None]:
wsampleData.groupby('condition').filter(lambda x: len(x) > 1).to_csv(wsampleDataFile.with_suffix('.replicates.csv'),
                                                                    index=False)

In [None]:
pcounts = pd.read_table(wpublishedCountsFile).drop(['scaffold', 'strand', 'pos', 'locusId', 'f'], axis=1)
pcounts = pcounts.melt(id_vars=['barcode', 'rcbarcode'], value_name='cnt', var_name='sampleID')
pcounts = pcounts[pcounts.sampleID.str.contains('set1')]
pcounts['sampleID'] = pcounts['sampleID'].str.split('.', expand=True)[1]

In [None]:
wpublishedResults = pd.read_table(wpublishedResultsFile).drop(['locusId', 'desc', 'comb'], axis=1)
wpublishedResults = wpublishedResults.melt(id_vars=['sysName'],  
                                         var_name='contrast', 
                                         value_name='LFC')
wpublishedResults['set'] = wpublishedResults.contrast.str.split(expand=True)[0]
wpublishedResults['contrast'] = wpublishedResults.contrast.str.split(expand=True)[1]
wpublishedStats = pd.read_table(wpublishedStatsFile).drop(['locusId', 'desc'], axis=1)
wpublishedStats = wpublishedStats.melt(id_vars=['sysName'],  
                                         var_name='contrast', 
                                         value_name='tstat')
wpublishedStats['set'] = wpublishedStats.contrast.str.split(expand=True)[0]
wpublishedStats['contrast'] = wpublishedStats.contrast.str.split(expand=True)[1]
wpublishedDf = wpublishedResults.merge(wpublishedStats, on=['sysName', 'contrast', 'set'])
wpublishedDf = wpublishedDf[wpublishedDf.set.str.contains('set1')]
wpublishedDf = wpublishedDf.rename({'sysName':'Name'}, axis=1)
wpublishedDf = (wpublishedDf.groupby(['contrast', 'Name']).agg({'LFC': ['median'], 
                                                               'tstat':[get_bigger_tstat]})
                            .reset_index())
wpublishedDf.columns = ['contrast', 'Name', 'published_LFC', 'tstat']
wpublishedDf['contrast'] =wpublishedDf['contrast'].replace(contrast_map)

In [None]:
wpublishedDf.contrast.unique()

In [None]:
nag_genes = ['Sama_0944', 'Sama_0945', 'Sama_0946', 'Sama_0948']
mannose_genes = ['Sama_0561', 'Sama_0562', 'Sama_0563', 'Sama_0564']
wpublishedDf[(wpublishedDf.Name.isin(mannose_genes)) & (wpublishedDf.contrast == 'D-Mannitol')]

## Look at the count data

In [None]:
#wcnts = pd.read_csv(wcountsFile)
#wcnts_annotated = wcnts[~wcnts.old_locus_tag.isna()]
#wcnts_annotated.to_csv(wcountsFile.with_suffix(".annotated.csv"), index=False)

In [None]:
wcnts_annotated = pd.read_csv(wcountsFile.with_suffix(".annotated.csv"))

In [None]:
ncounts = wcnts_annotated.rename({'barcode':'rcbarcode'}, axis=1)
ncounts = ncounts.melt(id_vars=['rcbarcode', 'old_locus_tag'], var_name='sampleID', value_name='new_count')

In [None]:
cnts = ncounts.merge(pcounts, on=['rcbarcode', 'sampleID'], how='inner')

In [None]:
cnts['logCnt'] = np.log2(cnts['cnt'] +1)
cnts['logNewCnt'] = np.log2(cnts['new_count'] +1)

In [None]:
wcntCor = cnts.groupby('sampleID')[['logNewCnt', 'logCnt']].corr().iloc[0::2,-1].reset_index()
wcntCor.columns = ['contrast', 'comparison', 'R']
wcntCor['R2'] = round(wcntCor['R']**2, 3)

In [None]:
wcntCor

## Look at the results

In [None]:
wresults = pd.read_csv(wresultsFile)
wresults = wresults[~wresults.Name.str.contains(":")]
wresults['contrast'] = wresults['contrast'].replace(contrast_map)

In [None]:
carbon_sources =['D-Glucose', 'D-Maltose', 'a-Ketoglutaric acid', 'Acetate',
       'D-Cellobiose', 'L-Lactate', 'D,L-Lactate', 'Pyruvate',
       'D-Mannitol', 'Tween', 'L-Glutamic acid', 'L-Glutamine', 'Gly-Glu',
       'Gelatin', 'CAS amino acids', 'Putrescine', 'NAG', 'Adenosine',
       'Uridine', 'Thymidine', 'Inosine', 'Cytidine', 'D-Mannose',
       'Sucrose', 'L-Serine']

In [None]:
final_results = wresults[wresults.contrast.isin(carbon_sources)]

In [None]:
#final_results.to_csv(root/'wetmore_2015/results_1/Set1_rra_results_contrasts_edited.csv', index=False)

In [None]:
{'0': 'None', '1': 'mBARq only', '2': 'Original analysis only', '3': 'Both methods'}

In [None]:
# Compare mBARq results with the published results, gene by gene and contrast
# by contrast (only pairs present in both tables are kept).
wcomp = wresults.merge(wpublishedDf, on=['Name', 'contrast'], how='inner')
# mBARq hit: |LFC| >= 0.6 and FDR < 0.05 in either selection direction
wcomp['mbarq_hits'] = ((abs(wcomp.LFC) >= 0.6) & ((wcomp.neg_selection_fdr < 0.05)| (wcomp.pos_selection_fdr < 0.05)))
# Published hit: |t-statistic| >= 4; encoded as 2 so that the sum below gives
# a distinct code (0-3) for every combination of the two calls
wcomp['feba_hits'] = (abs(wcomp.tstat) >= 4).astype(int)*2
wcomp['hits'] = wcomp['mbarq_hits'].astype(int) + wcomp['feba_hits']
# Assign the result back: an in-place replace on `wcomp.hits` acts on a
# temporary Series and is not propagated under pandas Copy-on-Write.
wcomp['hits'] = wcomp['hits'].replace({0: 'None', 1: 'mBARq only', 2: 'Original analysis only', 3: 'Both methods'})

In [None]:
wcomp.sample(5)

In [None]:
def compare_CIs_db(df, contrast):
    """Scatter the mBARq LFC against the published LFC for one contrast.

    Parameters
    ----------
    df : DataFrame with the columns 'contrast', 'LFC', 'published_LFC',
        'hits' and 'Name'.
    contrast : value of ``df.contrast`` selecting the rows to plot.

    Returns
    -------
    The plotly figure; points are coloured by the 'hits' label.
    """
    legend_title = 'Significant change in fitness <br> detected by:'

    # Colour and legend order of the four hit categories
    hit_palette = {'None': colors['grey'],
                   'Both methods': sushi_colors['dgreen'],
                   'mBARq only': sushi_colors['dblue'],
                   'Original analysis only': sushi_colors['orange']}
    hit_order = ['None', 'Original analysis only', 'mBARq only', 'Both methods']
    axis_labels = {'published_LFC': 'LFC (original analysis)',
                   'LFC': 'LFC (mBARq analysis)'}

    # The 'hits' column is renamed so that the legend shows the long title
    subset = df.loc[df.contrast == contrast].rename(columns={'hits': legend_title})

    fig = px.scatter(subset,
                     x='LFC',
                     y='published_LFC',
                     color=legend_title,
                     height=800,
                     width=1000,
                     template='plotly_white',
                     labels=axis_labels,
                     color_discrete_map=hit_palette,
                     hover_data=['Name'],
                     category_orders={legend_title: hit_order})

    marker_style = dict(size=20,
                        line=dict(width=1, color='DarkSlateGrey'),
                        opacity=0.8)
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    fig.update_layout(font={'size': 22})
    return fig


In [None]:
#pio.write_image(fig, figuresDir/'26-09-22-nag_ci.png', width=1000, height=600, scale=2)

In [None]:
fig = compare_CIs_db(wcomp, 'Tween')
fig

In [None]:
fig.write_image(figuresDir/f"{today}_Figure3C.png", format ='png', scale=2 )

In [None]:
wcomp[wcomp.contrast == 'D-Glucose']

In [None]:
wcorr_df = wcomp.groupby('contrast')[['LFC', 'published_LFC']].corr().iloc[0::2,-1].reset_index()
wcorr_df.columns = ['contrast', 'comparison', 'R']

In [None]:
wcorr_df

In [None]:
cor_df = pd.concat([wcorr_df, wcntCor])
cor_df.replace({'logNewCnt': 'Counts'}, inplace=True)

In [None]:
fig = px.box(cor_df, x='comparison', y='R', width=400, height=400, color='comparison',
      color_discrete_map = {'LFC': 'black' ,'Counts': 'black'
                           },
             labels={'comparison': ''},
        category_orders = {'comparison': ['Counts', 'LFC']},
      template='plotly_white', hover_data=['contrast'])
fig.update_yaxes(range=[0, 1.1])
fig.update_layout(showlegend=False, font=dict(size=20))

In [None]:
fig.write_image(figuresDir/f"{today}_Figure3D.png", format ='png', scale=2 )

In [None]:
# Genes of interest inspected in the cell below (duplicate entries for
# Sama_2129 and Sama_2134 removed; first-occurrence order kept).
gois = ["Sama_2129", "Sama_2132", "Sama_2134", "Sama_1943", "Sama_1942", "Sama_1944", "Sama_1946",
        "Sama_2131", "Sama_2133", "Sama_1941", "Sama_1948", "Sama_1947"]
#gois = [""]

In [None]:
wcomp[(wcomp.Name.isin(gois)) & ((wcomp.contrast == 'D-Glucose')) ]

In [None]:
fig = px.bar(wcorr_df, x="contrast", y='R', color='contrast',
      color_discrete_sequence = ['black']*3,
      height=400, width=800, text_auto='.2f', template='plotly_white')
fig.update_layout(showlegend=False)

In [None]:
fig.write_image(figuresDir/f"{today}_Figure3D_v2.png", format ='png', scale=2 )

In [None]:
x = wcomp[wcomp.hits != 'None'].groupby('contrast').hits.value_counts(normalize=True)
#x = wcomp.groupby('contrast').hits.value_counts(normalize=True)
x.name = 'hit_props'
x = x.reset_index()

In [None]:
fig = px.box(x, x='hits', y='hit_props', color='hits', width=500, height=500, 
    labels = {'hit_props': 'Proportion of Hits', 'hits':''},
      color_discrete_map = {'Both methods': sushi_colors['dgreen'] ,'Original analysis only': sushi_colors['orange'], 
                            'mBARq only': sushi_colors['dblue']}, hover_data=['contrast'], 
      template='plotly_white')
fig.update_layout(showlegend=False, font=dict(size=20))

In [None]:
fig.write_image(figuresDir/f"{today}_Figure3E.png", format ='png', scale=2 )

In [None]:
legend_title = 'Significant change in fitness <br> detected by:'
x = x.rename({'hits':legend_title}, axis=1)
fig = px.bar(x, x='contrast', y='hit_props', color=legend_title, 
             labels = {'hit_props': 'Proportion of hits', 'contrast':''},
       color_discrete_map = {'Both methods': sushi_colors['dgreen'] ,'Original analysis only': sushi_colors['orange'], 
                            'mBARq only': sushi_colors['dblue']}, 
      template='plotly_white')
fig 

In [None]:
x.groupby("Significant change in fitness <br> detected by:").hit_props.mean()

In [None]:
fig.write_image(figuresDir/f"{today}_Figure3E_v2.png", format ='png', scale=2 )

# Testing

In [None]:
anot = pd.read_csv(wmapFile)[['Name', 'old_locus_tag']].drop_duplicates().rename({})

In [None]:
wcomp = wcomp.rename({'Name': 'old_locus_tag'}, axis=1)

In [None]:
wcomp = wcomp.merge(anot, on='old_locus_tag', how='left')

In [None]:
# Gene names containing 'his' or 'trp', plus one explicitly added locus.
# `Name` comes from a left merge and can therefore be NaN (a float); such
# entries are skipped because the substring test would raise a TypeError.
goi = {c for c in wcomp.Name if isinstance(c, str) and ('his' in c or 'trp' in c)} | {'SAMA_RS11125'}

In [None]:
goi

In [None]:
wcomp[(wcomp.contrast == 'D-Glucose') & (wcomp.Name.isin(goi))]

In [None]:
df = pd.read_table(root/"wetmore_2015/glucose_results/TestNew_D-Glucose_vs_Time0.gene_summary.txt")

In [None]:
df = df[['id', 'neg|lfc', 'neg|fdr', 'pos|fdr']].copy()
df['contrast'] = 'D-Glucose'

In [None]:
df = df.rename({'id': 'Name', 'neg|lfc': 'LFC', 'neg|fdr': 'neg_selection_fdr', 
                'pos|fdr': 'pos_selection_fdr'}, axis=1)

In [None]:
df = df[~df.Name.str.contains(":")]

In [None]:
df.head()

In [None]:
df.LFC.hist(bins=100)

In [None]:
# Same comparison as `wcomp`, for the separately computed D-Glucose results.
# The thresholds in this test cell are strict (> rather than >=).
wcomp2 = df.merge(wpublishedDf, on=['Name', 'contrast'], how='inner')
wcomp2['New'] = ((abs(wcomp2.LFC) > 0.6) & ((wcomp2.neg_selection_fdr < 0.05)| (wcomp2.pos_selection_fdr < 0.05)))
# Encoded as 0/2 so that New (0/1) + Original gives a distinct code per combination
wcomp2['Original'] = (abs(wcomp2.tstat) > 4).astype(int)*2
wcomp2['Hits'] = wcomp2['New'].astype(int) + wcomp2['Original']
# Assign the result back: an in-place replace on `wcomp2.Hits` acts on a
# temporary Series and is not propagated under pandas Copy-on-Write.
wcomp2['Hits'] = wcomp2['Hits'].replace({0: 'No defect', 1: 'New', 2: 'Original', 3: 'Confirmed'})

In [None]:
wcorr_df2 = wcomp2.groupby('contrast')[['LFC', 'published_LFC']].corr().iloc[0::2,-1].reset_index()
wcorr_df2.columns = ['contrast', 'LFC', 'R']

In [None]:
wcorr_df2

In [None]:
compare_CIs_db(wcomp2, 'D-Glucose')

In [None]:
x = wcomp2[wcomp2.Hits != 'No defect'].groupby('contrast').Hits.value_counts(normalize=True)
x.name = 'hit_props'
x = x.reset_index()

In [None]:
x

# Jasinska 2020

## Load data 

In [None]:
# Paths for the Jasinska et al. 2020 data set, all relative to its own root.
# (Previously referenced the undefined names `jasinskaConfig` and `jroot`.)
jasinska_config = configs['jasinska']
jasinska_root = Path(jasinska_config['root'])
jasinska_sample_data_file = jasinska_root/jasinska_config['sampleDataFile']
jasinska_published_freq_file = jasinska_root/jasinska_config['publishedFrequency']

In [None]:
# Sample sheet: one row per sequencing run.
# `jasinska_sample_data_file` already includes the data-set root, so it is read
# as is (the previous code prefixed it with the undefined name `jroot`).
jasinska_sample_data = pd.read_csv(jasinska_sample_data_file)
jasinska_sample_data = jasinska_sample_data[["Run", "Drug_condition_and_replicate",  "Sample Name"]]

# "<drug condition> r<N>" -> drug condition + replicate label
drugs_and_reps = jasinska_sample_data["Drug_condition_and_replicate"].str.split(" r", expand=True)
drugs_and_reps.columns = ['drug_condition', 'replicate']
drugs_and_reps['replicate'] = drugs_and_reps['replicate'].replace({'1': 'Replicate 1', 
                                                                  '2': 'Replicate 2', 
                                                                  '3': 'Replicate 3'})

# "Sample Name" has four "_"-separated fields
names = jasinska_sample_data['Sample Name'].str.split('_', expand=True)
names.columns = ['exp', 'well', 'passage', 'subsample']
jasinska_sample_data = pd.concat([jasinska_sample_data, drugs_and_reps, names], axis=1)
jasinska_sample_data = jasinska_sample_data.drop(['Drug_condition_and_replicate', 'Sample Name'], axis=1)

# The passage number is the part after the "-" in the passage field
jasinska_sample_data['passage'] = jasinska_sample_data['passage'].str.split("-", expand=True)[1].astype(int)
jasinska_sample_data['generation'] = jasinska_sample_data['passage']*6 # from the paper

no_drug_sample_data = jasinska_sample_data[jasinska_sample_data.drug_condition == 'No drug']

In [None]:
# Selecting only no drug samples to analyze

# no_drug_samples = jasinska_sample_data[jasinska_sample_data.drug_condition == 'No drug'].Run.unique()
# with open(root/"jasinska_2020/no_drug_samples.tsv", 'w') as fo:
#     for s in no_drug_samples:
#         fo.write(f"{s}_mbarq_counts.csv\n")

# tmp_samples = jsampleData[jsampleData.drug_condition == 'Low TMP'].Run.unique()
# with open(root/"jasinska_2020/low_tmp_samples.tsv", 'w') as fo:
#     for s in tmp_samples:
#         fo.write(f"{s}_mbarq_counts.csv\n")
# Running merge on each of these separately

In [None]:
no_drugs_file = jasinska_root/jasinska_config['noDrugsFile']

In [None]:
# Published lineage frequencies; replicate labels are harmonised with the
# 'Replicate N' labels used for the sample sheet.
jasinska_published_freq = pd.read_csv(jasinska_published_freq_file)
jasinska_published_freq['replicate'] = jasinska_published_freq['replicate'].replace({'Rep1': 'Replicate 1', 
                                                                  'Rep2': 'Replicate 2', 
                                                                  'Rep3': 'Replicate 3'})
# Barcodes listed in the published table (previously read from the undefined
# name `published`)
jasinska_pub_barcodes = jasinska_published_freq.barcode.values
# barcode -> colour, as given in the published table
jasinska_color_map = jasinska_published_freq[['barcode', 'color']].set_index('barcode').to_dict()['color']

In [None]:
# Read the merged count table in chunks of one million rows and add the chunks
# up into one barcode x sample table. Within each chunk, barcodes whose counts
# sum to 10 or fewer across all samples are dropped before accumulation.
jasinska_counts = None
chunk_reader = pd.read_csv(no_drugs_file, chunksize=1000000)
for raw_chunk in chunk_reader:
    by_barcode = raw_chunk.set_index("barcode")
    row_totals = by_barcode.sum(axis=1)
    kept = by_barcode[row_totals > 10]
    # fill_value=0: a barcode missing from one side counts as zero
    jasinska_counts = kept if jasinska_counts is None else jasinska_counts.add(kept, fill_value=0)

In [None]:
jasinska_counts = jasinska_counts/jasinska_counts.sum()

In [None]:
jasinska_counts = jasinska_counts.reset_index()

## Graph frequencies over time

In [None]:
hi_freq = jasinska_counts[jasinska_counts.barcode.isin(jasinska_pub_barcodes)]
hi_freq  = hi_freq.melt(id_vars='barcode', value_name = 'Frequency', var_name = 'Run')
hi_freq_full = no_drug_sample_data.merge(hi_freq, on='Run', how='left')
freq_overtime = hi_freq_full.groupby(['barcode', 'replicate', 'generation']).Frequency.mean().reset_index()

In [None]:
def graph_frequency_over_time(df, time_col, freq_col, color_dict, barcode_col='barcode',
                              filter_by_col='', filter_by_value=''):
    """Draw a stacked area chart of barcode frequencies over time.

    Parameters
    ----------
    df : long-format DataFrame with one row per barcode and time point.
    time_col : column used for the x axis.
    freq_col : column holding the frequency of each barcode.
    color_dict : mapping barcode -> colour; every plotted barcode must be a key.
    barcode_col : column identifying the barcode.
    filter_by_col, filter_by_value : when `filter_by_col` is non-empty, only
        rows where that column equals `filter_by_value` are plotted.

    Returns
    -------
    The matplotlib figure.
    """
    if filter_by_col:
        df = df[df[filter_by_col] == filter_by_value]

    # Stack order: barcodes sorted by summed frequency, largest first
    summed = df.groupby([barcode_col])[freq_col].sum()
    barcode_order = list(summed.sort_values(ascending=False).index)

    # Wide table: one row per time point, one column per barcode
    wide = (df[[barcode_col, freq_col, time_col]]
            .drop_duplicates()
            .pivot(index=time_col, columns=barcode_col))
    # Flatten the (value column, barcode) column index down to the barcode
    wide.columns = [pair[1] for pair in list(wide.columns)]
    wide = wide.reset_index()
    wide = wide[[time_col] + barcode_order]

    plotted = wide.columns[1:]
    layers = [wide[name] for name in plotted]
    palette = [color_dict[name] for name in plotted]

    sns.set_style('ticks')
    fig = plt.figure(figsize=(5, 4))
    # Area chart on the figure that was just created
    plt.stackplot(wide[time_col], layers, colors=palette)
    plt.xlabel('Time (generations)')
    plt.ylabel('Lineage frequency')

    return fig

In [None]:
fig = graph_frequency_over_time(freq_overtime, 'generation', 'Frequency', jasinska_color_map, 
                                'barcode', 'replicate', 'Replicate 1')
fig.savefig(figuresDir/f"{today}_Figure4A_i.png", dpi=150, bbox_inches = "tight")

In [None]:
fig = graph_frequency_over_time(freq_overtime, 'generation', 'Frequency', jasinska_color_map, 
                                'barcode', 'replicate', 'Replicate 2')
fig.savefig(figuresDir/f"{today}_Figure4A_ii.png", dpi=150, bbox_inches = "tight")

In [None]:
fig = graph_frequency_over_time(freq_overtime, 'generation', 'Frequency', jasinska_color_map, 
                                'barcode', 'replicate', 'Replicate 3')

fig.savefig(figuresDir/f"{today}_Figure4A_iii.png", dpi=150, bbox_inches = "tight")

## Graph final frequencies

In [None]:
mbarq_result_mean = hi_freq_full.groupby(['barcode', 'replicate']).Frequency.mean().reset_index()
mbarq_result_mean.columns = ['barcode', 'replicate', 'mbarq_av_freq']
mbarq_result_final = (hi_freq_full[hi_freq_full.generation == 420]
                      .groupby(['barcode', 'replicate'])
                      .Frequency.mean()
                      .reset_index())
mbarq_result_final.columns = ['barcode', 'replicate', 'mbarq_final_freq']

mbarq_result = (mbarq_result_mean.merge(mbarq_result_final, on=['barcode', 'replicate'])
                .merge(jasinska_published_freq, on=['barcode', 'replicate']))

In [None]:
# Final lineage frequencies per barcode and replicate, mBARq vs original
# analysis, in long format for a faceted bar chart.
final_freq_df = (mbarq_result[['barcode', 'replicate', 'mbarq_final_freq', 'final_freq']]
                 .melt(id_vars=['barcode', 'replicate'], value_name = 'Frequency', var_name ='method'))
final_freq_df['method'] = final_freq_df['method'].replace({'mbarq_final_freq': 'mBARq',
                                                          'final_freq': 'Original analysis'})
# Colours come from `jasinska_color_map` (barcode -> colour); the previous code
# referenced the undefined name `cdict`.
fig = px.bar(final_freq_df, x="method", y="Frequency", color="barcode", log_y=True,
             color_discrete_map = jasinska_color_map,
       facet_col='replicate', template="plotly_white",
       category_orders = {'replicate': ['Replicate 1', 'Replicate 2', 'Replicate 3']},
             labels = { 'Frequency': 'Lineage frequency','method':''},
            width=600, height=600
      )
# Facet titles read "replicate=Replicate N"; keep only the part after "="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(showlegend=False)

fig.write_image(figuresDir/f"{today}_Figure4C.png", format ='png', scale=2 )

## Graph correlation for average frequencies

In [None]:
mbarq_result[['average_freq', 'mbarq_av_freq']].corr()

In [None]:
0.9991**2

In [None]:
# Average lineage frequency: original analysis (x) vs mBARq (y), log-log.
# Colours come from `jasinska_color_map` (barcode -> colour); the previous code
# referenced the undefined name `cdict`.
fig = px.scatter(mbarq_result, x='average_freq', y='mbarq_av_freq', log_x=True, log_y=True, 
                 labels={'mbarq_av_freq': 'Average lineage frequency (mBARq)',
                        'average_freq': 'Average lineage frequency (original analysis)'},
           color='barcode', template='plotly_white', width=500, height=500,
                 color_discrete_map=jasinska_color_map)
fig.update_layout(showlegend=False)
fig.update_traces(marker=dict(size=14,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
# On log axes plotly expects the range as log10 exponents
fig.update_xaxes(range=[-3.5, 0.0])
fig.update_yaxes(range=[-3.5, 0.0])
fig.write_image(figuresDir/f"{today}_Figure4B.png", format ='png', scale=2 )

## Graph Low TMP Condition

In [None]:
# NOTE(review): process_frequency_df, lowTMPFile and jsampleData are not defined
# in the visible part of this notebook (nor is graph_replicate, which the cells
# below call) -- presumably leftovers from an earlier version; confirm they
# exist before running this section.
low_tmp = process_frequency_df(lowTMPFile, jsampleData)

In [None]:
color_seq4 = [px.colors.qualitative.Light24[0], px.colors.qualitative.Light24[5], 
              px.colors.qualitative.Dark24[19], px.colors.qualitative.Light24[11], 
              px.colors.qualitative.Light24[9]] +  alphabetClrs + ['grey']*997

In [None]:
fig = graph_replicate(low_tmp, color_seq4, '1', f=0.01)

In [None]:
fig

In [None]:
fig = graph_replicate(low_tmp, color_seq4, '2')

In [None]:
fig

In [None]:
fig = graph_replicate(low_tmp, color_seq4, '3')

In [None]:
fig