In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import os
import gzip
from scipy import stats
from sklearn.metrics import roc_auc_score

In [None]:
# Global matplotlib appearance applied to every figure drawn after this cell.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    'figure.figsize':    [4, 3],
    'font.size':         10,
    "axes.labelcolor":   "#000000",
    "axes.linewidth":    1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Qualitative colormaps bound to short names.
cmap1 = plt.cm.tab20
cmap2 = plt.cm.Set3

In [None]:
# Relative paths used later (e.g. "figures/...") resolve against this directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0154")

# Create the output folders. exist_ok=True makes re-running the cell a no-op,
# while real failures (e.g. missing permissions) are raised instead of being
# silently swallowed by a bare except.
for out_dir in ["figures", "tables", "itol"]:
    os.makedirs(out_dir, exist_ok=True)

#### Get correspondence table

In [None]:
# Header-less two-column table; the column names are supplied here.
df_sp_sptaxid = pd.read_csv(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0154/list/gn_sptaxid.txt",
    sep="\t",
    names=['species', 'sptaxid'],
)
# Keep only the second ":"-separated field of each raw species label.
df_sp_sptaxid['species'] = df_sp_sptaxid['species'].map(lambda label: label.split(":")[1])
df_sp_sptaxid


#### Essential gene profile

In [None]:
# Load the combined essentiality matrix; the first column of the file becomes the row index.
mtx_ePath = pd.read_csv(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0154/ePath/Combined_KOExpEG.txt",
    sep="\t",
    index_col=0,
)
mtx_ePath


In [None]:
# Keep the first 7150 rows and the first 31 columns of the matrix.
# NOTE(review): the cut-offs are hard-coded; presumably they exclude trailing
# summary rows/columns of the input file — confirm against the data.
mtx_ePath_main = mtx_ePath.iloc[:7150, :31]
mtx_ePath_main


In [None]:
# Reshape the wide matrix (rows = KO, columns = species) into long form with
# one row per (species, KO) pair. melt() emits the columns one after another,
# so rows come out in the same species-major order as a nested
# "for species / for KO" loop, without one scalar .loc lookup per cell.
df_ePath = (
    mtx_ePath_main
    .rename_axis(index='KO')
    .reset_index()
    .melt(id_vars='KO', var_name='species', value_name='property')
    [['species', 'KO', 'property']]
)

# Plain list-of-rows view, kept under its original name in case it is used elsewhere.
species_KO_essentiality_list = df_ePath.values.tolist()

# Attach the taxid of each species (default inner join on 'species').
df_ePath_sptaxid = pd.merge(df_ePath, df_sp_sptaxid, on = 'species')

# Property: Experimental EG(*), Missing EG(o), Experimental Not EG(x), or Not Present(-)
df_ePath_sptaxid

In [None]:
# Number of distinct taxids present in the essentiality table.
len({taxid for taxid in df_ePath_sptaxid["sptaxid"]})

#### Open prediction results

In [None]:
# Read the gzipped, header-less prediction table. The context manager closes
# the gzip handle once parsing is done instead of leaving it open.
with gzip.open("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0153/result/sp_prob.test.txt.gz") as sp_prob_handle:
    df_future = pd.read_table(
        sp_prob_handle,
        names = ['KO', 'target', 'pred_method', 'selec_method', 'opt_mode', 'Nfeatures', 'species', 'prob'],
    )
df_future

In [None]:
# Annotate every prediction row with the taxid of its species (default inner join).
df_future_sptaxid = df_future.merge(df_sp_sptaxid, on='species')
df_future_sptaxid


#### Comparison

In [None]:
# Pair each prediction with the experimental label recorded for the same (taxid, KO).
df_future_ePath = df_future_sptaxid.merge(
    df_ePath_sptaxid[['KO', 'property', 'sptaxid']],
    on=['sptaxid', 'KO'],
)

# Keep one model configuration (loss target, RF, ANOVA selection, AUC-optimised N)
# and drop rows whose property is '-'.
# reset_index() without drop=True keeps the pre-filter row labels in an "index" column.
df_future_ePath_ext = df_future_ePath[
    df_future_ePath['target'].eq('loss')
    & df_future_ePath['pred_method'].eq('RF')
    & df_future_ePath['selec_method'].eq('ANOVA')
    & df_future_ePath['opt_mode'].eq('N_opt_for_AUC_of_the_OG')
    & df_future_ePath['property'].ne('-')
].reset_index()


In [None]:
# Boolean label: True where the property is '*', False for every other remaining property.
df_future_ePath_ext['essentiality'] = df_future_ePath_ext['property'].eq('*')
df_future_ePath_ext

In [None]:
# Minimum number of essential AND of inessential rows a KO needs in order to be scored.
threshold = 2

ko_ess_iness_auc_list = []

# groupby(sort=False) visits every KO exactly once, in order of first
# appearance, so the result order is deterministic (iterating a set() of
# strings is not) and the full frame is not re-filtered for each KO.
for KO, df_future_ePath_ext_ko in df_future_ePath_ext.groupby('KO', sort=False):
    is_essential = df_future_ePath_ext_ko['essentiality']
    df_future_ePath_ext_essential = df_future_ePath_ext_ko[is_essential]
    df_future_ePath_ext_inessential = df_future_ePath_ext_ko[~is_essential]

    # Skip KOs with too few observations in either class.
    if len(df_future_ePath_ext_essential) < threshold or len(df_future_ePath_ext_inessential) < threshold:
        continue

    ko_ess_iness_auc_list.append([
        KO,
        # Median of the 'prob' column only: DataFrame.median() over the whole
        # frame raises on the string columns with pandas >= 2.0.
        df_future_ePath_ext_essential['prob'].median(),
        df_future_ePath_ext_inessential['prob'].median(),
        # Probability is negated so that a LOW loss probability scores as "essential".
        roc_auc_score(is_essential, -df_future_ePath_ext_ko['prob']),
    ])

df_ko_ess_iness_auc = pd.DataFrame(ko_ess_iness_auc_list, columns = ['KO', 'prob_ess', 'prob_iness', 'AUC'])
df_ko_ess_iness_auc


In [None]:
# Scatter of per-KO median loss probability: essential (x) vs inessential (y) species.
fig = plt.figure(figsize=(2,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

ax.scatter(df_ko_ess_iness_auc["prob_ess"], df_ko_ess_iness_auc["prob_iness"], alpha=0.5, s = 2, color ='#E1BB63')

ax.set_xlim(0.0005,0.5)
ax.set_ylim(0.0005,0.5)
ax.set_xlabel("Loss prob. - Essential")
ax.set_ylabel("Loss prob. - Inessential")
ax.set_xscale('log')
ax.set_yscale('log')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# y = x reference line. Its endpoints are the (strictly positive) axis limits:
# a point at 0 has no position on a log axis, so drawing it would depend on how
# matplotlib handles non-positive values.
x = np.array([0.0005, 0.5])
y = x

ax.plot(x, y, color = "blue", alpha = 0.5, lw = 1)

plt.savefig("figures/prob_ess_iness.pdf",bbox_inches='tight')

# Paired, non-parametric comparison of the two medians across KOs.
stats.wilcoxon(df_ko_ess_iness_auc["prob_ess"], df_ko_ess_iness_auc["prob_iness"])

In [None]:
# Per-KO log2 ratio of median loss probability (inessential / essential),
# computed once and shared by the histogram and the test below.
log2_prob_ratio = np.log2(df_ko_ess_iness_auc['prob_iness'] / df_ko_ess_iness_auc['prob_ess'])

fig = plt.figure(figsize=(1.32,1.32))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

ax.hist(log2_prob_ratio, color='#E1BB63', bins=100, range=(-3,3))

ax.set_xlabel("Probability ratio\n$log_2(inessential/essential)$")
ax.set_ylabel("#OGs")
ax.axvline(0, color="blue", alpha=0.1, lw=0.5)
fig.savefig("figures/logratio_histogram.pdf", bbox_inches='tight')
plt.close()

# One-sample t-test of the log ratios against 0 (i.e. ratio of 1).
stats.ttest_1samp(log2_prob_ratio, 0)

In [None]:
# Histogram of per-KO AUC values with a reference line at 0.5.
# NOTE(review): the following cell saves to the same "figures/AUC_histogram.pdf",
# so after a full run this version of the figure is overwritten.
auc_values = df_ko_ess_iness_auc['AUC']

fig = plt.figure(figsize=(2,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

ax.hist(auc_values, color='#E1BB63', bins=20, range=(0,1))

ax.set_xlabel("AUC")
ax.set_ylabel("#OGs")
ax.axvline(0.5, color="blue", alpha=0.5, lw=1)

fig.savefig("figures/AUC_histogram.pdf", bbox_inches='tight')

# One-sample t-test of the AUC values against 0.5.
stats.ttest_1samp(auc_values, 0.5)

In [None]:
# Compact AUC histogram: light filled bars with a darker step outline on top.
auc_values = df_ko_ess_iness_auc['AUC']

fig = plt.figure(figsize=(1.5,1.5))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

ax.hist(auc_values, color='#F6EEDB', bins=20, range=(0,1))
ax.hist(auc_values, color='#E1BB63', bins=20, range=(0,1), histtype="step", lw=1)

ax.set_xlabel("AUC of predicting\ngene essentiality")
ax.set_ylabel("#OGs")
ax.set_xticks([0,0.5,1])
ax.axvline(0.5, color="#000000", alpha=0.5, lw=0.75)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

fig.savefig("figures/AUC_histogram.pdf", bbox_inches='tight')

# One-sample t-test of the AUC values against 0.5.
stats.ttest_1samp(auc_values, 0.5)

In [None]:
# Display the per-KO summary table (columns: KO, prob_ess, prob_iness, AUC).
df_ko_ess_iness_auc

#### Species included in ePath

In [None]:
# Header-less organism table; the second column ('species') becomes the row index.
df_species = pd.read_csv(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0154/list/organism.txt",
    sep="\t",
    index_col=1,
    names=['T', 'species', 'name', 'taxonomy'],
)
# Rows for the species that appear in the merged prediction/essentiality table, ordered by taxonomy.
df_species.loc[list(set(df_future_ePath['species']))].sort_values(by='taxonomy')
