In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import sys
import os
import pandas as pd
import numpy as np
from Bio import Phylo
import seaborn as sns
from scipy.stats import t, ttest_1samp, wilcoxon, mannwhitneyu, ttest_rel, zscore, spearmanr
import json
from statsmodels.stats.multitest import multipletests
from scipy.stats import gaussian_kde
from sklearn import linear_model
import re
from matplotlib.colors import ListedColormap
import networkx as nx
from scipy.stats import f_oneway

In [None]:
# Global matplotlib style shared by every figure in this notebook.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    'figure.figsize':    [4, 3],
    'font.size':         10,
    "axes.labelcolor":   "#000000",
    "axes.linewidth":    1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Qualitative colormaps kept at module level for later cells.
cmap1 = plt.cm.tab20
cmap2 = plt.cm.Set3
#plt.style.use('default')

In [None]:
# Work inside the experiment directory; all relative paths below ("figures/", "tables/", ...)
# are resolved against it.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151")

# Create the output directories if they are missing.
# exist_ok=True tolerates an already existing directory, while any other failure
# (e.g. missing permissions) is reported instead of being silently swallowed.
for output_dir in ["figures", "tables", "networks"]:
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Classes of KOs

# Two-column relation tables (no header row in the files).
df_path_ko = pd.read_table("tables/path_ko.txt", names = ['Pathway', 'KO'])
df_rn_ko = pd.read_table("tables/rn_ko.txt", names = ['Reaction','KO'])
df_md_ko = pd.read_table("tables/md_ko.txt", names = ['Module','KO'])
df_path_md = pd.read_table("tables/path_md.txt", names = ['Pathway','Module'])

# Load the nested ontology (dicts with 'name' and optional 'children');
# the context manager closes the file handle once parsing is done.
with open("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/json/ko00001.json") as ontology_file:
    ontology = json.load(ontology_file)

# Convert the nested dicts into a Bio.Phylo tree with an explicit stack (no recursion):
# each stack entry pairs an ontology term with the clade that represents it.
root_clade = Phylo.BaseTree.Clade(name=ontology['name'])
stack = [(ontology, root_clade)]

while len(stack) > 0:
    term, clade = stack.pop()
    if ('children' in term.keys()):
        for child in term['children']:
            child_clade = Phylo.BaseTree.Clade(name = child['name'])
            clade.clades.append(child_clade)
            stack.append((child, child_clade))

ontology_tree = Phylo.BaseTree.Tree(root_clade)

# Collect (category, KO) pairs: the categories are the children of the first
# top-level clade, and every terminal below a category whose first token starts
# with 'K' is recorded as a KO of that category.
list_category_ko = []
for clade in ontology_tree.clade.clades[0].clades:
    for tip in clade.get_terminals():
        KO = tip.name.split()[0]
        if (KO[0] == 'K'):
            list_category_ko.append([clade.name, KO])

# A KO can appear several times below the same category; keep one row per pair.
# (The original ran the loop above twice and relied on this de-duplication.)
df_category_ko = pd.DataFrame(list_category_ko, columns = ['category', 'KO'])
df_category_ko = df_category_ko.drop_duplicates().copy()

# Number of categories each KO belongs to. The counts are used as a Series so the
# code does not depend on the column name value_counts() produces, which changed
# from 'KO' to 'count' in pandas 2.0.
ko_category_counts = df_category_ko['KO'].value_counts()
df_ko_count = ko_category_counts.to_frame(name = 'KO')
set_ko_with_unique_category = set(ko_category_counts.index[ko_category_counts == 1])

# Flag KOs that belong to exactly one category and keep them in a separate table.
df_category_ko['unique'] = df_category_ko['KO'].isin(set_ko_with_unique_category)
df_uniquecategory_ko = df_category_ko[df_category_ko['unique']]

# Colours of function categories

colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3', '#555555', '#FC8D62', '#8DA0CB', '#E78AC3', '#66C2A5', '#FC8D62', '#000000']

# Colormap from which the per-category colours below are drawn.
cm_name = 'Set3'
cm = plt.get_cmap(cm_name)

# --- modules: assign each module to the category holding most of its KOs ---
df_category_ko_module = pd.merge(df_category_ko, df_md_ko, on = 'KO').assign(Nko = 1)
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum()
rows_top_category_per_module = df_category_module_count.groupby('Module')['Nko'].idxmax()
df_maxcategory_module = df_category_module_count.loc[rows_top_category_per_module].sort_values('category')
df_maxcategory_module = df_maxcategory_module.reset_index()[['category', 'Module']]

# One integer id and one colormap colour per category, in order of first appearance.
module_categories = df_maxcategory_module['category'].unique()
df_category_color = pd.DataFrame({
    "category":    module_categories,
    "category_id": range(len(module_categories)),
})
df_category_color['color'] = df_category_color['category_id'].map(lambda category_id: mcolor.rgb2hex(cm(category_id)))
#df_category_color

# --- pathways: same dominant-category assignment as for modules ---
df_category_ko_pathway = pd.merge(df_category_ko, df_path_ko, on = 'KO').assign(Nko = 1)
df_category_pathway_count = df_category_ko_pathway.groupby(['category', 'Pathway'], as_index = False).sum()
rows_top_category_per_pathway = df_category_pathway_count.groupby('Pathway')['Nko'].idxmax()
df_maxcategory_pathway = df_category_pathway_count.loc[rows_top_category_per_pathway].sort_values('category')
df_maxcategory_pathway = df_maxcategory_pathway.reset_index()[['category', 'Pathway']]
df_maxcategory_pathway

#### Visualize AUC of cross validation

In [None]:
# One figure and one set of tests per (tree, ancestral-state reconstruction) setting.
tree_acr_settings = [('mlgtdb', 'MPPA'), ('mlgtdb', 'DOWNPASS'), ('nj', 'MPPA'), ('nj', 'DOWNPASS')]

for tree, acr in tree_acr_settings:
    auc_path = f"/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.{tree}.{acr}.txt"
    df_auc_raw = pd.read_table(auc_path, names = ['KO', 'target', 'method', 'auc'])
    # Average repeated rows so each (KO, target, method) has a single AUC.
    df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()

    # visualize: white violins with the individual KOs overlaid as points
    #df_auc_ext = df_auc[df_auc['target'] == target]
    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.set_ylim(-0.1,1.1)
    method_order = ['RF','LR']
    sns.violinplot(data = df_auc, x = 'method', y = 'auc', hue = 'target', linewidth=0.5, order = method_order, palette = ['#FFFFFF', '#FFFFFF'], ax = ax)
    sns.stripplot (data = df_auc, x = 'method', y = 'auc', hue = 'target', linewidth=0,   order = method_order, palette = ['#9FA6F1', '#E1BB63'], size = 1, alpha=.3,jitter=0.3, dodge=True, ax = ax)
    ax.set_ylabel("AUC")
    ax.set_xlabel("")
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.get_legend().remove()
    ax.set_title(f"{tree} {acr}")
    ax.axhline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')
    plt.savefig(f"figures/NK_M0151_crossvalidation_{tree}_{acr}.pdf", bbox_inches='tight')
    plt.close()

    # test if average AUC is 0.5
    for method in ['LR', 'RF']:
        for target in ['gain', 'loss']:
            auc_values = df_auc[(df_auc['method'] == method) & (df_auc['target'] == target)]['auc']
            print(tree, acr, target, method, ttest_1samp(auc_values, 0.5).pvalue)

    # test if AUC of LR and RF is equal (paired by KO)
    for target in ['gain', 'loss']:
        df_auc_lr = df_auc[(df_auc['method'] == 'LR') & (df_auc['target'] == target)]
        df_auc_rf = df_auc[(df_auc['method'] == 'RF') & (df_auc['target'] == target)]
        df_auc_comp = pd.merge(df_auc_lr, df_auc_rf, on = 'KO')
        print(tree, acr, target, 'LR', 'RF', wilcoxon(df_auc_comp['auc_x'], df_auc_comp['auc_y']).pvalue)

    # test if AUC of gain and loss is equal (unpaired)
    for method in ['LR', 'RF']:
        is_method = df_auc['method'] == method
        df_auc_gain = df_auc[is_method & (df_auc['target'] == 'gain')]
        df_auc_loss = df_auc[is_method & (df_auc['target'] == 'loss')]
        print(tree, acr, method, 'gain', 'loss', mannwhitneyu(df_auc_gain.auc, df_auc_loss.auc).pvalue)

In [None]:
# Reload the reference setting and show how many KOs have an AUC per (target, method).
tree = 'mlgtdb'
acr  = 'MPPA'
auc_path = f"/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.{tree}.{acr}.txt"
df_auc_raw = pd.read_table(auc_path, names = ['KO', 'target', 'method', 'auc'])
df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()

df_auc[["target", "method"]].value_counts()

#### Relation between gain predictability and loss predictability

In [None]:
# Relation between gain predictability and loss predictability:
# for every setting and prediction method, scatter the per-KO AUC of gain against
# the AUC of loss, coloured by local point density, with a least-squares line.
for tree, acr in [('mlgtdb', 'MPPA'), ('mlgtdb', 'DOWNPASS'), ('nj', 'MPPA'), ('nj', 'DOWNPASS')]:
    df_auc_raw = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc."+tree+"."+acr+".txt", names = ['KO', 'target', 'method', 'auc'])
    df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()

    for method in ['LR', 'RF']:
        # After the merge, auc_x is the gain AUC and auc_y the loss AUC of the same KO.
        df_auc_gain_loss = pd.merge(df_auc[(df_auc['method']==method) & (df_auc['target']=='gain')], df_auc[(df_auc['method']==method) & (df_auc['target']=='loss')], on = 'KO')

        # Work on plain arrays so the reordering below is positional
        # and does not depend on the DataFrame index labels.
        x = df_auc_gain_loss.auc_x.to_numpy()
        y = df_auc_gain_loss.auc_y.to_numpy()
        xy = np.vstack([x,y])
        z = gaussian_kde(xy)(xy)
        # Sort by density so the densest points are drawn last (on top).
        idx = z.argsort()
        x, y, z = x[idx], y[idx], z[idx]

        clf = linear_model.LinearRegression()
        x2 = x.reshape(-1, 1)   # scikit-learn expects a 2-D feature matrix
        clf.fit(x2, y)

        print(tree+" "+acr+" "+method)
        # clf.coef_ is the slope of the fitted line (not a correlation coefficient).
        print("regression coeff= ", clf.coef_)
        print("intercept= ", clf.intercept_)
        print("score= ", clf.score(x2, y))
        print("Spearman r = ",spearmanr(x, y))

        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        #ax.plot([-200, 500], [-200, 500], 'k-', lw=0.5, alpha = 0.5, color = '#555555')
        ax.scatter(x = x, y = y, c = z, s = 1, alpha = 0.5)

        # plot the regression line
        ax.plot(x, clf.predict(x2), color = '#000000', alpha =0.5)

        ax.set_xlim(-0.01,1.01)
        ax.set_ylim(-0.01,1.01)
        #ax.set_xscale("log")
        #ax.set_yscale("log")
        ax.set_xlabel("AUC of gain")
        ax.set_ylabel("AUC of loss")
        #ax.set_xticks([0,100,200,300])
        ax.set_title(tree+" "+acr+" "+method)
        plt.savefig("figures/NK_M0151_AUCgain_AUCloss_"+tree+"_"+acr+"_"+method+".pdf",bbox_inches = 'tight')
        plt.close()

#### Relation between AUC and number of gains/losses

In [None]:
tree = "mlgtdb"
acr = "MPPA"

auc_path = f"/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.{tree}.{acr}.txt"
df_auc_raw = pd.read_table(auc_path, names = ['KO', 'target', 'method', 'auc'])
df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()

# Attach the per-KO event counts; the left join keeps every AUC row.
# NOTE(review): KOs absent from the count table would get NaN counts here — confirm none occur.
df_ko_Ngain_Nloss = pd.read_table("../NK_M0150/table/ko_Ngain_Nloss.mlgtdb_MPPA.txt")
df_ko_auc_Ngain_Nloss = pd.merge(df_auc, df_ko_Ngain_Nloss, on = "KO", how = "left")

pred_method = "RF"   # not used below

for target in ["gain", "loss"]:
    for method in ["LR", "RF"]:

        is_selected = (df_ko_auc_Ngain_Nloss["target"] == target) & (df_ko_auc_Ngain_Nloss["method"] == method)
        df_ko_auc_Ngain_Nloss_ext = df_ko_auc_Ngain_Nloss[is_selected]

        # x: number of events of this target type (column "Ngain" / "Nloss"), y: AUC
        x = np.array(df_ko_auc_Ngain_Nloss_ext[f"N{target}"])
        y = np.array(df_ko_auc_Ngain_Nloss_ext.auc)

        # Point density, used as colour; densest points are drawn last.
        xy = np.vstack([x, y])
        z = gaussian_kde(xy)(xy)
        idx = z.argsort()
        x = x[idx]
        y = y[idx]
        z = z[idx]

        clf = linear_model.LinearRegression()
        x2 = [[value] for value in x]
        clf.fit(x2, y)

        # The printed labels read: regression coefficient, intercept, coefficient of determination.
        print(target+" "+method)
        print("回帰係数= ", clf.coef_)
        print("切片= ", clf.intercept_)
        print("決定係数= ", clf.score(x2, y))
        print("Spearman r = ",spearmanr(x2, y))

        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        #ax.plot([-200, 500], [-200, 500], 'k-', lw=0.5, alpha = 0.5, color = '#555555')
        ax.scatter(x = x, y = y, c = z, s = 1, alpha = 0.5)

        # regression line (disabled)
        #ax.set_title('Linear regression')
        #ax.plot(x2, clf.predict(x2), color = '#000000', alpha =0.5)

        #ax.set_xlim(-0.01,1.01)
        ax.set_ylim(-0.01,1.01)
        #ax.set_xscale("log")
        #ax.set_yscale("log")
        ax.set_xlabel(f"# {target} events")
        ax.set_ylabel(f"AUC of {target}")
        #ax.set_xticks([0,100,200,300])
        ax.set_title(f"{target} {method}")
        plt.savefig(f"figures/NK_M0151_N{target}_AUC_{tree}_{acr}_{method}.pdf", bbox_inches = 'tight')
        plt.close()

#### Relation between function and AUC

In [None]:

tree   = 'mlgtdb'
acr    = 'MPPA'

# Categories compared in the one-way ANOVA below.
anova_categories = [
    "09101 Carbohydrate metabolism",
    "09102 Energy metabolism",
    "09103 Lipid metabolism",
    "09104 Nucleotide metabolism",
    "09105 Amino acid metabolism",
    "09106 Metabolism of other amino acids",
    "09107 Glycan biosynthesis and metabolism",
    "09108 Metabolism of cofactors and vitamins",
    "09109 Metabolism of terpenoids and polyketides",
    "09110 Biosynthesis of other secondary metabolites",
    "09111 Xenobiotics biodegradation and metabolism",
]

test_result_list = []

# The AUC table does not depend on the loop variables, so it is loaded once.
df_auc_raw = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc."+tree+"."+acr+".txt", names = ['KO', 'target', 'method', 'auc'])
df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()
df_category_auc =  pd.merge(df_category_ko, df_auc, on = 'KO')

for target, prediction, color in [('gain', 'LR', '#9FA6F1'), ('gain', 'RF', '#9FA6F1'), ('loss', 'LR', '#E1BB63'), ('loss', 'RF', '#E1BB63')]:

    # Categories are ordered by decreasing median AUC of *gain* for this method, for both
    # the gain and the loss figure. Only the 'auc' column is aggregated: taking the median
    # of the string columns raises a TypeError on pandas >= 2.0.
    df_gain_this_method = df_category_auc[(df_category_auc['target'] == "gain") & (df_category_auc['method'] == prediction)]
    category_order = list(reversed(list(df_gain_this_method.groupby("category")["auc"].median().sort_values().index)))
    #category_order = list(reversed(list(df_category_auc[(df_category_auc['target'] == target) & (df_category_auc['method'] == prediction)].groupby("category", as_index=False).median().sort_values("auc")["category"])))

    df_category_auc_ext = df_category_auc[(df_category_auc['target'] == target) & (df_category_auc['method'] == prediction)]
    fig = plt.figure(figsize=(5,1.8))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    sns.violinplot(data = df_category_auc_ext, x = 'category', y = 'auc', order = category_order, linewidth=0.5, orient = 'v', color = '#FFFFFF')
    sns.stripplot (data = df_category_auc_ext, x = 'category', y = 'auc', hue = 'target', order = category_order, linewidth=0, orient = 'v', size = 1, alpha=.5,jitter=0.3, dodge=True, palette=[color])
    #ax.tick_params(axis='x', labelrotation= 90)
    ax.get_legend().remove()
    ax.set_ylabel('AUC')
    ax.set_xlabel('KEGG category')
    ax.tick_params(axis='x', labelrotation= 90)
    ax.set_ylim(-0.1,1.1)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    ax.axhline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')
    ax.set_title(target + "/" + prediction)
    plt.savefig("figures/NK_M0151_by_category_"+target + "_" + prediction+".pdf",bbox_inches='tight')
    plt.close()

    # test if average AUC is 0.5 (the median AUC is stored alongside the p-value)
    for category in sorted(list(set(df_category_auc_ext["category"]))):
        category_auc = df_category_auc_ext[df_category_auc_ext['category']==category]['auc']
        test_result_list.append([tree, acr, target, prediction, category, np.median(category_auc), ttest_1samp(category_auc, 0.5).pvalue])

    # one-way ANOVA: does the AUC differ between the categories listed above?
    F_test_result = f_oneway(*[
        list(df_category_auc_ext[df_category_auc_ext["category"]==category].auc)
        for category in anova_categories
    ])

    print("F-test result", target, prediction, F_test_result.statistic, F_test_result.pvalue, sep = "\t")

# Benjamini-Hochberg correction over all (target, method, category) tests at once.
df_test_result = pd.DataFrame(test_result_list, columns=["Tree", "ACR", "Target", "Prediction", "Function", "Median AUC", "p"])
df_test_result["q"] = list(multipletests(list(df_test_result["p"]), method = "fdr_bh")[1])
df_test_result["sig"] = ['*' if q<0.05 else '' for q in df_test_result.q]
df_test_result.to_csv("tables/NK_M0151_by_category.txt", index=False, sep = "\t")
df_test_result

#### compare over- and under-sampling

In [None]:
# Compare over- and under-sampling: load the AUCs and average repeated rows
# so that each (KO, target, method) has one value.
sampling_auc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.sampling.filtered.txt"
df_auc_raw = pd.read_table(sampling_auc_path, names = ['KO','target', 'method', 'auc'])
df_auc = df_auc_raw.groupby(['KO', 'target', 'method'], as_index = False).mean()
df_auc

In [None]:
# test if average AUC is 0.5, once for every sampling variant of both methods.
# (The original list contained 'RF_none' twice and never tested 'RF_under'.)
# `tree` and `acr` are only printed as labels; they are set in an earlier cell.
for method in ['LR_none', 'LR_over', 'LR_under', 'RF_none', 'RF_over', 'RF_under']:
    for target in ['gain', 'loss']:
        print(tree, acr, target, method, ttest_1samp(df_auc[(df_auc['method'] == method) & (df_auc['target'] == target)]['auc'], 0.5).pvalue)

In [None]:
# AUC distribution for every method/sampling combination:
# white violins with the individual KOs overlaid, split by target.
sampling_method_order = ['RF_none','RF_under','RF_over','LR_none', 'LR_under', 'LR_over']

fig = plt.figure(figsize=(6, 2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.set_ylim(-0.1,1.1)
sns.violinplot(data = df_auc, x = 'method', y = 'auc', hue = 'target', linewidth=0.5, order = sampling_method_order, palette = ['#FFFFFF'], orient = 'v', ax = ax)
sns.stripplot (data = df_auc, x = 'method', y = 'auc', hue = 'target', linewidth=0, order = sampling_method_order, palette = ['#9FA6F1', '#E1BB63'], size = 1, alpha=.3,jitter=0.3, dodge=True, orient = 'v', ax = ax)
ax.get_legend().remove()
ax.set_ylabel('AUC')
ax.set_ylim(-0.1,1.1)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.axhline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')
# Tick labels with the sampling scheme as a subscript, in the same order as above.
ax.set_xticklabels(['RF$_{none}$','RF$_{under}$','RF$_{over}$', 'LR$_{none}$', 'LR$_{under}$', 'LR$_{over}$'])
plt.savefig("figures/NK_M0151_underover.pdf",bbox_inches='tight')
plt.close()

#### Unsupervised clustering of OGs

In [None]:
# Unsupervised clustering of OGs: AUCs per number of clusters,
# averaged so each (KO, target, method, Ncluster) has one value.
clustering_auc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.clustering.filtered.txt"
df_auc_raw = pd.read_table(clustering_auc_path, names = ['KO','target', 'method', 'auc', 'Ncluster'])
df_auc = df_auc_raw.groupby(['KO', 'target', 'method', 'Ncluster'], as_index = False).mean()
df_auc

#### calculate AUC for each number of clusters

In [None]:


# One horizontal violin/strip plot per (method, target): AUC on x, number of clusters on y.
for method, target, color in [('LR','loss','#E1BB63'), ('LR','gain','#9FA6F1'), ('RF','loss','#E1BB63'), ('RF','gain','#9FA6F1')]:

    df_auc_ext = df_auc[(df_auc['method']==method) & (df_auc['target']==target)]

    # clustering
    fig = plt.figure(figsize=(2,7))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    # The y axis is categorical (Ncluster) here, so no numeric y-limits are set on it;
    # the AUC range is applied to the x axis below.
    sns.violinplot(data = df_auc_ext, y = 'Ncluster', x = 'auc', linewidth=0.5, order = [1,2,3,4,5,6,7,8,9,10,20,30,40,50], color = '#FFFFFF', orient = 'h')
    sns.stripplot (data = df_auc_ext, y = 'Ncluster', x = 'auc', linewidth=0,   order = [1,2,3,4,5,6,7,8,9,10,20,30,40,50], color = color, size = 1, alpha=.3, jitter=0.3, dodge=True, orient = 'h')

    # Median AUC per number of clusters. Only 'auc' is aggregated: the median of the
    # string columns (KO, target, method) raises a TypeError on pandas >= 2.0.
    df_Ncluster_medAUC = df_auc_ext.groupby("Ncluster", as_index=False)["auc"].median()
    #ax.plot(df_Ncluster_medAUC.auc)

    #ax.get_legend().remove()
    ax.set_xlabel('AUC')
    ax.set_xlim(-0.1,1.1)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    ax.axvline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')

    ax.set_title(target+" "+method)

    plt.savefig("figures/NK_M0151_clustering_"+target+"_"+method+".pdf",bbox_inches='tight')
    plt.close()

#### Number of OGs that are significantly predictable

In [None]:
threshold = 0.10

# x position used for the result without clustering; the plotting cells label
# the tick at 70 as "all".
NCLUSTER_WITHOUT_CLUSTERING = 70

# p-values with clustering: median over repeated rows per (KO, target, method, Ncluster).
df_pvalue_raw = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_p.mlgtdb.MPPA.clustering.filtered.txt", names = ['KO','target', 'method', 'p', 'Ncluster'])
df_pvalue = df_pvalue_raw.groupby(['KO', 'target', 'method', 'Ncluster'], as_index = False).median()

# p-values without clustering. This table does not depend on the loop variables,
# so it is read and aggregated once instead of once per (target, method).
df_pvalue_raw_without_clustering = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_p.mlgtdb.MPPA.txt", names = ['KO','target', 'method', 'p'])
df_pvalue_without_clustering = df_pvalue_raw_without_clustering.groupby(['KO', 'target', 'method'], as_index = False).median()

# For every (target, method, Ncluster): number of KOs with BH-adjusted q < threshold.
# The correction is applied separately within each combination.
list_target_method_Ncluster_Npredictable = []
for target in ["gain","loss"]:
    for method in ["LR", "RF"]:
        for Ncluster in [1,2,3,4,5,6,7,8,9,10,20,30,40,50]:
            df_pvalue_ext = df_pvalue[(df_pvalue["target"]==target) & (df_pvalue["method"]==method) & (df_pvalue["Ncluster"] == Ncluster)]
            df_pvalue_ext = df_pvalue_ext.reset_index()
            df_pvalue_ext["q"] = list(multipletests(list(df_pvalue_ext.loc[:,"p"]), method = "fdr_bh")[1])
            list_target_method_Ncluster_Npredictable.append([target, method, Ncluster, len(df_pvalue_ext[df_pvalue_ext["q"] < threshold])])

        df_pvalue_without_clustering_ext = df_pvalue_without_clustering[(df_pvalue_without_clustering["target"]==target) & (df_pvalue_without_clustering["method"]==method)]
        df_pvalue_without_clustering_ext = df_pvalue_without_clustering_ext.reset_index()
        df_pvalue_without_clustering_ext["q"] = list(multipletests(list(df_pvalue_without_clustering_ext.loc[:,"p"]), method = "fdr_bh")[1])
        list_target_method_Ncluster_Npredictable.append([target, method, NCLUSTER_WITHOUT_CLUSTERING, sum(df_pvalue_without_clustering_ext["q"] < threshold)])

df_target_method_Ncluster_Npredictable = pd.DataFrame(list_target_method_Ncluster_Npredictable, columns = ["target", "method", "Ncluster", "Npredictable"])
df_target_method_Ncluster_Npredictable

In [None]:
# Number of predictable OGs against the number of merged features,
# one figure per method with gain and loss drawn in their own colours.
for method in ["LR", "RF"]:
    fig = plt.figure(figsize=(3,3))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    for target_name, point_color in [("gain", '#9FA6F1'), ("loss", '#E1BB63')]:
        is_selected = (df_target_method_Ncluster_Npredictable["target"]==target_name) & (df_target_method_Ncluster_Npredictable["method"]==method)
        df_target_method_Ncluster_Npredictable_ext = df_target_method_Ncluster_Npredictable[is_selected]
        ax.scatter(
            x = df_target_method_Ncluster_Npredictable_ext["Ncluster"],
            y = df_target_method_Ncluster_Npredictable_ext["Npredictable"],
            color = point_color, s = 20, alpha = 0.8,
        )

    ax.set_title(method)

    #ax.set_xscale("log")
    ax.set_xlabel("#merged features")
    ax.set_ylabel("#predictable OGs")
    ax.set_xlim(0,75)
    ax.set_ylim(0, 1800)
    # The tick at 70 is relabelled "all".
    ax.set_xticks([0,10,20,30,40,50,70])
    ax.set_xticklabels([0,10,20,30,40,50,"all"])
    plt.savefig(f"figures/NK_M0151_clustering_{method}_Npredictable.pdf", bbox_inches='tight')

    #plt.show()
    plt.close()

In [None]:
# Zoomed-in version of the plot above: only 1-5 merged features and small counts.
for method in ["LR", "RF"]:
    fig = plt.figure(figsize=(1,1))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    for target_name, point_color in [("gain", '#9FA6F1'), ("loss", '#E1BB63')]:
        is_selected = (df_target_method_Ncluster_Npredictable["target"]==target_name) & (df_target_method_Ncluster_Npredictable["method"]==method)
        df_target_method_Ncluster_Npredictable_ext = df_target_method_Ncluster_Npredictable[is_selected]
        ax.scatter(
            x = df_target_method_Ncluster_Npredictable_ext["Ncluster"],
            y = df_target_method_Ncluster_Npredictable_ext["Npredictable"],
            color = point_color, s = 10, alpha = 0.8,
        )

    ax.set_title(method)

    #ax.set_xscale("log")
    ax.set_xlabel("#merged features")
    ax.set_ylabel("#predictable OGs")
    shown_nclusters = [1,2,3,4,5]
    ax.set_xticks(shown_nclusters)
    ax.set_xticklabels(shown_nclusters)

    ax.set_xlim(0.5, 5.5)
    ax.set_ylim(-5,80)
    plt.savefig(f"figures/NK_M0151_clustering_{method}_Npredictable_expanded.pdf", bbox_inches='tight')

    #plt.show()
    plt.close()

##### Functional enrichment analysis of predictable OGs

In [None]:
# Display the enrichment result table for gain / LR.
pd.read_table(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/analysis/result/enrichment_gain_LR.txt"
)

In [None]:
# List the predictable OGs (BH-adjusted q < threshold) without clustering
# and write one table per (target, method).
threshold = 0.05

# The p-value table does not depend on the loop variables,
# so it is read and aggregated (median per KO/target/method) once.
df_pvalue_raw_without_clustering = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_p.mlgtdb.MPPA.txt", names = ['KO','target', 'method', 'p'])
df_pvalue_without_clustering = df_pvalue_raw_without_clustering.groupby(['KO', 'target', 'method'], as_index = False).median()

for target in ["gain","loss"]:
    for method in ["LR", "RF"]:
        df_pvalue_without_clustering_ext = df_pvalue_without_clustering[(df_pvalue_without_clustering["target"]==target) & (df_pvalue_without_clustering["method"]==method)]
        df_pvalue_without_clustering_ext = df_pvalue_without_clustering_ext.reset_index()
        # Multiple-testing correction within this (target, method) combination.
        df_pvalue_without_clustering_ext["q"] = list(multipletests(list(df_pvalue_without_clustering_ext.loc[:,"p"]), method = "fdr_bh")[1])
        df_pvalue_without_clustering_ext_predictable = df_pvalue_without_clustering_ext[df_pvalue_without_clustering_ext["q"] < threshold]
        df_pvalue_without_clustering_ext_predictable.to_csv("tables/ko_p.predictable.unclustered_"+target+"_"+method+".txt", sep = "\t", index = False)

In [None]:
# Settings for the KOs examined in the following cells.
target = "gain"
color  = "#9FA6F1"
method = "LR"
Ncluster = 1

# KOs whose gain is predictable (q < 0.10) by LR with a single cluster.
is_selected = (df_pvalue["target"]==target) & (df_pvalue["method"]==method) & (df_pvalue["Ncluster"] == Ncluster)
df_pvalue_ext = df_pvalue[is_selected].reset_index()
df_pvalue_ext["q"] = list(multipletests(list(df_pvalue_ext.loc[:,"p"]), method = "fdr_bh")[1])
df_pvalue_gain_predictable_one_cluster = df_pvalue_ext[df_pvalue_ext["q"]<0.10]

# For each of those KOs, find the row with the highest AUC among the gain/LR rows of df_auc.
df_auc_gain_lr = df_auc[(df_auc['target']==target) & (df_auc['method']==method)]
df_auc_gain_predictable_one_cluster = pd.merge(df_pvalue_gain_predictable_one_cluster['KO'], df_auc_gain_lr, on = 'KO', how = 'left')
rows_max_auc = df_auc_gain_predictable_one_cluster.groupby("KO")["auc"].idxmax()
df_auc_gain_predictable_one_cluster_maxauc = df_auc_gain_predictable_one_cluster.loc[rows_max_auc]

# Keep the KOs whose best AUC is reached with Ncluster == 1.
KO_of_interest = df_auc_gain_predictable_one_cluster_maxauc[df_auc_gain_predictable_one_cluster_maxauc["Ncluster"]==1].KO

In [None]:
# Per-node table with columns KO, target, node, Ngenes, Occurrence (file has no header row).
df_occurrence = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_target_gn_Ngenes_bin.txt", names = ['KO', 'target', 'node', 'Ngenes', 'Occurrence'])

# Restrict to the KOs of interest and to the 'gain' rows.
df_KO_Ngenes = pd.merge(KO_of_interest, df_occurrence[df_occurrence["target"]=='gain'], on = 'KO')

# Z-score of Ngenes within each KO. Only the 'Ngenes' column is transformed:
# applying zscore to the string columns raises a TypeError on pandas >= 2.0.
df_KO_Ngenes["z-Ngenes"] = df_KO_Ngenes.groupby('KO')['Ngenes'].transform(zscore)

# Rows with Occurrence == 1, annotated with the KO description.
df_KO_Ngenes_gained = df_KO_Ngenes[df_KO_Ngenes["Occurrence"] == 1]
df_ko_desc = pd.read_table("/Users/konnonaoki/GoogleDrive/Research/KonnoNaoki/repositories/handyenrich/ref/class_description/ko.kegg.txt", names = ["ko", "description"])
# The 'ko' column has the form "<prefix>:<KO id>"; keep the part after the colon.
df_ko_desc["KO"] = [ko.split(":")[1] for ko in df_ko_desc['ko']]
df_KO_Ngenes_gained = pd.merge(df_KO_Ngenes_gained,df_ko_desc,on = 'KO')
df_KO_Ngenes_gained.to_csv("tables/KO_Ngenes_gained.txt", sep = '\t', index = False)

# Add the phylum of each node (after the table has been written).
df_node_phylum = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/table/node_phylum.mlgtdb_MPPA.txt", names = ["node", "phylum"])
df_KO_Ngenes_gained = pd.merge(df_KO_Ngenes_gained,df_node_phylum,on = 'node')
df_KO_Ngenes_gained

In [None]:
# Mean of the numeric columns per KO over the gained rows, sorted by mean Ngenes.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns (node, description, phylum, ...) raises a TypeError.
df_KO_meanNgenes_gained = df_KO_Ngenes_gained.groupby("KO", as_index=False).mean(numeric_only=True).sort_values("Ngenes")
# Re-attach the KO descriptions, which the numeric aggregation drops.
df_KO_meanNgenes_gained = pd.merge(df_KO_meanNgenes_gained,df_ko_desc,on = 'KO')

In [None]:
# Comma-separated phylum table; its 'Phylum' and 'color' columns are used for the strip plots below.
phylum_table_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/table/phylum_Nspecies_color_name.txt"
df_phylum_Nspecies_color_name = pd.read_table(phylum_table_path, sep = ',')

In [None]:
# Histogram of the mean Ngenes per node.
df_ancnode_Ngenes = df_KO_Ngenes.loc[:, ['node','Ngenes']].groupby('node', as_index=False).mean()

ngenes_range = (0, 1200)
fig = plt.figure(figsize=(3,1.5))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.hist(df_ancnode_Ngenes['Ngenes'], range=ngenes_range, bins = 50, color = '#00FFAA', alpha =0.5)
ax.set_ylabel('#ancestral species')
ax.set_xlim(*ngenes_range)
plt.savefig("figures/NK_M0151_histogram_Ngenes_ancestors.pdf",bbox_inches='tight')
plt.close()

In [None]:
# Row order of the strip plot: KOs sorted by their mean Ngenes over all rows of df_KO_Ngenes.
# Only 'Ngenes' is averaged: the mean of the string columns raises a TypeError on pandas >= 2.0.
ko_order = list(df_KO_Ngenes.groupby("KO")["Ngenes"].mean().sort_values().index)

# Per-KO summary of the gained rows, re-ordered to the row order above so that the mean
# line and the right-hand labels line up with the strip-plot rows. KOs missing from the
# summary become NaN (gap in the line, empty label).
df_gained_by_row = (
    df_KO_meanNgenes_gained
    .drop_duplicates("KO")
    .set_index("KO")
    .reindex(ko_order)
)

fig = plt.figure(figsize=(3,10))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
sns.stripplot (
    data = df_KO_Ngenes_gained, 
    y = 'KO', 
    x = 'Ngenes', 
    hue = 'phylum', 
    order = ko_order, 
    hue_order = list(df_phylum_Nspecies_color_name.Phylum)   + ['upstream'], 
    palette   = list(df_phylum_Nspecies_color_name['color']) + ['#EEEEEE'] , 
    linewidth=0, 
    orient = 'h', 
    size = 2, 
    alpha=1,
    jitter=0.1
    )

# Mean Ngenes of the gained rows, one vertex per strip-plot row.
ax.plot(df_gained_by_row['Ngenes'].to_numpy(), ko_order, color = color, linewidth = 1, alpha = 0.5)

ax.legend(bbox_to_anchor=(-1.55, 1), loc='upper left')

# Inverted categorical axis covering every row (first KO at the top).
row_limits = (len(ko_order) - 0.5, -0.5)
ax.set_ylim(*row_limits)
ax.set_xlim(0,1200)
ax.set_xlabel("#possessed OGs")

# Twin axis carrying the KO descriptions as row labels on the right.
ax2 = ax.twinx()
ax2.set_ylim(*row_limits)
ax2.set_yticks(ax.get_yticks())
ax2.set_yticklabels(df_gained_by_row['description'].fillna("")) 

# `method` and `color` come from the cell that selects the KOs of interest.
plt.savefig("figures/NK_M0151_clustering_"+method+"_total_Ngenes.pdf",bbox_inches='tight')
plt.close()

In [None]:
# Row order of the strip plot: KOs sorted by their mean Ngenes over all rows of df_KO_Ngenes.
# Only 'Ngenes' is averaged: the mean of the string columns raises a TypeError on pandas >= 2.0.
ko_order = list(df_KO_Ngenes.groupby("KO")["Ngenes"].mean().sort_values().index)

# KO descriptions re-ordered to the row order above, so the right-hand labels line up
# with the strip-plot rows (KOs without a description get an empty label).
ko_descriptions = (
    df_KO_meanNgenes_gained
    .drop_duplicates("KO")
    .set_index("KO")["description"]
    .reindex(ko_order)
    .fillna("")
)

fig = plt.figure(figsize=(3,10))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
sns.stripplot (
    data = df_KO_Ngenes_gained, 
    y = 'KO', 
    x = 'z-Ngenes', 
    hue = 'phylum', 
    order = ko_order, 
    hue_order = list(df_phylum_Nspecies_color_name.Phylum)   + ['upstream'], 
    palette   = list(df_phylum_Nspecies_color_name['color']) + ['#EEEEEE'] , 
    linewidth=0, 
    orient = 'h', 
    size = 2, 
    alpha=1,
    jitter=0.1
    )
ax.legend(bbox_to_anchor=(-1.55, 1), loc='upper left')
ax.axvline(0, linewidth = 1, alpha = 0.5, color = '#555555')
ax.set_xlim(-2.5,2.5)
ax.set_xlabel("Z-score of #possessed OGs")

# Twin axis carrying the KO descriptions as row labels on the right;
# its inverted limits cover every row (first KO at the top).
ax2 = ax.twinx()
ax2.set_ylim(len(ko_order) - 0.5, -0.5)
ax2.set_yticks(ax.get_yticks())
ax2.set_yticklabels(ko_descriptions) 

# `method` comes from the cell that selects the KOs of interest.
plt.savefig("figures/NK_M0151_clustering_"+method+"_total_Z-Ngenes.pdf",bbox_inches='tight')
plt.close()

#### Cross-phylum validation

In [None]:
# Cross-phylum validation
crossphylum_auc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.crossphylum.filtered.txt"
df_auc = pd.read_table(crossphylum_auc_path, names = ['KO', 'target', 'method', 'auc'])

# The raw 'method' field is underscore-separated: the first token is the prediction
# method and everything from the fourth token onwards is the phylum name
# (which may itself contain underscores).
method_tokens = [raw_method.split("_") for raw_method in df_auc.method]
df_auc['Phylum'] = ["_".join(tokens[3:]) for tokens in method_tokens]
df_auc['method'] = [tokens[0] for tokens in method_tokens]
df_auc

In [None]:
# Phyla shown in the figures and compared in the tests, in plotting order.
phylum_order = ["Proteobacteria", "Actinobacteriota", "Firmicutes", "Bacteroidota", "Firmicutes_A", "Cyanobacteria", "Campylobacterota", "Spirochaetota", "Firmicutes_I", "Deinococcota"]

list_test_result=[]
for method, target, color in [('LR','loss','#E1BB63'), ('LR','gain','#9FA6F1'), ('RF','loss','#E1BB63'), ('RF','gain','#9FA6F1')]:

    df_auc_ext = df_auc[(df_auc['method']==method) & (df_auc['target']==target)].reset_index()

    # AUC distribution per phylum: white violins with the individual KOs overlaid.
    fig = plt.figure(figsize=(5,1.8))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    sns.violinplot(data = df_auc_ext, x = 'Phylum', y = 'auc', linewidth=0.5, order = phylum_order, color = '#FFFFFF', orient = 'v', ax = ax)
    sns.stripplot (data = df_auc_ext, x = 'Phylum', y = 'auc', linewidth=0,   order = phylum_order, color = color, size = 1, alpha=.3, jitter=0.3, dodge=True, orient = 'v', ax = ax)

    #df_Ncluster_medAUC = df_auc_ext.groupby("", as_index=False).median()
    #ax.plot(df_Ncluster_medAUC.auc)

    #ax.get_legend().remove()
    ax.set_ylabel('AUC')
    ax.set_ylim(-0.1,1.1)
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.axhline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')
    ax.set_title(f"{target}/{method}")
    ax.tick_params(axis='x', labelrotation= 90)
    plt.savefig(f"figures/NK_M0151_crossphylum_{target}_{method}.pdf", bbox_inches='tight')
    plt.close()

    # AUC values of each phylum (df_auc_ext is already restricted to this method and target).
    auc_by_phylum = {phylum: df_auc_ext[df_auc_ext['Phylum'] == phylum]['auc'] for phylum in phylum_order}

    # one-sample t-test against 0.5 for every phylum
    for phylum in phylum_order:
        list_test_result.append([phylum, target, method, ttest_1samp(auc_by_phylum[phylum], 0.5).pvalue])

    # one-way ANOVA across the phyla
    F_test_result = f_oneway(*[list(auc_by_phylum[phylum_name]) for phylum_name in phylum_order])

    print("F-test result", target, method, F_test_result.statistic, F_test_result.pvalue, sep = "\t")

df_test_result = pd.DataFrame(list_test_result, columns = ['Phylum', 'target', 'prediction_method', 'p'])

In [None]:
# Benjamini-Hochberg adjustment over every collected t-test p-value, then
# mark q < 0.05 with '*' and write the table out.
bh_adjusted = multipletests(list(df_test_result.p), method = "fdr_bh")
df_test_result['q'] = list(bh_adjusted[1])
df_test_result['significant'] = df_test_result.q.map(lambda q: '*' if q<0.05 else '')
df_test_result.to_csv("tables/NK_M0151_crossphylum.txt", index=False, sep = "\t")
df_test_result

#### Prediction from interpretable features

In [None]:
# Labels echoed in the test print-outs of the next cell.
tree, acr = 'mlgtdb', 'MPPA'

# AUCs obtained with the interpretable feature sets; rows sharing the same
# (KO, target, method, features) key are averaged into one.
interpretable_auc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.interpretable.filtered.txt"
df_auc_raw = pd.read_table(interpretable_auc_path, names = ['KO', 'target', 'method', 'auc', 'features'])
interpretable_keys = ['KO', 'target', 'method', 'features']
df_auc = df_auc_raw.groupby(interpretable_keys, as_index = False).mean()
df_auc

In [None]:
# For each prediction method / target: violin + strip plot of AUC per feature
# set, the mean AUC per feature set, and paired t-tests between the three
# single-source feature sets (md, ec12, path).
feature_order = ["md", "ec12", "path", "ec12md", "ec12path", "mdpath", "ec12mdpath"]

for method, target, color in [('LR','loss','#E1BB63'), ('LR','gain','#9FA6F1'), ('RF','loss','#E1BB63'), ('RF','gain','#9FA6F1')]:

    df_auc_ext = df_auc[(df_auc['method']==method) & (df_auc['target']==target)]

    fig = plt.figure(figsize=(3.5,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    sns.violinplot(data = df_auc_ext, x = 'features', y = 'auc', linewidth=0.5, order = feature_order, color = '#FFFFFF', ax = ax)
    sns.stripplot (data = df_auc_ext, x = 'features', y = 'auc', linewidth=0,   order = feature_order, color = color, size = 1, alpha=.3, jitter=0.3, dodge=True, ax = ax)

    # Mean AUC per feature set. Only the 'auc' column is averaged: the frame also
    # holds string columns (KO, target, method), and averaging those raises a
    # TypeError on pandas >= 2.0.
    df_auc_ext_meanAUC = df_auc_ext.groupby("features", as_index=False)['auc'].mean()
    print(df_auc_ext_meanAUC.sort_values('auc'))

    ax.set_xlabel('Feature set')
    ax.set_ylabel('AUC')
    ax.set_ylim(-0.1,1.1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.axhline(0.5, linewidth = 1, alpha = 0.5, color = '#555555')
    # short tick labels, given in the same order as feature_order
    ax.set_xticklabels(["M", "E", "P", "ME", "EP", "PM", "MEP"])

    ax.set_title(target+" "+method)

    plt.savefig("figures/NK_M0151_interpretable_"+target+"_"+method+".pdf",bbox_inches='tight')
    # The figure is not closed here (the original had plt.close() commented out).

    # test if AUC of X and Y is equal (paired by KO)
    for comp_x, comp_y in [("md", "ec12"), ("md", "path"), ("ec12", "path")]:
        df_auc_comp = pd.merge(
            df_auc_ext[df_auc_ext['features'] == comp_x],
            df_auc_ext[df_auc_ext['features'] == comp_y],
            on = 'KO',
        )
        print(tree, acr, target, method, comp_x, comp_y, ttest_rel(df_auc_comp['auc_x'], df_auc_comp['auc_y']))

In [None]:
# AUC as a function of the number of selected features; rows sharing the same
# key columns are averaged into one.
nselected_auc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.Nselected.filtered.txt"
nselected_keys = ['KO', 'target', 'pred_method', 'featureset', 'selec_method', 'Nfeatures']
df_auc_raw = pd.read_table(nselected_auc_path, names = nselected_keys + ['AUC'])
df_auc = df_auc_raw.groupby(nselected_keys, as_index=False).mean()

In [None]:
def plot_Nfeatures_medAUC(df_auc, target, pred_method, featureset, selec_method, ax, color, label = None):
    """Plot the median AUC against the number of selected features.

    Parameters
    ----------
    df_auc : pandas.DataFrame
        Must contain the columns 'target', 'pred_method', 'featureset',
        'selec_method', 'Nfeatures' and 'AUC'.
    target, pred_method, featureset, selec_method
        Values the same-named columns must equal for a row to be used.
    ax
        Object exposing a matplotlib-style ``plot(x, y, color=..., label=...)``.
    color, label
        Forwarded unchanged to ``ax.plot``.

    Returns
    -------
    None. The only effect is the single ``ax.plot`` call.
    """
    keep = (
        (df_auc['target'] == target)
        & (df_auc['pred_method'] == pred_method)
        & (df_auc['featureset'] == featureset)
        & (df_auc['selec_method'] == selec_method)
    )
    df_auc_ext = df_auc[keep]

    # Quantiles are taken on the AUC column only. The frame also carries string
    # columns (KO, target, ...), and groupby().quantile() over those raises a
    # TypeError on pandas >= 2.0.
    auc_by_nfeatures = df_auc_ext.groupby('Nfeatures')['AUC']
    df_auc_ext_percentile = pd.DataFrame({
        'AUC_5':  auc_by_nfeatures.quantile(0.05),
        'AUC_50': auc_by_nfeatures.quantile(0.5),
        'AUC_95': auc_by_nfeatures.quantile(0.95),
    }).reset_index()

    # Only the median curve is drawn; the 5th/95th percentiles are computed but
    # not plotted, as in the original.
    ax.plot(df_auc_ext_percentile['Nfeatures'], df_auc_ext_percentile['AUC_50'], color= color, label = label)

# One figure per (feature set, target): median AUC vs. number of selected
# features, one line per (prediction method, selection method) combination.
colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3']
# Combinations in the order the colours above are assigned to them.
method_pairs = [
    (pred_method, selec_method)
    for pred_method in ['LR', 'RF']
    for selec_method in ['ANOVA', 'RandomForest']
]

for featureset in ['md', 'ec12md', 'ec12mdpath']:
    for target in ['gain', 'loss']:
        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.set_xlim(0,51)
        ax.set_ylim(0.5, 0.8)
        ax.set_xlabel("#selected features")  # was misspelled "featues"
        ax.set_ylabel("Median AUC")
        ax.set_title(featureset + " " + target)

        for color, (pred_method, selec_method) in zip(colors, method_pairs):
            plot_Nfeatures_medAUC(df_auc, target, pred_method, featureset, selec_method, ax, color)
        plt.savefig("figures/NK_M0151_Nfeatures_medAUC_"+featureset+"_"+target+".pdf", bbox_inches = 'tight')
        plt.close()

In [None]:
# Median AUC vs. number of selected features, one line per functional category
# (RF prediction, ANOVA selection), one figure per target.
#
# NOTE(review): `cm` is not assigned in this cell; the only assignment visible
# nearby is in the later "by category" cell. Confirm it is defined before this
# cell on a fresh Restart & Run All.

# The category annotation does not depend on the loop variables: merge once.
df_auc_category = pd.merge(df_auc, df_category_ko, on = 'KO')
category_names = sorted(set(df_auc_category.category))

for featureset in ['md']:
    for target in ['gain', 'loss']:
        fig = plt.figure(figsize=(3,3))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.set_xlim(0,51)
        ax.set_ylim(0.55, 0.85)
        ax.set_xlabel("#selected features")  # was misspelled "featues"
        ax.set_ylabel("Median AUC")
        ax.set_title(featureset + " " + target)

        pred_method = 'RF'
        selec_method = 'ANOVA'
        for i, category in enumerate(category_names):
            plot_Nfeatures_medAUC(
                df_auc_category[df_auc_category.category == category],
                target, pred_method, featureset, selec_method,
                ax, mcolor.rgb2hex(cm(i)), label = category,
            )
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left',borderaxespad=0,)

        plt.savefig("figures/NK_M0151_Nfeatures_medAUC_"+featureset+"_"+target+"_category_RF_ANOVA.pdf", bbox_inches = 'tight')
        plt.close()

In [None]:
def sparsemtx2mtx(X, Y, Z):
    """Turn parallel (row label, column label, value) sequences into a dense table.

    Parameters
    ----------
    X, Y : iterables of sortable, hashable labels (row and column labels).
    Z : iterable of values, aligned element-wise with X and Y.

    Returns
    -------
    pandas.DataFrame
        Indexed by the sorted unique X labels, with the sorted unique Y labels
        as columns. Cells that never occur in the input are 0; if an (x, y)
        pair occurs more than once, the last value wins.
    """
    X_uniq_list = sorted(set(X))
    Y_uniq_list = sorted(set(Y))

    # Label -> position maps give constant-time lookups; the former
    # list.index() call rescanned the label list for every input triplet.
    row_of = {x_label: pos for pos, x_label in enumerate(X_uniq_list)}
    col_of = {y_label: pos for pos, y_label in enumerate(Y_uniq_list)}

    Z_matrix = np.zeros((len(X_uniq_list), len(Y_uniq_list)))
    for x, y, z in zip(X, Y, Z):
        Z_matrix[row_of[x], col_of[y]] = z

    return pd.DataFrame(Z_matrix, index = X_uniq_list, columns = Y_uniq_list)

In [None]:
# Row-clustered heatmap of AUC (rows: OGs, columns: number of selected features)
# for the md feature set with ANOVA selection and RF prediction, per target.
nfeature_ticks = np.array([1, 10, 20, 30, 40, 50])

for target in ['gain', 'loss']:

    selec_method, pred_method, featureset = 'ANOVA', 'RF', 'md'
    in_setting = (
        (df_auc['target']==target)
        & (df_auc['pred_method']==pred_method)
        & (df_auc['featureset']==featureset)
        & (df_auc['selec_method']==selec_method)
    )
    df_auc_ext = df_auc[in_setting]

    df_KO_Nfeatures_AUC = sparsemtx2mtx(df_auc_ext.KO, df_auc_ext.Nfeatures, df_auc_ext.AUC)

    g = sns.clustermap(
        df_KO_Nfeatures_AUC,
        col_cluster=False,
        method = 'ward',
        metric = 'euclidean',
        cmap = 'coolwarm',
        figsize = (5,5),
        vmin = 0.1,
        vmax = 0.9,
    )

    # Presumably axes[2] is the heatmap panel and axes[3] the colour bar,
    # judging by the labels set on them; verify against the seaborn version in use.
    panel_ax = g.fig.axes[2]
    scale_ax = g.fig.axes[3]

    panel_ax.set_yticks([])
    # shift by half a cell so each tick sits at the centre of its column
    panel_ax.set_xticks(nfeature_ticks-0.5)
    panel_ax.set_xticklabels(nfeature_ticks)
    panel_ax.set_ylabel(str(len(df_KO_Nfeatures_AUC.index)) + " OGs")
    panel_ax.set_xlabel("# selected features")
    panel_ax.set_title(featureset + " " + target + " " + selec_method + " " + pred_method)
    scale_ax.set_title("AUC")

    plt.savefig("figures/NK_M0151_KO_Nfeatures_AUC_"+featureset+"_"+target+".pdf", bbox_inches = 'tight')
    plt.close()

In [None]:
# For each KO, keep the row at which its AUC is highest
# (gain target, md features, RandomForest selection, RF prediction).
target, selec_method, pred_method, featureset = 'gain', 'RandomForest', 'RF', 'md'
in_setting = (
    (df_auc['target']==target)
    & (df_auc['pred_method']==pred_method)
    & (df_auc['featureset']==featureset)
    & (df_auc['selec_method']==selec_method)
)
df_auc_ext = df_auc[in_setting]
best_row_per_ko = df_auc_ext.groupby("KO")["AUC"].idxmax()
df_auc_ext_max_AUC = df_auc_ext.loc[best_row_per_ko, :]

In [None]:
# Per functional category: histogram of the feature count at which each OG
# reaches its highest AUC (RF prediction, ANOVA selection), one figure per target.
cm_name = 'Set3'
cm = plt.get_cmap(cm_name)

df_auc_category = pd.merge(df_auc, df_category_ko, on = 'KO')
category_names = sorted(list(set(df_auc_category.category)))
last_panel = len(set(df_auc_category.category)) - 1

for target in ['gain', 'loss']:
    fig = plt.figure(figsize=(2,1))

    for i, category in enumerate(category_names):
        pred_method = 'RF'
        selec_method = 'ANOVA'

        # each panel is placed one full figure-height below the previous one
        ax = fig.add_axes([0.1,0.1-i,0.8,0.8], label = category)
        ax.set_xlim(0,51)
        # only the bottom panel carries x tick labels and the x-axis label
        if i == last_panel:
            ax.set_xlabel("Optimal # features")
        else:
            ax.set_xticklabels([])
        ax.set_ylabel("# OGs")
        for side in ('right', 'top'):
            ax.spines[side].set_visible(False)

        in_panel = (
            (df_auc_category['target']==target)
            & (df_auc_category['pred_method']==pred_method)
            & (df_auc_category['featureset']==featureset)
            & (df_auc_category['selec_method']==selec_method)
            & (df_auc_category['category']==category)
        )
        df_auc_category_ext = df_auc_category[in_panel]
        best_row_per_ko = df_auc_category_ext.groupby("KO")["AUC"].idxmax()
        df_auc_ext_max_AUC = df_auc_category_ext.loc[best_row_per_ko, :]
        ax.hist(df_auc_ext_max_AUC.Nfeatures, range= (0,50), bins = 25, histtype= 'stepfilled', color = mcolor.rgb2hex(cm(i)), alpha = 0.8)
    plt.savefig("figures/NK_M0151_Nfeatures_of_maxAUC_"+featureset+"_"+target+"_category.pdf", bbox_inches = 'tight')
    plt.close()

In [None]:
# Overlaid step histograms (gain vs. loss) of the feature count at which each
# OG reaches its highest AUC, for RF prediction with ANOVA selection.
fig = plt.figure(figsize=(2,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.set_xlim(0,51)
ax.set_xlabel("Optimal # features")
ax.set_ylabel("# OGs")
ax.set_title(featureset, )

pred_method = 'RF'
selec_method = 'ANOVA'
for target, color in [('gain', '#9FA6F1'), ('loss', '#E1BB63')]:
    in_setting = (
        (df_auc['target']==target)
        & (df_auc['pred_method']==pred_method)
        & (df_auc['featureset']==featureset)
        & (df_auc['selec_method']==selec_method)
    )
    df_auc_ext = df_auc[in_setting]
    best_row_per_ko = df_auc_ext.groupby("KO")["AUC"].idxmax()
    df_auc_ext_max_AUC = df_auc_ext.loc[best_row_per_ko, :]
    ax.hist(df_auc_ext_max_AUC.Nfeatures, range= (0,50), bins = 50, histtype= 'step', color = color, alpha = 0.8)
plt.savefig("figures/NK_M0151_Nfeatures_of_maxAUC_"+featureset+"_category_RF_ANOVA.pdf", bbox_inches = 'tight')

In [None]:
# Optimal number of features per OG (gain / md / ANOVA / RF) against the
# number of reactions the OG is involved in.
target = 'gain'
selec_method = 'ANOVA'
pred_method = 'RF'
featureset = 'md'

df_auc_ext = df_auc[(df_auc['target']==target) & (df_auc['pred_method']==pred_method) & (df_auc['featureset']==featureset) & (df_auc['selec_method']==selec_method)]
# for every KO, the row at which its AUC is highest
df_auc_ext_max_AUC = df_auc_ext.loc[df_auc_ext.groupby("KO")["AUC"].idxmax(), :]

# Rows per KO in the reaction / module tables. groupby().size() is used instead
# of value_counts().reset_index().rename({'index': 'KO', 'KO': ...}): from
# pandas 2.0 the latter yields columns 'KO'/'count', so the rename would
# relabel the key column itself and the merge on 'KO' would fail.
df_ko_nreactions = df_rn_ko.groupby('KO').size().reset_index(name = 'Nreactions')
df_ko_nmodules   = df_md_ko.groupby('KO').size().reset_index(name = 'Nmodules')

df_auc_ext_max_AUC_Nreaction = pd.merge(df_auc_ext_max_AUC, df_ko_nreactions, on = 'KO')
df_auc_ext_max_AUC_Nmodule = pd.merge(df_auc_ext_max_AUC, df_ko_nmodules, on = 'KO')

fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
# bin the reaction count into four promiscuity classes
promiscuity_order = ['1', '2-10', '11-20', '>20']
df_auc_ext_max_AUC_Nreaction['promiscuity'] = [
    '1' if Nreaction == 1 else '2-10' if Nreaction <= 10 else '11-20' if Nreaction <= 20 else '>20'
    for Nreaction in df_auc_ext_max_AUC_Nreaction.Nreactions
]
sns.boxplot(data = df_auc_ext_max_AUC_Nreaction, y = 'Nfeatures', x = 'promiscuity', linewidth=0.5, color = '#FFFFFF', order = promiscuity_order, ax = ax)
sns.stripplot(data = df_auc_ext_max_AUC_Nreaction, y = 'Nfeatures', x = 'promiscuity', linewidth=0.5, color = '#FF0000', alpha = 0.1, s = 1, order = promiscuity_order, jitter = 0.3, ax = ax)
ax.set_xlabel("Promiscuity (#reactions involved)")
ax.set_ylabel("Optimal #features")
ax.set_title(featureset + " " + target + " " + selec_method + " " + pred_method)
plt.savefig("figures/NK_M0151_optNfeatures_Nreactions_"+target+"_"+featureset+".pdf", bbox_inches = 'tight')
plt.close()

In [None]:
# Optimal number of features per OG against the number of modules it belongs to.
fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

nmodule_axes = dict(data = df_auc_ext_max_AUC_Nmodule, y = 'Nfeatures', x = 'Nmodules')
sns.boxplot(linewidth=0.5, color = '#FFFFFF', **nmodule_axes)
sns.stripplot(linewidth=0, color = '#000000', size = 1, alpha=.3, jitter=.3, dodge=True, **nmodule_axes)

ax.set_xlabel("#modules involved")
ax.set_ylabel("Optimal #features")
ax.set_title(" ".join([featureset, target, selec_method, pred_method]))
plt.savefig("figures/NK_M0151_optNfeatures_Nmodules_"+target+"_"+featureset+".pdf", bbox_inches = 'tight')
plt.close()

In [None]:
# Optimal number of features per functional category. The figure is drawn and
# then closed without being saved.
df_auc_ext_max_AUC_category = pd.merge(df_auc_ext_max_AUC, df_category_ko, on = 'KO')
category_order = sorted(list(set(df_auc_ext_max_AUC_category['category'])))
category_axes = dict(data = df_auc_ext_max_AUC_category, y = 'category', x = 'Nfeatures', hue = 'target', order = category_order, orient = 'h')
sns.violinplot(linewidth=0.5, color = '#FFFFFF', alpha = 0, **category_axes)
sns.stripplot (linewidth=0, size = 1, alpha=.5,jitter=0.3, dodge=True, palette=['#FF0000'], **category_axes)
plt.close()

In [None]:
# Optimal number of features per OG against the number of category rows the OG
# has in df_category_ko. The figure is drawn and closed without being saved.
#
# groupby().size() replaces value_counts().reset_index().rename({'index': 'KO',
# 'KO': ...}): from pandas 2.0 value_counts().reset_index() yields columns
# 'KO'/'count', so the old rename relabelled the key column and the merge failed.
df_ko_ncategories = df_category_ko.groupby('KO').size().reset_index(name = 'Ncategories')
df_auc_ext_max_AUC_Ncategory = pd.merge(df_auc_ext_max_AUC, df_ko_ncategories, on = 'KO')
fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
sns.boxplot(data = df_auc_ext_max_AUC_Ncategory, y = 'Nfeatures', x = 'Ncategories', linewidth=0.5, color = '#FFFFFF', ax = ax)
sns.stripplot(data = df_auc_ext_max_AUC_Ncategory, y = 'Nfeatures', x = 'Ncategories', linewidth=0,   color = '#000000', size = 1, alpha=.3, jitter=0.3, dodge=True, ax = ax)
plt.close()

In [None]:
# Heatmap of selected features: Module vs Module
#
# For each target, every feature is ranked per OG by |selection score| and the
# top-ranked ones are flagged. The flag is averaged over the OGs of each module
# and drawn as a module x module matrix, with both axes ordered by the
# category that contributes most KOs to each module.

rank_threshold = 14
featureset = 'md'
selection  = 'ANOVA'

# Inputs that do not depend on the target are prepared once, not once per loop pass.
df_feature_target_method_ko_selectionscore = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])
df_md_ko = pd.read_table("tables/md_ko.txt", names = ['Module','KO'])

# Give each module the category with the largest number of (category, KO) rows.
df_category_ko_module = pd.merge(df_category_ko, df_md_ko, on = 'KO')
df_category_ko_module['Nko'] = 1
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum()
df_maxcategory_module = df_category_module_count.loc[df_category_module_count.groupby('Module')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_module = df_maxcategory_module.reset_index().loc[:, ['category', 'Module']]

# Integer id per category, in order of first appearance in the sorted table.
df_category_color = pd.DataFrame([[category, i] for i, category in enumerate(df_maxcategory_module.category.unique())], columns = ["category", 'category_id'])
df_maxcategory_module_color = pd.merge(df_maxcategory_module, df_category_color)

# user-defined colormap: first 11 colours of `cm`, indexed by category_id
cmap = ListedColormap([cm(i) for i in range(11)], name="custom")

for target in ['gain', 'loss']:

    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset']==featureset) &
            (df_feature_target_method_ko_selectionscore['target']    ==target)     &
            (df_feature_target_method_ko_selectionscore['selection'] ==selection)
            ].reset_index()
    # rank 1 = largest |score| within each KO
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)

    # NOTE(review): the comparison is strict, so 'selected_top14' flags ranks
    # below 14 (rank 14 itself is excluded) - confirm this is intended.
    df_feature_target_method_ko_selectionscore_ext['selected_top'+str(rank_threshold)] = \
        df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] < rank_threshold
    df_feature_target_method_ko_selectionscore_ext['score_rank_top'+str(rank_threshold)] = \
        [max(rank_threshold - rank, 0) for rank in df_feature_target_method_ko_selectionscore_ext['score_rank_descending']]

    # attach to every OG the module(s) it belongs to
    df_feature_target_method_ko_selectionscore_MD = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_md_ko, on = 'KO').rename(columns = {'Module':'Module_of_KO'})

    # Average over the OGs of each module. numeric_only=True skips the string
    # columns (featureset, KO), which make mean() raise on pandas >= 2.0.
    df_feature_target_method_md_selectionscore_MD = df_feature_target_method_ko_selectionscore_MD.groupby(['target', 'selection', 'Module_of_KO', 'feature'], as_index= False).mean(numeric_only=True)

    df_md_md_rank = sparsemtx2mtx(
        df_feature_target_method_md_selectionscore_MD.Module_of_KO,
        df_feature_target_method_md_selectionscore_MD.feature,
        df_feature_target_method_md_selectionscore_MD['selected_top'+str(rank_threshold)]
        )

    # Order both axes by category, then by module within a category
    # (the second sort is stable, so the module order is kept).
    rows = pd.merge(pd.DataFrame(list(set(df_md_md_rank.index)), columns = ['Module']), df_maxcategory_module_color).sort_values('Module').sort_values('category', kind='mergesort')
    columns = pd.merge(pd.DataFrame(list(set(df_md_md_rank.columns)), columns = ['Module']), df_maxcategory_module_color).sort_values('Module').sort_values('category', kind='mergesort')

    # plot a figure
    df_md_md_rank = df_md_md_rank.loc[rows.Module, columns.Module]

    fig = plt.figure(figsize=(5,5))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax2 = fig.add_axes([0.93,0.1,0.05,0.3])
    sns.heatmap(df_md_md_rank, ax = ax, cbar_ax = ax2, cmap = 'inferno')
    ax.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    # NOTE(review): these colour-bar labels are hard-coded and not derived from
    # the actual tick positions - verify they match the rendered ticks.
    ax2.set_yticklabels([0, 0.25, 0.5, 1])

    # y-axis: category colour strip for the rows
    ax3 = fig.add_axes([0.05,0.1,0.045,0.8])
    sns.heatmap([[color_id] for color_id in rows.category_id], ax = ax3, cbar = False, cmap = cmap)
    ax3.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    ax3.set_ylabel("Modules including predicted OGs")

    # x-axis: category colour strip for the columns
    ax4 = fig.add_axes([0.1,0.05,0.8,0.045])
    sns.heatmap([columns.category_id], ax = ax4, cbar = False, cmap = cmap)
    ax4.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    ax4.set_xlabel("Modules as predictors")

    plt.savefig("figures/NK_M0151_selected_features_heatmap_"+featureset+"_"+target+"_"+selection+".pdf")
    plt.close()

In [None]:
# Distribution of important features by functional categories
#
# For each (target, rank threshold, feature set, selection method):
#   Plot 1 - per OG category, how many OGs have each module among their
#            top-ranked features, with a Mann-Whitney test comparing modules of
#            the same category against modules of other categories.
#   Plot 2 - rank distribution of the module an OG itself belongs to, against
#            the rank distribution of all features.

# The score table does not depend on the loop variables: read it once.
df_feature_target_method_ko_selectionscore = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])

# The bars and labels below need one colour per category. The heatmap cell
# builds df_category_color with only 'category' and 'category_id', so when no
# 'color' column is present it is derived from `cm` by category_id (the same
# mapping the heatmap colour strips use). An existing 'color' column is used as is.
if 'color' in df_category_color.columns:
    df_category_hexcolor = df_category_color
else:
    df_category_hexcolor = df_category_color.assign(
        color = [mcolor.rgb2hex(cm(int(category_id))) for category_id in df_category_color.category_id]
    )
last_category_idx = len(df_category_hexcolor) - 1

for target, rank_threshold, feature_set, selection in [('gain', 14, 'md', 'ANOVA'), ('gain', 50, 'md', 'RandomForest'), ('loss', 8, 'md', 'ANOVA'), ('loss', 14, 'md', 'RandomForest')]:

    top_col = 'selected_top'+str(rank_threshold)

    # Filter on the loop's own feature_set (the original read the global
    # `featureset` left over from earlier cells).
    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset']==feature_set) &
            (df_feature_target_method_ko_selectionscore['target']    ==target)     &
            (df_feature_target_method_ko_selectionscore['selection'] ==selection)
            ].reset_index()
    # rank 1 = largest |score| within each KO
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)
    df_ko_feature_scorerank = df_feature_target_method_ko_selectionscore_ext.loc[:, ['KO', 'feature', 'score_rank_descending']]

    # select top-N features
    # NOTE(review): strict '<' means rank == rank_threshold is not selected - confirm.
    df_feature_target_method_ko_selectionscore_ext[top_col] = \
        df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] < rank_threshold
    df_feature_target_method_ko_selectionscore_ext['score_rank_top'+str(rank_threshold)] = \
        [max(rank_threshold - rank, 0) for rank in df_feature_target_method_ko_selectionscore_ext['score_rank_descending']]
    # score / abs_score carries the sign of the score; 0/0 gives NaN, which the fillna(0) below turns into 0
    df_feature_target_method_ko_selectionscore_ext['signed_score_rank_top'+str(rank_threshold)] = df_feature_target_method_ko_selectionscore_ext['score_rank_top'+str(rank_threshold)] * df_feature_target_method_ko_selectionscore_ext['score'] / df_feature_target_method_ko_selectionscore_ext['abs_score']

    # attach each OG's category, then sum per (category, feature)
    df_feature_target_method_ko_selectionscore_ext = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_uniquecategory_ko, on = 'KO')
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.fillna(0)
    # numeric_only=True: only numeric/boolean columns are used downstream, and
    # summing the string columns would concatenate them.
    df_feature_target_method_ko_selectionscore_ext_category_sum = df_feature_target_method_ko_selectionscore_ext.groupby(['category', 'feature'], as_index=False).sum(numeric_only=True)

    # Plot 1: Important features for predicting gain/loss of each KEGG category
    fig = plt.figure(figsize=(5,0.5))
    list_test_result = []
    for i, category in enumerate(df_category_hexcolor.category):
        df = df_feature_target_method_ko_selectionscore_ext_category_sum[df_feature_target_method_ko_selectionscore_ext_category_sum['category'] == category]
        # category_x: category of the predicted OGs; category_y: category of the feature module
        df = pd.merge(df, df_maxcategory_module, left_on = 'feature', right_on = 'Module').sort_values('category_y')
        df = pd.merge(df, df_category_hexcolor, left_on = 'category_y', right_on = 'category')

        # testing enrichment to the same category
        same_category = list(df[df.category_x == df.category_y][top_col])
        different_category = list(df[df.category_x != df.category_y][top_col])
        # alternative=None is rejected by SciPy >= 1.7, so the two-sided test is
        # requested explicitly. NOTE(review): the legacy None mode reported half
        # of the two-sided p-value, so p-values here are twice those of old runs.
        list_test_result.append([category, target, rank_threshold, feature_set, selection, mannwhitneyu(same_category, different_category, use_continuity=True, alternative='two-sided').pvalue])

        # each panel is placed one full figure-height below the previous one
        ax = fig.add_axes([0.1,0.1-i,0.8,0.8])
        ax.set_xlim(0,340)
        ax.bar(x = df.feature, height = df[top_col], color = df.color)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.tick_params(labelbottom=False, bottom=False)
        ax.text(-0.1,0.45,category,c=df_category_hexcolor.color[i],ha='right',transform=ax.transAxes)
        # x label on the last panel only (the original hard-coded index 10)
        if (i==last_category_idx): ax.set_xlabel("Features (339 KEGG Modules)")
        if (i==0):  ax.set_title("#OGs", x=-0.05, fontsize= 10)
        if (i==0):  ax.text(0.3,1.5,feature_set+" "+target+" "+selection+" top "+str(rank_threshold),ha='left',transform=ax.transAxes)
    plt.savefig("figures/NK_M0151_nKOs_features_"+feature_set+"_"+target+"_"+selection+"_top"+str(rank_threshold)+".pdf", bbox_inches='tight')
    plt.close()

    # Benjamini-Hochberg correction across the categories of this setting
    df_test_result = pd.DataFrame(list_test_result, columns = ["category", "target", "rank_threshold", "feature_set", "selection_method", "p"])
    df_test_result ['q'] =  list(multipletests(list(df_test_result.p), method = "fdr_bh")[1])
    df_test_result['significant'] = ['*' if q<0.05 else '' for q in df_test_result.q]
    print(df_test_result[df_test_result['q']<0.05])

    # Plot 2: Rank distribution of feature importance
    df_ko_md_feature_scorerank = pd.merge(df_md_ko, df_ko_feature_scorerank, on = 'KO').loc[:, ['Module', 'KO', 'feature', 'score_rank_descending']]
    # keep only rows where the feature is a module the OG itself belongs to
    df_ko_md_feature_scorerank_same_md = df_ko_md_feature_scorerank[df_ko_md_feature_scorerank['Module']==df_ko_md_feature_scorerank['feature']]

    fig = plt.figure(figsize=(3,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    # the module which the predicted OG belongs to; the mean is taken over the
    # rank column only (the frame also holds string columns, which make
    # groupby().mean() raise on pandas >= 2.0)
    ax.hist(df_ko_md_feature_scorerank_same_md.groupby('KO')['score_rank_descending'].mean(), range=(0,339), bins =20, histtype='step',density=True, color ='#008F00', lw=2)
    # null distribution
    ax.hist(df_ko_feature_scorerank['score_rank_descending'], range=(0,339), bins =20, histtype='stepfilled',density=True, color = '#DDDDDD')
    ax.set_xlabel("Rank of feature importance")
    ax.set_ylabel("Frequency")
    ax.set_title(feature_set+"_"+target+"_"+selection+"_top"+str(rank_threshold))
    plt.savefig("figures/NK_M0151_feature_importance_"+feature_set+"_"+target+"_"+selection+"_top"+str(rank_threshold)+".pdf", bbox_inches='tight')
    plt.close()

In [None]:
def make_df_feature_target_method_ko_selectionscore_ext(target, rank_threshold, featureset, selection):
    """Load the selection-score table and build the ranked, category-annotated subset.

    Rows are kept when their 'featureset', 'target' and 'selection' columns
    equal the arguments. Within each KO, features are ranked by |score|
    (rank 1 = largest). Three columns suffixed with ``rank_threshold`` are added:

    * ``selected_top<N>``: True where the rank is strictly below the threshold,
    * ``score_rank_top<N>``: ``max(threshold - rank, 0)``,
    * ``signed_score_rank_top<N>``: the previous column multiplied by
      ``score / abs_score``.

    The result is merged with the module-level ``df_category_ko`` on 'KO' and
    every remaining NaN is replaced by 0 before it is returned.
    """
    score_table = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
    score_table['abs_score'] = abs(score_table['score'])

    wanted = (
        (score_table['featureset'] == featureset) &
        (score_table['target']     == target)     &
        (score_table['selection']  == selection)
    )
    subset = score_table[wanted].reset_index()
    subset['score_rank_descending'] = subset.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)

    # select top-N features
    suffix = str(rank_threshold)
    subset['selected_top'+suffix] = subset['score_rank_descending'] < rank_threshold
    subset['score_rank_top'+suffix] = [max(rank_threshold - rank, 0) for rank in subset['score_rank_descending']]
    # score / abs_score is NaN for a zero score; the fillna(0) below resets it to 0
    subset['signed_score_rank_top'+suffix] = subset['score_rank_top'+suffix] * subset['score'] / subset['abs_score']

    annotated = pd.merge(subset, df_category_ko, on = 'KO')
    return annotated.fillna(0)

def make_heatmap_ko_md_rank(df_feature_target_method_ko_selectionscore_ext, category, category_i, rank_threshold, featureset, selection, target=None):
    """Plot and save the KO x feature importance heatmap of one category.

    Rows of ``df_feature_target_method_ko_selectionscore_ext`` whose
    ``'category'`` equals ``category`` are converted into two matrices with
    ``sparsemtx2mtx`` (presumably a long-to-wide pivot of KO / feature / value
    triples -- confirm against its definition): one from the
    ``'signed_score_rank_top<rank_threshold>'`` column, which is drawn, and one
    from the ``'score_rank_descending'`` column, which is returned.

    The figure is written to ``figures/`` as a PDF and closed afterwards.

    Besides its arguments the function reads these notebook-level names:
    ``sparsemtx2mtx``, ``df_maxcategory_module``, ``df_md_ko``,
    ``df_category_color`` and ``cm``.

    Parameters
    ----------
    df_feature_target_method_ko_selectionscore_ext : pandas.DataFrame
        Table with at least the columns ``'category'``, ``'KO'``,
        ``'feature'``, ``'score_rank_descending'`` and
        ``'signed_score_rank_top<rank_threshold>'``.
    category : str
        Category to plot; also used as the title and in the file name.
    category_i : int
        Index passed to ``cm`` to colour the strip left of the heatmap.
    rank_threshold : int
        Selects the ``'signed_score_rank_top...'`` column; part of the file name.
    featureset, selection : str
        Only used to build the output file name.
    target : str, optional
        Used to build the output file name. When omitted, the notebook-level
        variable ``target`` is used, which is how existing callers that do not
        pass it keep working.

    Returns
    -------
    The matrix built by ``sparsemtx2mtx`` from ``'score_rank_descending'``.

    Raises
    ------
    NameError
        If ``target`` is not passed and no notebook-level ``target`` exists.
    """
    if target is None:
        # Backward compatibility: earlier versions read `target` implicitly
        # from the notebook namespace.
        try:
            target = globals()["target"]
        except KeyError:
            raise NameError(
                "make_heatmap_ko_md_rank needs `target`: pass target=... or define a notebook-level `target`"
            ) from None

    df_of_category = df_feature_target_method_ko_selectionscore_ext[
        df_feature_target_method_ko_selectionscore_ext['category'] == category
    ]

    # Matrix that is drawn (signed top-N rank score) and matrix that is returned (plain rank).
    df_ko_md_toprank = sparsemtx2mtx(df_of_category['KO'], df_of_category['feature'], df_of_category['signed_score_rank_top'+str(rank_threshold)])
    df_ko_md_rank = sparsemtx2mtx(df_of_category['KO'], df_of_category['feature'], df_of_category['score_rank_descending'])

    # Column order: modules sorted by their category.
    df_module_maxcategory_ext = pd.merge(pd.DataFrame(df_ko_md_toprank.columns, columns=['Module']), df_maxcategory_module).sort_values('category')
    # Row order: KOs sorted by the module they belong to.
    df_ko_md_ext = pd.merge(pd.DataFrame(df_ko_md_toprank.index, columns=['KO']), df_md_ko).sort_values('Module')

    # Alternate between two grey levels (0.4 / 0.2) each time the module
    # changes, so consecutive modules show up as stripes in the row strip.
    stripe_values = []
    run_index = 0
    previous_module = ""
    for module in df_ko_md_ext.Module:
        if module != previous_module:
            run_index += 1
        stripe_values.append((run_index % 2 + 1) * 0.2)
        previous_module = module
    df_ko_md_ext['Module_id'] = stripe_values

    # Plot: KO-Module-Rank of feature importance
    fig = plt.figure(figsize=(5, 3))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])        # main heatmap
    ax2 = fig.add_axes([0.95, 0.1, 0.05, 0.4])     # colour bar
    ax3 = fig.add_axes([0.08, 0.1, 0.015, 0.8])    # row strip: module stripes
    ax4 = fig.add_axes([0.1, 0.05, 0.8, 0.045])    # column strip: module category
    ax5 = fig.add_axes([0.06, 0.1, 0.015, 0.8])    # row strip: colour of `category`

    sns.heatmap(
        df_ko_md_toprank.reindex(
            columns=df_module_maxcategory_ext.Module,
            index=df_ko_md_ext.KO,
        ),
        cmap='coolwarm',
        center=0,
        ax=ax,
        cbar_ax=ax2,
    )
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title(category)

    # Columns: one colour per category id (user-defined colormap built from `cm`).
    category_cmap = ListedColormap([cm(i) for i in range(11)], name="custom")
    sns.heatmap([pd.merge(df_module_maxcategory_ext, df_category_color).category_id], ax=ax4, cbar=False, cmap=category_cmap)
    # NOTE(review): the module count in this label is hard-coded.
    ax4.set_xlabel("Predictor: 339 modules in KEGG Module")

    # Rows
    sns.heatmap([[i] for i in df_ko_md_ext.Module_id], ax=ax3, cbar=False, cmap='binary', vmin=0, vmax=1)
    sns.heatmap([[0]], ax=ax5, cbar=False, cmap=ListedColormap([cm(category_i)], name="custom"), vmin=0, vmax=1)
    ax5.set_ylabel("Predicted OGs")

    # Hide ticks and tick labels on the heatmap and on all annotation strips.
    for axes in (ax, ax3, ax4, ax5):
        axes.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

    # Save and close through the explicit figure handle rather than the
    # pyplot "current figure", so other open figures cannot be affected.
    fig.savefig("figures/NK_M0151_module_ko_importance_"+featureset+"_"+target+"_"+selection+"_top"+str(rank_threshold)+"_"+category+".pdf", bbox_inches='tight')
    plt.close(fig)

    return df_ko_md_rank

In [None]:
# Overview heatmaps: one figure per (configuration, category) combination.
# Each configuration is (target, rank_threshold, feature_set, selection).
# `target` must keep this name: make_heatmap_ko_md_rank reads it from the
# notebook namespace when it builds the output file name.
for target, rank_threshold, feature_set, selection in [
    ('gain', 14, 'md', 'ANOVA'),
    ('gain', 50, 'md', 'RandomForest'),
    ('loss', 8, 'md', 'ANOVA'),
    ('loss', 14, 'md', 'RandomForest'),
]:
    # Score table for this configuration.
    df_feature_target_method_ko_selectionscore_ext = make_df_feature_target_method_ko_selectionscore_ext(
        target, rank_threshold, feature_set, selection
    )

    # The position of a category in df_category_color is also passed on as
    # the index used to colour that category.
    for category_i, category in enumerate(df_category_color.category):
        make_heatmap_ko_md_rank(
            df_feature_target_method_ko_selectionscore_ext,
            category,
            category_i,
            rank_threshold,
            feature_set,
            selection,
        )

In [None]:
# Configuration used by the per-module analysis in the following cells.
target = 'gain'
rank_threshold = 14
feature_set = 'md'
selection = 'ANOVA'

## TODO: it should be possible to specify "all" (categories) here
category = '09111 Xenobiotics biodegradation and metabolism'
category_i = 10

df_feature_target_method_ko_selectionscore_ext = make_df_feature_target_method_ko_selectionscore_ext(
    target, rank_threshold, feature_set, selection
)

# NOTE(review): despite the variable name, make_heatmap_ko_md_rank returns the
# matrix built from 'score_rank_descending', not the signed top-N matrix --
# confirm that this is what the following cells are meant to use.
df_ko_md_toprank = make_heatmap_ko_md_rank(
    df_feature_target_method_ko_selectionscore_ext,
    category,
    category_i,
    rank_threshold,
    feature_set,
    selection,
)

In [None]:
# One heatmap per module of the selected category, restricted to the KOs that
# belong to that module.

# Create the output directory (and missing parents). Unlike a bare
# `except:` around os.mkdir, this only tolerates "already exists" and lets
# real failures (e.g. permission errors) surface.
os.makedirs("figures/important_features_heatmap", exist_ok=True)

for module_of_interest in df_maxcategory_module[df_maxcategory_module['category'] == category].Module:

    # KOs of this module that are present as rows of the matrix.
    ko_list = list(set(df_md_ko[df_md_ko['Module']==module_of_interest].KO) & set (df_ko_md_toprank.index))
    df = df_ko_md_toprank.loc[ko_list,:]
    # Drop columns that sum to zero, then order the remaining columns by
    # decreasing column sum.
    # NOTE(review): as assigned in the preceding cell, df_ko_md_toprank holds
    # the 'score_rank_descending' matrix returned by make_heatmap_ko_md_rank;
    # confirm that the zero-sum filter and `center=0` are intended for it.
    df = df.loc[:, (df.sum(axis=0) != 0)]
    df = df.loc[:, df.sum().sort_values(ascending = False).index]

    Nrows = len(df.index)
    Ncolumns = len(df.columns)

    # Nothing to draw for modules without matching KOs or columns.
    if Nrows == 0 or Ncolumns == 0:
        continue

    # Figure size scales with the matrix shape so cells keep a similar size.
    fig = plt.figure(figsize=(8*Ncolumns/35*1.1,2*Nrows/8))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    sns.heatmap(df, ax = ax, cmap = 'coolwarm', center = 0)

    # Save and close via the explicit figure handle so that figures do not
    # accumulate while looping.
    fig.savefig("figures/important_features_heatmap/NK_M0151_"+feature_set+"_"+target+"_"+selection+"_top"+str(rank_threshold)+"_"+module_of_interest+".pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Enumerate pairs of KEGG Modules which share one or more reactions or contain adjacent reactions
network_M = nx.read_gml(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/SyntrophyExploration/NK_S0002/Single_Filter_N_CompressedNetwork.gml"
)

# Relabel every node with the second ":"-separated field of its original label.
mapping = {node_label: node_label.split(":")[1] for node_label in network_M.nodes}
network_M = nx.relabel_nodes(network_M, mapping)

# Keep only the modules assigned to the xenobiotics category and export that subgraph.
network_M_Xenobiotics = network_M.subgraph(
    df_maxcategory_module[
        df_maxcategory_module['category'] == '09111 Xenobiotics biodegradation and metabolism'
    ].Module
)
nx.write_gml(network_M_Xenobiotics, "networks/network_module.xenobiotics.gml")

In [None]:
# List the two end points of every edge of the xenobiotics subgraph, one edge per line.
for edge in network_M_Xenobiotics.edges:
    print(*edge[:2])

In [None]:
# Module pairs analysed below, written as "first module -> its partner
# modules" and flattened into [first, second] pairs in the listed order.
list_module_pair = [
    [first_module, second_module]
    for first_module, partner_modules in {
        "M00551": ["M00543", "M00538", "M00537"],
        "M00638": ["M00534"],
        "M00568": ["M00548", "M00637"],
        "M00539": ["M00419"],
        "M00541": ["M00418"],
        "M00569": ["M00547", "M00548", "M00637", "M00638"],
    }.items()
    for second_module in partner_modules
]

In [None]:
# For every module pair, evaluate both directions: take the KOs of one module
# and average, over those KOs, the matrix column of the other module.
list_rank_pair = []
for pair in list_module_pair:
    rank_pair = []
    for i, module in enumerate(pair):
        module_of_interest = module
        module_of_interest_the_other = pair[abs(1 - i)]  # the partner within the pair

        # Rows: KOs of the module of interest that are present in the matrix.
        ko_list = list(set(df_md_ko[df_md_ko['Module'] == module_of_interest].KO) & set(df_ko_md_toprank.index))
        df = df_ko_md_toprank.loc[ko_list, :]
        #df = df.loc[:, (df.sum(axis=0) != 0)]
        df = df.loc[:, df.sum().sort_values(ascending=False).index]

        # Column mean of the partner module over the selected KOs.
        mean_rank = df.mean()[module_of_interest_the_other]
        print(i, module, mean_rank)
        rank_pair.append(mean_rank)
    list_rank_pair.append(rank_pair)

In [None]:
# One row per module pair: the first column holds the value computed with the
# pair's first module as module of interest, the second column the reverse.
df_result = pd.DataFrame(
    data=list_rank_pair,
    columns=["central", "peripheral"],
)
df_result

In [None]:
# Scatter plot of the two columns of df_result, one point per module pair.
fig = plt.figure(figsize=(3, 3))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax.scatter(
    x=df_result['central'],
    y=df_result['peripheral'],
    alpha=0.5,
)
# Both axes share the same fixed range, 1 to 339.
ax.set(xlim=(1, 339), ylim=(1, 339))
# Log scaling is left disabled:
#ax.set_xscale("log")
#ax.set_yscale("log")