In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import sys
import os
import pandas as pd
import numpy as np
from Bio import Phylo
import seaborn as sns
from scipy.stats import t, ttest_1samp, wilcoxon, mannwhitneyu, ttest_rel, zscore, spearmanr
import json
from statsmodels.stats.multitest import multipletests
from scipy.stats import gaussian_kde
from sklearn import linear_model
import re
from matplotlib.colors import ListedColormap
import networkx as nx

In [None]:
# Notebook-wide matplotlib defaults, applied in one call.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    'figure.figsize':    [4, 3],
    'font.size':         10,
    'axes.labelcolor':   "#000000",
    'axes.linewidth':    1.0,
    'xtick.major.width': 1.0,
    'ytick.major.width': 1.0,
})

# Two qualitative colormaps kept under short names.
cmap1, cmap2 = plt.cm.tab20, plt.cm.Set3
#plt.style.use('default')

In [None]:
# Every relative path used below ("tables/...", "figures/...") is resolved against this directory.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151")

# Create the output directories. `exist_ok=True` keeps the cell re-runnable without
# hiding real failures (e.g. permission errors) the way a bare `except:` would.
for output_dir in ["figures", "tables", "networks"]:
    os.makedirs(output_dir, exist_ok=True)

### Classes of KOs

In [None]:
# Classes of KOs

# Two-column, header-less, tab-separated link tables.
df_path_ko = pd.read_table("tables/path_ko.txt", names = ['Pathway', 'KO'])
df_rn_ko   = pd.read_table("tables/rn_ko.txt",   names = ['Reaction', 'KO'])
df_md_ko   = pd.read_table("tables/md_ko.txt",   names = ['Module', 'KO'])
df_path_md = pd.read_table("tables/path_md.txt", names = ['Pathway', 'Module'])

# Path is relative to the working directory set by os.chdir at the top of the notebook;
# the context manager closes the file handle once the JSON is parsed.
with open("json/ko00001.json") as ontology_file:
    ontology = json.load(ontology_file)

# Convert the nested {'name': ..., 'children': [...]} JSON into a Bio.Phylo tree.
# An explicit stack is used instead of recursion; each entry pairs a JSON node with its clade.
root_clade = Phylo.BaseTree.Clade(name=ontology['name'])
stack = [(ontology, root_clade)]
while stack:
    term, clade = stack.pop()
    for child in term.get('children', []):
        child_clade = Phylo.BaseTree.Clade(name = child['name'])
        clade.clades.append(child_clade)
        stack.append((child, child_clade))
ontology_tree = Phylo.BaseTree.Tree(root_clade)

# (category, KO) pairs: the categories are the children of the first top-level branch,
# and the KOs are the terminal nodes below each category whose first token starts with 'K'.
# NOTE(review): the first top-level branch is presumably "Metabolism" - confirm against ko00001.json.
list_category_ko = []
for clade in ontology_tree.clade.clades[0].clades:
    for tip in clade.get_terminals():
        KO = tip.name.split()[0]
        if KO.startswith('K'):
            list_category_ko.append([clade.name, KO])
# A KO can be reached through several paths inside one category, hence the de-duplication.
df_category_ko = pd.DataFrame(list_category_ko, columns = ['category', 'KO']).drop_duplicates()

# Flag KOs that belong to exactly one category. The count column is addressed by position
# because its name differs between pandas versions ('KO' before 2.0, 'count' from 2.0).
df_ko_count = pd.DataFrame(df_category_ko.KO.value_counts())
set_ko_with_unique_category = set(df_ko_count.index[df_ko_count.iloc[:, 0] == 1])
df_category_ko['unique'] = df_category_ko.KO.isin(set_ko_with_unique_category)
df_uniquecategory_ko = df_category_ko[df_category_ko['unique']]

# color of function categories

colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3', '#555555', '#FC8D62', '#8DA0CB', '#E78AC3', '#66C2A5', '#FC8D62', '#000000']

cm_name = 'Set3'
cm = plt.get_cmap(cm_name)

# Assign each module to the category that contributes most of its KOs.
# numeric_only=True keeps the string 'KO' column out of the sum.
df_category_ko_module = pd.merge(df_category_ko, df_md_ko, on = 'KO')
df_category_ko_module['Nko'] = 1
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum(numeric_only = True)
df_maxcategory_module = df_category_module_count.loc[df_category_module_count.groupby('Module')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_module = df_maxcategory_module.reset_index().loc[:, ['category', 'Module']]

# One integer id and one colormap colour per category, in sorted category order.
df_category_color = pd.DataFrame([[category, i] for i, category in enumerate(df_maxcategory_module.category.unique())], columns = ["category", 'category_id'])
df_category_color['color'] = [mcolor.rgb2hex(cm(i)) for i in df_category_color['category_id']]
#df_category_color

# Same majority-category assignment, for pathways.
df_category_ko_pathway = pd.merge(df_category_ko, df_path_ko, on = 'KO')
df_category_ko_pathway['Nko'] = 1
df_category_pathway_count = df_category_ko_pathway.groupby(['category', 'Pathway'], as_index = False).sum(numeric_only = True)
df_maxcategory_pathway = df_category_pathway_count.loc[df_category_pathway_count.groupby('Pathway')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_pathway = df_maxcategory_pathway.reset_index().loc[:, ['category', 'Pathway']]
df_maxcategory_pathway

### Relationship between the number of features and the AUC of predicting gene gain/loss of each OG by logistic regression/random forest

In [None]:
# Interpretable features

# Column layout of the header-less AUC table; every column except the last one is a grouping key.
auc_table_columns = ['KO', 'target', 'pred_method', 'featureset', 'selec_method', 'Nfeatures', 'AUC']
df_auc_raw = pd.read_table(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_auc.mlgtdb.MPPA.Nselected.filtered.txt",
    names = auc_table_columns,
)
# Average the AUC over rows that share all six key columns.
df_auc = df_auc_raw.groupby(auc_table_columns[:-1], as_index=False).mean()
df_auc

Relationship between the number of selected features and the median AUC

In [None]:
def plot_Nfeatures_medAUC(df_auc, target, pred_method, featureset, selec_method, ax, color, label = None):
    """Draw the median AUC as a function of the number of selected features.

    Parameters
    ----------
    df_auc : pandas.DataFrame
        Must contain the columns 'target', 'pred_method', 'featureset',
        'selec_method', 'Nfeatures' and 'AUC'.
    target, pred_method, featureset, selec_method
        Values used to filter the rows of ``df_auc`` (exact equality on the
        column of the same name).
    ax
        Object with a matplotlib-style ``plot(x, y, color=..., label=...)`` method.
    color
        Line colour passed through to ``ax.plot``.
    label : optional
        Legend label passed through to ``ax.plot``.

    Returns
    -------
    None
        The line is drawn on ``ax`` as a side effect.
    """
    df_auc_ext = df_auc[(df_auc['target']==target) & (df_auc['pred_method']==pred_method) & (df_auc['featureset']==featureset) & (df_auc['selec_method']==selec_method)]

    # Take the quantile of the AUC column only: asking for quantiles of the
    # string columns (KO, target, ...) raises a TypeError on recent pandas.
    # groupby sorts its keys, so the curve is ordered by increasing Nfeatures.
    df_auc_ext_50 = df_auc_ext.groupby('Nfeatures')['AUC'].quantile(0.5).reset_index()
    ax.plot(df_auc_ext_50['Nfeatures'], df_auc_ext_50['AUC'], color= color, label = label)

# One figure per (feature set, target); each figure holds four curves, one per
# (prediction method, selection method) pair, coloured in the order of `colors`.
colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3']
for featureset in ['md', 'ec12md', 'ec12mdpath']:
    for target in ['gain', 'loss']:
        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.set_xlim(0,51)
        ax.set_ylim(0.5, 0.8)
        ax.set_xlabel("#selected features")  # was misspelled "featues" in the rendered figures
        ax.set_ylabel("Median AUC")
        ax.set_title(featureset + " " + target)

        i = 0
        for pred_method in ['LR', 'RF']:
            for selec_method in ['ANOVA', 'RandomForest']:
                plot_Nfeatures_medAUC(df_auc, target, pred_method, featureset, selec_method, ax, colors[i])
                i+=1
        fig.savefig("figures/NK_M0151_Nfeatures_medAUC_"+featureset+"_"+target+".pdf", bbox_inches = 'tight')
        plt.show()
        # Close this specific figure so repeated iterations do not accumulate open figures.
        plt.close(fig)

#### Number of features that achieves the largest AUC in each condition

In [None]:
# For every (feature set, target, prediction method, selection method) condition, print the
# number of selected features at which the median AUC across OGs is highest.
for featureset in ['md', 'ec12md', 'ec12mdpath']:
    for target in ['gain', 'loss']:
        for pred_method in ['LR', 'RF']:
            for selec_method in ['ANOVA', 'RandomForest']:

                df_auc_ext = df_auc[(df_auc['target']==target) & (df_auc['pred_method']==pred_method) & (df_auc['featureset']==featureset) & (df_auc['selec_method']==selec_method)]

                # Quantiles are taken on the AUC column only; including the string
                # columns (KO, target, ...) raises a TypeError on recent pandas.
                auc_by_nfeatures = df_auc_ext.groupby('Nfeatures')['AUC']
                df_auc_ext_percentile = pd.DataFrame({
                    'AUC_5':  auc_by_nfeatures.quantile(0.05),
                    'AUC_50': auc_by_nfeatures.quantile(0.5),
                    'AUC_95': auc_by_nfeatures.quantile(0.95),
                }).reset_index()
                #print(target, pred_method, selec_method, featureset, df_auc_ext_percentile.sort_values('AUC_50', ascending=False))
                print(featureset, target, pred_method, selec_method, df_auc_ext_percentile.sort_values("AUC_50", ascending = False).reset_index(drop=True)["Nfeatures"][0])

In [None]:
# Median-AUC curves split by functional category (RF predictor, ANOVA selection).
# The same panel is rendered twice per target, differing only in figure size and file suffix.

# The category-annotated table does not depend on the loop variables, so it is built once.
df_auc_category = pd.merge(df_auc, df_category_ko, on = 'KO')
categories = sorted(set(df_auc_category.category))
pred_method, selec_method = 'RF', 'ANOVA'

figure_variants = [
    ((3,3), ".pdf"),        # for the original paper
    ((2,2), ".small.pdf"),  # for the master's thesis
]

for featureset in ['md']:
    for target in ['gain', 'loss']:
        for figsize, suffix in figure_variants:
            fig = plt.figure(figsize=figsize)
            ax = fig.add_axes([0.1,0.1,0.8,0.8])
            ax.set_xlim(0,51)
            ax.set_ylim(0.55, 0.85)
            ax.set_xlabel("#selected features")  # was misspelled "featues" in the rendered figures
            ax.set_ylabel("Median AUC")
            ax.set_title(featureset + " " + target)

            # One curve per category; the category's position in sorted order picks the colormap colour.
            for i, category in enumerate(categories):
                plot_Nfeatures_medAUC(df_auc_category[df_auc_category.category == category], target, pred_method, featureset, selec_method, ax, mcolor.rgb2hex(cm(i)), label = category)
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left',borderaxespad=0,)

            fig.savefig("figures/NK_M0151_Nfeatures_medAUC_"+featureset+"_"+target+"_category_RF_ANOVA"+suffix, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
def sparsemtx2mtx(X, Y, Z):
    """Pivot (row, column, value) triples into a dense, zero-filled DataFrame.

    Parameters
    ----------
    X, Y, Z : iterables of equal length
        ``X[k]`` and ``Y[k]`` are the row and column labels of value ``Z[k]``.

    Returns
    -------
    pandas.DataFrame
        Rows are the sorted unique values of ``X``, columns the sorted unique
        values of ``Y``. Cells without a triple are 0.0; when a (row, column)
        pair occurs more than once, the last value wins.
    """
    X_uniq_list = sorted(set(X))
    Y_uniq_list = sorted(set(Y))
    # Label -> position maps give O(1) lookups instead of a list.index scan per triple.
    x_position = {x: idx for idx, x in enumerate(X_uniq_list)}
    y_position = {y: idx for idx, y in enumerate(Y_uniq_list)}

    Z_matrix = np.zeros((len(X_uniq_list), len(Y_uniq_list)))
    for x, y, z in zip(X, Y, Z):
        Z_matrix[x_position[x], y_position[y]] = z
    return pd.DataFrame(Z_matrix, index = X_uniq_list, columns = Y_uniq_list,)

In [None]:
# Clustered heatmap of AUC (OGs x number of selected features) for one fixed
# condition (md features, ANOVA selection, RF predictor), drawn once per target.
selec_method, pred_method, featureset = 'ANOVA', 'RF', 'md'
xtick_values = np.array([1, 10, 20, 30, 40, 50])

for target in ['gain', 'loss']:

    in_condition = (
        (df_auc['target']==target)
        & (df_auc['pred_method']==pred_method)
        & (df_auc['featureset']==featureset)
        & (df_auc['selec_method']==selec_method)
    )
    df_auc_ext = df_auc[in_condition]

    df_KO_Nfeatures_AUC = sparsemtx2mtx(df_auc_ext.KO, df_auc_ext.Nfeatures, df_auc_ext.AUC)

    # Rows (OGs) are clustered; columns keep their Nfeatures order.
    g = sns.clustermap(df_KO_Nfeatures_AUC, col_cluster=False, method = 'ward', metric = 'euclidean', cmap = 'coolwarm', figsize = (5,5), vmin = 0.1, vmax = 0.9)

    # NOTE(review): axes[2] is treated as the heatmap panel and axes[3] as the colour bar;
    # this relies on seaborn's axes creation order - confirm for the installed version.
    heatmap_ax = g.fig.axes[2]
    heatmap_ax.set_yticks([])
    heatmap_ax.set_xticks(xtick_values-0.5)  # shift by half a cell so ticks sit at cell centres
    heatmap_ax.set_xticklabels(xtick_values)
    heatmap_ax.set_ylabel(str(len(df_KO_Nfeatures_AUC.index)) + " OGs")
    heatmap_ax.set_xlabel("# selected features")
    heatmap_ax.set_title(featureset + " " + target + " " + selec_method + " " + pred_method)
    g.fig.axes[3].set_title("AUC")

    plt.savefig("figures/NK_M0151_KO_Nfeatures_AUC_"+featureset+"_"+target+".pdf", bbox_inches = 'tight')
    #plt.show()
    plt.close()

In [None]:
# Condition under inspection.
target, selec_method, pred_method, featureset = 'gain', 'RandomForest', 'RF', 'md'

in_condition = (
    (df_auc['target']==target)
    & (df_auc['pred_method']==pred_method)
    & (df_auc['featureset']==featureset)
    & (df_auc['selec_method']==selec_method)
)
df_auc_ext = df_auc[in_condition]

# For each KO keep the single row (i.e. the Nfeatures setting) with the highest AUC.
best_row_per_ko = df_auc_ext.groupby("KO")["AUC"].idxmax()
df_auc_ext_max_AUC = df_auc_ext.loc[best_row_per_ko, :]

In [None]:
# by category
cm_name = 'Set3'
cm = plt.get_cmap(cm_name)

# Condition is stated explicitly here instead of relying on a `featureset` value
# left behind by whichever cell happened to run before this one.
featureset, pred_method, selec_method = 'md', 'RF', 'ANOVA'

df_auc_category = pd.merge(df_auc, df_category_ko, on = 'KO')
categories = sorted(set(df_auc_category.category))

for target in ['gain', 'loss']:
    fig = plt.figure(figsize=(2,1))

    for i, category in enumerate(categories):
        # Panels are stacked downwards: each axes is shifted by one full figure height.
        ax = fig.add_axes([0.1,0.1-i,0.8,0.8], label = category)
        ax.set_xlim(0,51)
        if i == len(categories) - 1:
            ax.set_xlabel("Optimal # features")  # only the bottom panel carries the x label
        else:
            ax.set_xticklabels([])
        ax.set_ylabel("# OGs")
        #ax.set_title(category)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        df_auc_category_ext = df_auc_category[(df_auc_category['target']==target) & (df_auc_category['pred_method']==pred_method) & (df_auc_category['featureset']==featureset) & (df_auc_category['selec_method']==selec_method) & (df_auc_category['category']==category)]
        # Per KO, the row with the highest AUC; its Nfeatures is that KO's optimal feature count.
        df_auc_ext_max_AUC = df_auc_category_ext.loc[df_auc_category_ext.groupby("KO")["AUC"].idxmax(), :]
        ax.hist(df_auc_ext_max_AUC.Nfeatures, range= (0,50), bins = 25, histtype= 'stepfilled', color = mcolor.rgb2hex(cm(i)), alpha = 0.8)

    fig.savefig("figures/NK_M0151_Nfeatures_of_maxAUC_"+featureset+"_"+target+"_category.pdf", bbox_inches = 'tight')
    plt.show()
    plt.close(fig)

In [None]:
# Histogram of the per-OG optimal number of features (gain vs loss) for every
# (prediction method, selection method) pair, plus one table per condition.

# Stated explicitly instead of relying on a value left behind by an earlier cell.
featureset = 'md'

for pred_method in ['LR','RF']:
    for selec_method in ['ANOVA','RandomForest']:

        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.set_xlim(0,51)
        ax.set_xlabel("Optimal # features")
        ax.set_ylabel("# OGs")
        ax.set_title(featureset+" "+pred_method+" "+selec_method, )

        for target, color in [('gain', '#9FA6F1'), ('loss', '#E1BB63')]:
            df_auc_ext = df_auc[(df_auc['target']==target) & (df_auc['pred_method']==pred_method) & (df_auc['featureset']==featureset) & (df_auc['selec_method']==selec_method)]

            # Per KO, the row with the highest AUC. The copy makes the column
            # assignment below act on an independent frame, not on a slice of df_auc.
            df_auc_ext_max_AUC = df_auc_ext.loc[df_auc_ext.groupby("KO")["AUC"].idxmax(), :].copy()

            # Median AUC per Nfeatures, computed on the AUC column only (quantiles of
            # the string columns raise a TypeError on recent pandas).
            df_auc_ext_50 = df_auc_ext.groupby('Nfeatures')['AUC'].quantile(0.5).reset_index()
            # Single condition-wide optimum: the Nfeatures with the highest median AUC.
            df_auc_ext_max_AUC['optNfeatures_for_medAUC'] = int(df_auc_ext_50.loc[df_auc_ext_50["AUC"].idxmax(), "Nfeatures"])

            ax.hist(df_auc_ext_max_AUC.Nfeatures, range= (0,50), bins = 50, histtype= 'step', color = color, alpha = 0.8)

            df_auc_ext_max_AUC[['KO','target','optNfeatures_for_medAUC','Nfeatures']].to_csv("tables/KO_target_optNfeatures."+target+"_"+pred_method+"_"+selec_method+".txt", index=False, sep = '\t')

        fig.savefig("figures/NK_M0151_Nfeatures_of_maxAUC_"+featureset+"_category_"+pred_method+"_"+selec_method+".pdf", bbox_inches = 'tight')
        plt.show()
        plt.close(fig)

In [None]:
target = 'gain'
selec_method = 'ANOVA'
pred_method = 'RF'
featureset = 'md'

df_auc_ext = df_auc[(df_auc['target']==target) & (df_auc['pred_method']==pred_method) & (df_auc['featureset']==featureset) & (df_auc['selec_method']==selec_method)]
# Per KO, the row with the highest AUC; its Nfeatures is that KO's optimal feature count.
df_auc_ext_max_AUC = df_auc_ext.loc[df_auc_ext.groupby("KO")["AUC"].idxmax(), :]

# Number of reactions / modules each KO is linked to. groupby().size() yields the same
# counts as value_counts() but with column names that do not depend on the pandas
# version (value_counts().reset_index() changed its column names in pandas 2.0).
df_ko_Nreactions = df_rn_ko.groupby('KO').size().reset_index(name = 'Nreactions')
df_ko_Nmodules   = df_md_ko.groupby('KO').size().reset_index(name = 'Nmodules')
df_auc_ext_max_AUC_Nreaction = pd.merge(df_auc_ext_max_AUC, df_ko_Nreactions, on = 'KO')
df_auc_ext_max_AUC_Nmodule   = pd.merge(df_auc_ext_max_AUC, df_ko_Nmodules, on = 'KO')

promiscuity_order = ['1', '2-10', '11-20', '>20']

def promiscuity_bin(Nreaction):
    """Return the promiscuity class label for a KO linked to `Nreaction` reactions."""
    if Nreaction == 1:
        return '1'
    if Nreaction <= 10:
        return '2-10'
    if Nreaction <= 20:
        return '11-20'
    return '>20'

fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
df_auc_ext_max_AUC_Nreaction['promiscuity'] = [promiscuity_bin(Nreaction) for Nreaction in df_auc_ext_max_AUC_Nreaction.Nreactions]
# Draw on `ax` explicitly instead of relying on it being the current axes.
sns.boxplot(data = df_auc_ext_max_AUC_Nreaction, y = 'Nfeatures', x = 'promiscuity', linewidth=0.5, color = '#FFFFFF', order = promiscuity_order, ax = ax)
sns.stripplot(data = df_auc_ext_max_AUC_Nreaction, y = 'Nfeatures', x = 'promiscuity', linewidth=0.5, color = '#FF0000', alpha = 0.1, s = 1, order = promiscuity_order, jitter = 0.3, ax = ax)
ax.set_xlabel("Promiscuity (#reactions involved)")
ax.set_ylabel("Optimal #features")
ax.set_title(featureset + " " + target + " " + selec_method + " " + pred_method)
fig.savefig("figures/NK_M0151_optNfeatures_Nreactions_"+target+"_"+featureset+".pdf", bbox_inches = 'tight')
plt.close(fig)

In [None]:
# Optimal number of features against the number of modules a KO belongs to.
fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

# Arguments shared by the box layer and the overlaid point layer.
nmodule_axes_args = dict(data = df_auc_ext_max_AUC_Nmodule, y = 'Nfeatures', x = 'Nmodules')
sns.boxplot(linewidth=0.5, color = '#FFFFFF', **nmodule_axes_args)
sns.stripplot(linewidth=0, color = '#000000', size = 1, alpha=.3, jitter=.3, dodge=True, **nmodule_axes_args)

ax.set_xlabel("#modules involved")
ax.set_ylabel("Optimal #features")
ax.set_title(" ".join([featureset, target, selec_method, pred_method]))

output_pdf = "figures/NK_M0151_optNfeatures_Nmodules_"+target+"_"+featureset+".pdf"
plt.savefig(output_pdf, bbox_inches = 'tight')
plt.close()

In [None]:
# Distribution of the optimal number of features per functional category.
df_auc_ext_max_AUC_category = pd.merge(df_auc_ext_max_AUC, df_category_ko, on = 'KO')
category_order = sorted(set(df_auc_ext_max_AUC_category['category']))

# Arguments shared by the violin layer and the overlaid point layer.
by_category_args = dict(data = df_auc_ext_max_AUC_category, y = 'category', x = 'Nfeatures', hue = 'target', order = category_order, orient = 'h')
sns.violinplot(linewidth=0.5, color = '#FFFFFF', alpha = 0, **by_category_args)
sns.stripplot(linewidth=0, size = 1, alpha=.5, jitter=0.3, dodge=True, palette=['#FF0000'], **by_category_args)
# The figure is closed without being saved or shown.
plt.close()

In [None]:
# Number of categories each KO is assigned to. groupby().size() gives the same counts as
# value_counts() with column names that do not depend on the pandas version
# (value_counts().reset_index() changed its column names in pandas 2.0).
df_ko_Ncategories = df_category_ko.groupby('KO').size().reset_index(name = 'Ncategories')
df_auc_ext_max_AUC_Ncategory = pd.merge(df_auc_ext_max_AUC, df_ko_Ncategories, on = 'KO')

fig = plt.figure(figsize=(3,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
# Draw on `ax` explicitly instead of relying on it being the current axes.
sns.boxplot(data = df_auc_ext_max_AUC_Ncategory, y = 'Nfeatures', x = 'Ncategories', linewidth=0.5, color = '#FFFFFF', ax = ax)
sns.stripplot(data = df_auc_ext_max_AUC_Ncategory, y = 'Nfeatures', x = 'Ncategories', linewidth=0,   color = '#000000', size = 1, alpha=.3, jitter=0.3, dodge=True, ax = ax)
# The figure is closed without being saved or shown.
plt.close(fig)

In [None]:
# Heatmap of selected features: Module vs Module

rank_threshold = 14
featureset = 'md'
selection  = 'ANOVA'
selected_column = 'selected_top'+str(rank_threshold)
rank_column     = 'score_rank_top'+str(rank_threshold)

# ---- Inputs that do not depend on the target: loaded / derived once, outside the loop ----

# Path is relative to the working directory set by os.chdir at the top of the notebook.
df_feature_target_method_ko_selectionscore = pd.read_table("result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
df_feature_target_method_ko_selectionscore['abs_score'] = df_feature_target_method_ko_selectionscore['score'].abs()

df_md_ko = pd.read_table("tables/md_ko.txt", names = ['Module','KO'])

# Assign each module to the category that contributes most of its KOs.
# numeric_only=True keeps the string 'KO' column out of the sum.
df_category_ko_module = pd.merge(df_category_ko, df_md_ko, on = 'KO')
df_category_ko_module['Nko'] = 1
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum(numeric_only = True)
df_maxcategory_module = df_category_module_count.loc[df_category_module_count.groupby('Module')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_module = df_maxcategory_module.reset_index().loc[:, ['category', 'Module']]

df_category_color2 = pd.DataFrame([[category, i] for i, category in enumerate(df_maxcategory_module.category.unique())], columns = ["category", 'category_id'])
df_maxcategory_module_color = pd.merge(df_maxcategory_module, df_category_color2)

# user-defined colormap: one colour per category id. The colour strips below pin
# vmin/vmax to the full id range so that an id always maps to the same colour, even
# if some category is absent from the rows or columns of a given heatmap.
n_categories = len(df_category_color2)
cmap = ListedColormap([cm(i) for i in range(n_categories)], name="custom")
category_id_range = dict(vmin = 0, vmax = max(n_categories - 1, 1))

for target in ['gain', 'loss']:

    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset']==featureset) &
            (df_feature_target_method_ko_selectionscore['target']    ==target)     &
            (df_feature_target_method_ko_selectionscore['selection'] ==selection)
            ]
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
    # Rank features within each KO by decreasing |score| (rank 1 = most important).
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)

    # NOTE(review): the strict '<' marks ranks 1 .. rank_threshold-1 as selected, i.e. one
    # feature fewer than the 'top<rank_threshold>' column name suggests - confirm this is intended.
    df_feature_target_method_ko_selectionscore_ext[selected_column] = \
        df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] < rank_threshold
    df_feature_target_method_ko_selectionscore_ext[rank_column] = \
        (rank_threshold - df_feature_target_method_ko_selectionscore_ext['score_rank_descending']).clip(lower = 0)

    # Attach to every KO the module(s) it belongs to, then average per (module of KO, feature).
    # numeric_only=True is required because the frame still carries string columns
    # (featureset, KO), for which a mean raises a TypeError on recent pandas.
    df_feature_target_method_ko_selectionscore_MD = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_md_ko, on = 'KO').rename(columns = {'Module':'Module_of_KO'})
    df_feature_target_method_md_selectionscore_MD = df_feature_target_method_ko_selectionscore_MD.groupby(['target', 'selection', 'Module_of_KO', 'feature'], as_index= False).mean(numeric_only = True)

    # Matrix: rows = module containing the predicted KOs, columns = predictor module,
    # value = mean of the boolean selection flag over the KOs of the row module.
    df_md_md_rank = sparsemtx2mtx(
        df_feature_target_method_md_selectionscore_MD.Module_of_KO,
        df_feature_target_method_md_selectionscore_MD.feature,
        df_feature_target_method_md_selectionscore_MD[selected_column]
        )

    # Order rows and columns by category, then by module id within a category.
    # Modules without a category assignment are dropped by the inner merge.
    rows = pd.merge(pd.DataFrame({'Module': list(df_md_md_rank.index)}), df_maxcategory_module_color).sort_values(['category', 'Module'])
    columns = pd.merge(pd.DataFrame({'Module': list(df_md_md_rank.columns)}), df_maxcategory_module_color).sort_values(['category', 'Module'])

    # plot a figure
    df_md_md_rank = df_md_md_rank.loc[rows.Module, columns.Module]

    fig = plt.figure(figsize=(5,5))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax2 = fig.add_axes([0.93,0.1,0.05,0.3])
    sns.heatmap(df_md_md_rank, ax = ax, cbar_ax = ax2, cmap = 'inferno')
    ax.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    # NOTE(review): this relabels whatever ticks the colour bar chose with four fixed
    # labels; the labels are only correct if the ticks sit at exactly these values - confirm.
    ax2.set_yticklabels([0, 0.25, 0.5, 1])

    # y-axis
    ax3 = fig.add_axes([0.05,0.1,0.045,0.8])
    sns.heatmap([[color_id] for color_id in rows.category_id], ax = ax3, cbar = False, cmap = cmap, **category_id_range)
    ax3.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    ax3.set_ylabel("Modules including predicted OGs")

    # x-axis
    ax4 = fig.add_axes([0.1,0.05,0.8,0.045])
    sns.heatmap([columns.category_id], ax = ax4, cbar = False, cmap = cmap, **category_id_range)
    ax4.tick_params(bottom = False, left = False, labelbottom = False, labelleft = False)
    ax4.set_xlabel("Modules as predictors")

    fig.savefig("figures/NK_M0151_selected_features_heatmap_"+featureset+"_"+target+"_"+selection+".pdf")
    plt.close(fig)

In [None]:
# Number of distinct KOs that have at least one selection score for the 'loss' target.
is_loss_row = df_feature_target_method_ko_selectionscore["target"] == "loss"
len(set(df_feature_target_method_ko_selectionscore.loc[is_loss_row, "KO"]))

In [None]:
# Distribution of important features by functional categories

N_MODULE_FEATURES = 339   # number of KEGG Module features, as stated in the x-axis label below

df_test_result_list = []

# The score table is identical for every condition, so it is read once.
# Path is relative to the working directory set by os.chdir at the top of the notebook.
df_feature_target_method_ko_selectionscore = pd.read_table("result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
df_feature_target_method_ko_selectionscore['abs_score'] = df_feature_target_method_ko_selectionscore['score'].abs()

for target, rank_threshold, featureset, selection in [('gain', 14, 'md', 'ANOVA'), ('gain', 50, 'md', 'RandomForest'), ('loss', 8, 'md', 'ANOVA'), ('loss', 14, 'md', 'RandomForest')]:

    selected_column    = 'selected_top'+str(rank_threshold)
    rank_column        = 'score_rank_top'+str(rank_threshold)
    signed_rank_column = 'signed_score_rank_top'+str(rank_threshold)

    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset'] == featureset) &
            (df_feature_target_method_ko_selectionscore['target']       == target)     &
            (df_feature_target_method_ko_selectionscore['selection']   == selection)
            ]
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
    # Rank features within each KO by decreasing |score| (rank 1 = most important).
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)
    df_ko_feature_scorerank = df_feature_target_method_ko_selectionscore_ext.loc[:, ['KO', 'feature', 'score_rank_descending', 'score']]

    # select top-N features
    # NOTE(review): the strict '<' marks ranks 1 .. rank_threshold-1 as selected, i.e. one
    # feature fewer than the 'top<rank_threshold>' column name suggests - confirm this is intended.
    df_feature_target_method_ko_selectionscore_ext[selected_column] = \
        df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] < rank_threshold
    df_feature_target_method_ko_selectionscore_ext[rank_column] = \
        (rank_threshold - df_feature_target_method_ko_selectionscore_ext['score_rank_descending']).clip(lower = 0)
    # np.sign gives 0 for a zero score, which is what the former score/abs_score
    # division produced after its NaN (0/0) was replaced by the fillna(0) below.
    df_feature_target_method_ko_selectionscore_ext[signed_rank_column] = \
        df_feature_target_method_ko_selectionscore_ext[rank_column] * np.sign(df_feature_target_method_ko_selectionscore_ext['score'])

    # Keep only KOs assigned to exactly one category, then sum per (category, feature).
    # numeric_only=True keeps the string columns (featureset, target, selection, KO) out of the sum.
    df_feature_target_method_ko_selectionscore_ext = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_uniquecategory_ko, on = 'KO')
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.fillna(0)
    df_feature_target_method_ko_selectionscore_ext_category_sum = df_feature_target_method_ko_selectionscore_ext.groupby(['category', 'feature'], as_index=False).sum(numeric_only = True)

    # Plot 1: Important features for predicting gain/loss of each KEGG category
    fig = plt.figure(figsize=(5,0.5))
    list_test_result = []
    last_panel = len(df_category_color) - 1
    for i, category in enumerate(df_category_color.category):
        df = df_feature_target_method_ko_selectionscore_ext_category_sum[df_feature_target_method_ko_selectionscore_ext_category_sum['category'] == category]
        # After this merge: category_x = category of the predicted KOs,
        #                   category_y = majority category of the feature module.
        df = pd.merge(df, df_maxcategory_module, left_on = 'feature', right_on = 'Module').sort_values('category_y')
        df = pd.merge(df, df_category_color, left_on = 'category_y', right_on = 'category')

        # testing enrichment to the same category
        same_category = list(df[df.category_x == df.category_y][selected_column])
        different_category = list(df[df.category_x != df.category_y][selected_column])
        # alternative=None is rejected by current SciPy; in releases that accepted it, it
        # returned half of the two-sided p-value. The one-sided 'greater' test states the
        # enrichment hypothesis (same-category counts larger) explicitly.
        # NOTE(review): switch to 'two-sided' if a direction-agnostic test is wanted.
        list_test_result.append([category, target, rank_threshold, featureset, selection, mannwhitneyu(same_category, different_category, use_continuity=True, alternative='greater').pvalue])

        # Panels are stacked downwards: each axes is shifted by one full figure height.
        ax = fig.add_axes([0.1,0.1-i,0.8,0.8])
        ax.set_xlim(0,N_MODULE_FEATURES + 1)
        ax.bar(x = df.feature, height = df[selected_column], color = df.color)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.tick_params(labelbottom=False, bottom=False)
        #ax.set_title(category, x = -0.6, y = 0)
        ax.text(-0.1,0.45,category,c=df_category_color.color[i],ha='right',transform=ax.transAxes)
        #ax.set_ylim(0,130)
        if (i==last_panel): ax.set_xlabel("Features (339 KEGG Modules)")
        if (i==0):  ax.set_title("#OGs", x=-0.05, fontsize= 10)
        if (i==0):  ax.text(0.3,1.5,featureset+" "+target+" "+selection+" top "+str(rank_threshold),ha='left',transform=ax.transAxes)
    fig.savefig("figures/NK_M0151_nKOs_features_"+featureset+"_"+target+"_"+selection+"_top"+str(rank_threshold)+".pdf", bbox_inches='tight')
    plt.close(fig)

    # Benjamini-Hochberg correction across the categories of this condition.
    df_test_result = pd.DataFrame(list_test_result, columns = ["category", "target", "rank_threshold", "featureset", "selection_method", "p"])
    df_test_result ['q'] =  list(multipletests(list(df_test_result.p), method = "fdr_bh")[1])
    df_test_result['significant'] = ['*' if q<0.05 else '' for q in df_test_result.q]

    df_test_result_list.append(df_test_result)

    # Plot 2: Rank distribution of feature importance
    df_ko_md_feature_scorerank = pd.merge(df_md_ko, df_ko_feature_scorerank, on = 'KO').loc[:, ['Module', 'KO', 'feature', 'score', 'score_rank_descending']]

    # Keep only rows where the feature is a module the KO itself belongs to, then take the
    # per-KO median. numeric_only=True keeps the string columns (Module, feature) out of the median.
    df_ko_md_feature_scorerank_same_md = df_ko_md_feature_scorerank[df_ko_md_feature_scorerank['Module']==df_ko_md_feature_scorerank['feature']].reset_index()
    df_ko_md_feature_scorerank_same_md_med = df_ko_md_feature_scorerank_same_md.groupby('KO').median(numeric_only = True)
    df_ko_md_feature_scorerank_same_md_med["sign"] = np.sign(df_ko_md_feature_scorerank_same_md_med["score"])
    # Convert the rank to a percentage in which larger values mean higher importance.
    df_ko_md_feature_scorerank_same_md_med["score_rank_descending_norm"] = 100 - df_ko_md_feature_scorerank_same_md_med["score_rank_descending"]/N_MODULE_FEATURES * 100

    df_ko_md_feature_scorerank_same_md_med_positive  = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] > 0]
    df_ko_md_feature_scorerank_same_md_med_zero       = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] == 0]
    df_ko_md_feature_scorerank_same_md_med_negative = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] < 0]

    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.hist([df_ko_md_feature_scorerank_same_md_med_zero["score_rank_descending_norm"], df_ko_md_feature_scorerank_same_md_med_negative["score_rank_descending_norm"], df_ko_md_feature_scorerank_same_md_med_positive["score_rank_descending_norm"]], color = ["#CCCCCC", "#99FCFE", "#EE7BA6", ], range=(0,100), bins =20, density=True, lw=2, stacked=True, alpha = 0.5)
    #ax.set_xlim(-5,105)
    ax.set_xlabel("Importance rank of\nbelonging module (%)")
    ax.set_ylabel("Proportion of OGs")
    ax.set_title(featureset+" "+target+" "+selection)
    fig.savefig("figures/NK_M0151_feature_importance_"+featureset+"_"+target+"_"+selection+".pdf", bbox_inches='tight')
    # Display, then release the figure so the four iterations do not pile up open figures.
    plt.show()
    plt.close(fig)

In [None]:
target, rank_threshold, featureset, selection = 'gain', 14, 'md', 'ANOVA'

# NOTE(review): `df` is not built in this cell. It is whatever the preceding loop left
# behind (its last condition and last category), which may not match the
# gain / ANOVA condition declared on the line above - confirm before citing the number.
xenobiotics = "09111 Xenobiotics biodegradation and metabolism"
predicted_is_xenobiotics = df["category_x"] == xenobiotics
feature_is_xenobiotics = df["category_y"] == xenobiotics

# Share of the summed 'selected_top14' values that falls on features of the same category.
Ntotal = df[predicted_is_xenobiotics]["selected_top14"].sum()
Nsame_category = df[predicted_is_xenobiotics & feature_is_xenobiotics]["selected_top14"].sum()
Nsame_category / Ntotal

In [None]:
# For every functional category: of the top-ranked feature selections summed over the
# OGs of that category, which fraction falls on modules assigned to the same category?
target, rank_threshold, featureset, selection = 'gain', 14, 'md', 'ANOVA'
# All threshold-dependent column names are derived from rank_threshold, so the summary
# at the end keeps working when the threshold is changed (it used to hard-code
# "selected_top14" and would raise KeyError for any other threshold).
selected_col = 'selected_top' + str(rank_threshold)
score_rank_top_col = 'score_rank_top' + str(rank_threshold)
signed_score_rank_top_col = 'signed_score_rank_top' + str(rank_threshold)

df_feature_target_method_ko_selectionscore = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])

df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])
# Keep only the rows of the requested feature set / target / selection method
df_feature_target_method_ko_selectionscore_ext = \
    df_feature_target_method_ko_selectionscore[
        (df_feature_target_method_ko_selectionscore['featureset'] == featureset) &
        (df_feature_target_method_ko_selectionscore['target']       == target)     &
        (df_feature_target_method_ko_selectionscore['selection']   == selection)
        ]
df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
# Rank the features of each KO by |score| (rank 1 = largest absolute score)
df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)
df_ko_feature_scorerank = df_feature_target_method_ko_selectionscore_ext.loc[:, ['KO', 'feature', 'score_rank_descending', 'score']]

# select top-N features
# (strict '<': a feature is selected when its rank is below rank_threshold)
df_feature_target_method_ko_selectionscore_ext[selected_col] = \
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] < rank_threshold
df_feature_target_method_ko_selectionscore_ext[score_rank_top_col] = \
    [max(rank_threshold - rank, 0) for rank in df_feature_target_method_ko_selectionscore_ext['score_rank_descending']]
# score / abs_score is the sign of the score; a zero score gives 0/0 = NaN, which the
# fillna(0) below turns into 0.
df_feature_target_method_ko_selectionscore_ext[signed_score_rank_top_col] = df_feature_target_method_ko_selectionscore_ext[score_rank_top_col] * df_feature_target_method_ko_selectionscore_ext['score'] / df_feature_target_method_ko_selectionscore_ext['abs_score']

# Attach the category of each KO and sum everything per (category, feature)
df_feature_target_method_ko_selectionscore_ext = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_uniquecategory_ko, on = 'KO')
df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.fillna(0)
df_feature_target_method_ko_selectionscore_ext_category_sum = df_feature_target_method_ko_selectionscore_ext.groupby(['category', 'feature'], as_index=False).sum()

# After the first merge, category_x is the 'category' column of the summed table and
# category_y the 'category' column of df_maxcategory_module (matched on the module).
df = df_feature_target_method_ko_selectionscore_ext_category_sum
df = pd.merge(df, df_maxcategory_module, left_on = 'feature', right_on = 'Module').sort_values('category_y')
df = pd.merge(df, df_category_color, left_on = 'category_y', right_on = 'category')

for category in df_category_color.category:
    is_category_x = df["category_x"] == category
    Ntotal = df.loc[is_category_x, selected_col].sum()
    Nsame_category = df.loc[is_category_x & (df["category_y"] == category), selected_col].sum()
    print(category, Nsame_category / Ntotal)

In [None]:
# Rank distribution of feature importance of the module each OG belongs to,
# split by the sign of the score (one histogram per target / selection setting)

# The score table does not depend on the loop variables, so it is read once.
df_feature_target_method_ko_selectionscore = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])

# rank_threshold is part of the setting tuples but is not used in this cell.
for target, rank_threshold, featureset, selection in [('gain', 14, 'md', 'ANOVA'), ('loss', 8, 'md', 'ANOVA')]:

    # Keep only the rows of the current feature set / target / selection method
    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset'] == featureset) &
            (df_feature_target_method_ko_selectionscore['target']       == target)     &
            (df_feature_target_method_ko_selectionscore['selection']   == selection)
            ]
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
    # Rank the features of each KO by |score| (rank 1 = largest absolute score)
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)
    df_ko_feature_scorerank = df_feature_target_method_ko_selectionscore_ext.loc[:, ['KO', 'feature', 'score_rank_descending', 'score']]

    # Plot 3: Rank distribution of feature importance + positive/negative effect
    df_ko_md_feature_scorerank = pd.merge(df_md_ko, df_ko_feature_scorerank, on = 'KO').loc[:, ['Module', 'KO', 'feature', 'score', 'score_rank_descending']]

    # Keep only the rows where the predictor feature is a module the KO itself belongs to
    df_ko_md_feature_scorerank_same_md = df_ko_md_feature_scorerank[df_ko_md_feature_scorerank['Module']==df_ko_md_feature_scorerank['feature']].reset_index()
    # numeric_only=True: the frame still carries the string columns 'Module' and 'feature';
    # recent pandas versions raise a TypeError when asked for the median of those instead
    # of silently dropping them.
    df_ko_md_feature_scorerank_same_md_med = df_ko_md_feature_scorerank_same_md.groupby('KO').median(numeric_only=True)
    df_ko_md_feature_scorerank_same_md_med["sign"] = np.sign(df_ko_md_feature_scorerank_same_md_med["score"])
    # Convert the rank to a percentage where 100 is the most important end.
    # 339 is presumably the number of module features (largest possible rank) -- TODO confirm.
    df_ko_md_feature_scorerank_same_md_med["score_rank_descending_norm"] = 100 - df_ko_md_feature_scorerank_same_md_med["score_rank_descending"]/339 * 100

    df_ko_md_feature_scorerank_same_md_med_positive  = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] > 0]
    df_ko_md_feature_scorerank_same_md_med_zero       = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] == 0]
    df_ko_md_feature_scorerank_same_md_med_negative = df_ko_md_feature_scorerank_same_md_med[df_ko_md_feature_scorerank_same_md_med["sign"] < 0]

    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    # Stacked density histogram; stacking order (bottom to top): zero, negative, positive
    ax.hist(
        [
            df_ko_md_feature_scorerank_same_md_med_zero["score_rank_descending_norm"],
            df_ko_md_feature_scorerank_same_md_med_negative["score_rank_descending_norm"],
            df_ko_md_feature_scorerank_same_md_med_positive["score_rank_descending_norm"],
        ],
        color = ["#CCCCCC", "#99FCFE", "#EE7BA6", ],
        range=(0,100), bins =20, density=True, lw=2, stacked=True, alpha = 0.5
    )
    ax.set_xlabel("Importance rank of\nbelonging module (%)")
    ax.set_ylabel("Proportion of OGs")
    ax.set_title(featureset+" "+target+" "+selection)
    # The figure is saved but not closed here.
    fig.savefig("figures/NK_M0151_feature_importance_"+featureset+"_"+target+"_"+selection+".pdf", bbox_inches='tight')

In [None]:
# Plot 3: Rank distribution of feature importance + positive/negative effect
# For each (target, selection) setting this draws a two-panel figure:
#   top    (ax) : histogram of the importance rank of the module each OG belongs to
#   bottom (ax2): per-bin proportion of zero / negative / positive scores
# and saves it under figures/.

# rank_threshold is part of the setting tuples but is not used in this cell.
for target, rank_threshold, featureset, selection in [('gain', 14, 'md', 'ANOVA'), ('loss', 8, 'md', 'ANOVA')]:
    # The score table is re-read on every iteration (it does not depend on the loop variables).
    df_feature_target_method_ko_selectionscore = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt", names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
    df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])
    #    extract dataframe
    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset'] == featureset) &
            (df_feature_target_method_ko_selectionscore['target']       == target)     &
            (df_feature_target_method_ko_selectionscore['selection']   == selection)
            ]
    #    ranking features (rank 1 = largest |score| within each KO)
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)
    df_ko_feature_scorerank = df_feature_target_method_ko_selectionscore_ext.loc[:, ['KO', 'feature', 'score_rank_descending', 'score']]
    #    enumerate corresponding features (attach to every KO the modules it belongs to)
    df_ko_md_feature_scorerank = pd.merge(df_md_ko, df_ko_feature_scorerank, on = 'KO').loc[:, ['Module', 'KO', 'feature', 'score', 'score_rank_descending']]

    # same_md is True where the predictor feature is a module the KO itself belongs to
    df_ko_md_feature_scorerank["same_md"]  = [m1==m2 for m1, m2 in zip(df_ko_md_feature_scorerank["Module"], df_ko_md_feature_scorerank["feature"])]
    df_ko_md_feature_scorerank["score_sign"] = np.sign(df_ko_md_feature_scorerank["score"])
    df_ko_md_feature_scorerank_same_md = df_ko_md_feature_scorerank[df_ko_md_feature_scorerank["same_md"]]

    # Top panel: histogram of the own-module ranks
    fig = plt.figure(figsize=(2,1.8))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.hist(df_ko_md_feature_scorerank_same_md["score_rank_descending"], bins = 20, color = "#009193")
    # plt.gca() is the current axes of the current figure; at this point that is `ax`
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    ax.set_ylabel("# OGs")
    # Reversed limits: rank 339 on the left, rank 1 (most important) on the right.
    # 339 is presumably the number of module features -- TODO confirm.
    ax.set_xlim(339,1)
    ax.set_ylim(0,500)
    ax.set_xticks([])
    # Bottom panel, placed directly below the top panel of the same figure
    ax2 = fig.add_axes([0.1,-0.3,0.8,0.3])

    # fig2 / ax3 are only used to obtain the binned counts per sign group from hist();
    # fig2 is never saved.
    fig2 = plt.figure(figsize=(2,1.5))
    ax3 = fig2.add_axes([0.1,0.1, 0.8,0.3])

    # Order of the three groups: zero, negative, positive (matches `colors` below)
    data_binned, edge_bins, patches = ax3.hist(
        [
            df_ko_md_feature_scorerank_same_md[df_ko_md_feature_scorerank_same_md["score_sign"]==0]["score_rank_descending"], 
            df_ko_md_feature_scorerank_same_md[df_ko_md_feature_scorerank_same_md["score_sign"]<0]["score_rank_descending"], 
            df_ko_md_feature_scorerank_same_md[df_ko_md_feature_scorerank_same_md["score_sign"]>0]["score_rank_descending"], 
        ],
        color = ["#CCCCCC", "#99FCFE", "#EE7BA6"], 
        bins = 20,
        #histtype="barstacked"
    )


    # Bin centres; range(20) has to match bins = 20 above
    real_bins = [(edge_bins[i]+edge_bins[i+1])/2 for i in range(20)]

    # Normalise every bin so that the three groups sum to 1
    # (a bin without any count gives 0/0 = NaN)
    data_binned = np.array(data_binned)
    data_binned /= data_binned.sum(0)

    colors = ["#CCCCCC", "#99FCFE", "#EE7BA6"]
    # Stacked bars: each group is drawn on top of the sum of the previous groups
    for i in range(len(data_binned)):
        ax2.bar(real_bins, data_binned[i], bottom=data_binned[:i].sum(0), color = colors[i], width = 10)

    ax2.set_xlim(339,1)
    ax2.set_xticks([339, 171, 1])
    ax2.set_xlabel("Feature importance rank")
    ax2.set_yticks([0.00, 1.00])
    ax2.set_yticklabels(["0.00", "1.00"])
    ax2.set_ylabel("Proportion")

    ax.set_title(selection + " / " + target)
    fig.savefig("figures/NK_M0151_feature_importance_"+featureset+"_"+target+"_"+selection+".pdf", bbox_inches='tight')
    
    # NOTE(review): plt.close() closes the current pyplot figure only, which should be the
    # scratch figure fig2 (the last one created); `fig` is not closed here -- confirm intent.
    plt.close()

In [None]:
def make_df_feature_target_method_ko_selectionscore_ext(target, rank_threshold, featureset, selection, score_path=None):
    """Load the feature-selection scores of one setting and rank the features per KO.

    Parameters
    ----------
    target, featureset, selection
        Values to keep in the 'target', 'featureset' and 'selection' columns of the
        score table.
    rank_threshold
        Threshold N used for the top-N columns; it is also part of their names.
    score_path
        Optional path of the tab-separated score table (six columns without header:
        featureset, target, selection, KO, feature, score). Defaults to the
        project's result file.

    Returns
    -------
    pandas.DataFrame
        The selected rows merged with the module-level ``df_category_ko`` table on
        'KO' (inner merge), with NaN replaced by 0 and these columns added:

        * ``abs_score``: absolute value of ``score``
        * ``score_rank_descending``: rank of ``abs_score`` within each
          (target, selection, KO) group, 1 = largest
        * ``selected_top<N>``: True where the rank is strictly below N
        * ``score_rank_top<N>``: ``max(N - rank, 0)``
        * ``signed_score_rank_top<N>``: ``score_rank_top<N>`` carrying the sign of
          ``score`` (0 where the score is 0)

    Notes
    -----
    Reads the global ``df_category_ko``, which must be defined before the call.
    """
    if score_path is None:
        score_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/feature_target_method_ko_selectionscore.txt"

    df_feature_target_method_ko_selectionscore = pd.read_table(score_path, names = ['featureset', 'target', 'selection', 'KO', 'feature', 'score'])
    df_feature_target_method_ko_selectionscore['abs_score'] = abs(df_feature_target_method_ko_selectionscore['score'])

    # Keep only the rows of the requested feature set / target / selection method
    df_feature_target_method_ko_selectionscore_ext = \
        df_feature_target_method_ko_selectionscore[
            (df_feature_target_method_ko_selectionscore['featureset'] == featureset) &
            (df_feature_target_method_ko_selectionscore['target']     == target)     &
            (df_feature_target_method_ko_selectionscore['selection']  == selection)
            ]

    # reset_index() keeps the original row labels in an 'index' column
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.reset_index()
    df_feature_target_method_ko_selectionscore_ext['score_rank_descending'] = df_feature_target_method_ko_selectionscore_ext.groupby(["target", "selection", "KO"])['abs_score'].rank(ascending=False)

    # select top-N features
    selected_col = 'selected_top' + str(rank_threshold)
    score_rank_top_col = 'score_rank_top' + str(rank_threshold)
    signed_score_rank_top_col = 'signed_score_rank_top' + str(rank_threshold)

    rank = df_feature_target_method_ko_selectionscore_ext['score_rank_descending']
    df_feature_target_method_ko_selectionscore_ext[selected_col] = rank < rank_threshold
    # Same values as max(rank_threshold - rank, 0) evaluated row by row
    df_feature_target_method_ko_selectionscore_ext[score_rank_top_col] = (rank_threshold - rank).clip(lower=0)
    # score / abs_score is the sign of the score; a zero score gives 0/0 = NaN, which the
    # fillna(0) below turns into 0.
    df_feature_target_method_ko_selectionscore_ext[signed_score_rank_top_col] = \
        df_feature_target_method_ko_selectionscore_ext[score_rank_top_col] \
        * df_feature_target_method_ko_selectionscore_ext['score'] \
        / df_feature_target_method_ko_selectionscore_ext['abs_score']

    # Attach the columns of df_category_ko to every KO
    df_feature_target_method_ko_selectionscore_ext = pd.merge(df_feature_target_method_ko_selectionscore_ext, df_category_ko, on = 'KO')
    df_feature_target_method_ko_selectionscore_ext = df_feature_target_method_ko_selectionscore_ext.fillna(0)

    return df_feature_target_method_ko_selectionscore_ext

def make_heatmap_ko_md_rank(df_feature_target_method_ko_selectionscore_ext, category, category_i, rank_threshold, featureset, selection, target=None):
    """Draw and save the KO x module feature-importance heatmap of one category.

    Parameters
    ----------
    df_feature_target_method_ko_selectionscore_ext
        Frame with at least the columns 'category', 'KO', 'feature',
        'score_rank_descending' and 'signed_score_rank_top<rank_threshold>'.
    category
        Value of the 'category' column whose rows are plotted; also used as the
        figure title and in the output file name.
    category_i
        Index passed to the global colour function ``cm`` to colour the row side bar.
    rank_threshold
        N of the 'signed_score_rank_top<N>' column that is plotted.
    featureset, selection
        Only used to build the output file name.
    target
        Used to build the output file name. If omitted, the module-level variable
        ``target`` is used, which is what the function relied on implicitly before
        this parameter existed.

    Returns
    -------
    The KO x module matrix of 'score_rank_descending' as returned by
    ``sparsemtx2mtx`` (not the plotted signed top-rank matrix).

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``sparsemtx2mtx``,
    ``df_maxcategory_module``, ``df_md_ko``, ``df_category_color`` and ``cm``.
    The figure is written to figures/ and closed afterwards.
    """
    if target is None:
        # Backward compatibility: fall back to the notebook-level `target`
        target = globals()["target"]

    df_feature_target_method_ko_selectionscore_ext_of_a_category = df_feature_target_method_ko_selectionscore_ext[
        df_feature_target_method_ko_selectionscore_ext['category'] == category
    ]
    ko_column = df_feature_target_method_ko_selectionscore_ext_of_a_category['KO']
    feature_column = df_feature_target_method_ko_selectionscore_ext_of_a_category['feature']

    # NOTE(review): sparsemtx2mtx is defined elsewhere; from its use below it returns a
    # DataFrame indexed by KO with one column per feature -- confirm.
    df_ko_md_toprank = sparsemtx2mtx(ko_column, feature_column, df_feature_target_method_ko_selectionscore_ext_of_a_category['signed_score_rank_top'+str(rank_threshold)])
    df_ko_md_rank = sparsemtx2mtx(ko_column, feature_column, df_feature_target_method_ko_selectionscore_ext_of_a_category['score_rank_descending'])

    # To sort columns
    df_module_maxcategory_ext = pd.merge(pd.DataFrame(df_ko_md_toprank.columns, columns= ['Module']), df_maxcategory_module).sort_values('category')
    # To sort rows
    df_ko_md_ext = pd.merge(pd.DataFrame(df_ko_md_toprank.index, columns= ['KO']), df_md_ko).sort_values('Module')

    # Alternate between 0.4 and 0.2 every time the module changes, so that consecutive
    # modules get alternating shades in the row side bar.
    module_counter = 0
    id_list = []
    prev_md = ""
    for md in df_ko_md_ext.Module:
        if prev_md != md:
            module_counter += 1
        id_list.append((module_counter % 2 + 1) * 0.2)
        prev_md = md
    df_ko_md_ext['Module_id'] = id_list

    # Plot: KO-Module-Rank of feature importance
    fig = plt.figure(figsize=(5,3))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])          # main heatmap
    ax2 = fig.add_axes([0.95,0.1,0.05, 0.4])      # colour bar
    ax3 = fig.add_axes([0.08, 0.1, 0.015, 0.8])   # row side bar: module stripes
    ax4 = fig.add_axes([0.1, 0.05, 0.8, 0.045])   # column bar: category of each module
    ax5 = fig.add_axes([0.06, 0.1, 0.015, 0.8])   # row side bar: colour of this category

    sns.heatmap(
        df_ko_md_toprank.reindex(
            columns = df_module_maxcategory_ext.Module,
            index   = df_ko_md_ext.KO,
        ),
        cmap='coolwarm',
        center=0,
        ax = ax,
        cbar_ax = ax2
    )
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title(category)

    # Columns
    # NOTE(review): 11 is presumably the number of categories in df_category_color -- confirm.
    cmap = ListedColormap([cm(i) for i in range(11)], name="custom") # user-defined colormap
    sns.heatmap([pd.merge(df_module_maxcategory_ext, df_category_color).category_id], ax = ax4, cbar = False, cmap = cmap)
    ax4.set_xlabel("Predictor: 339 modules in KEGG Module")

    # Rows
    sns.heatmap([[i] for i in df_ko_md_ext.Module_id], ax = ax3, cbar = False, cmap = 'binary', vmin=0, vmax=1)
    sns.heatmap([[0]], ax = ax5, cbar = False, cmap = ListedColormap([cm(category_i)], name="custom"), vmin=0, vmax=1)
    ax5.set_ylabel("Predicted OGs")

    # Hide ticks and tick labels on every panel except the colour bar
    for panel in (ax, ax3, ax4, ax5):
        panel.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

    fig.savefig("figures/NK_M0151_module_ko_importance_"+featureset+"_"+target+"_"+selection+"_top"+str(rank_threshold)+"_"+category+".pdf", bbox_inches='tight')
    plt.close(fig)

    return df_ko_md_rank

In [None]:
# For each category, create an overview heatmap (once per target / selection setting)
# Each setting is (target, rank_threshold, feature_set, selection).
heatmap_settings = [
    ('gain', 14, 'md', 'ANOVA'),
    ('gain', 50, 'md', 'RandomForest'),
    ('loss', 8, 'md', 'ANOVA'),
    ('loss', 14, 'md', 'RandomForest'),
]

# The loop variable `target` has to stay a notebook-level name: make_heatmap_ko_md_rank
# is not given the target as an argument here and picks it up from the global namespace
# when it builds the output file name.
for target, rank_threshold, feature_set, selection in heatmap_settings:
    df_feature_target_method_ko_selectionscore_ext = make_df_feature_target_method_ko_selectionscore_ext(
        target, rank_threshold, feature_set, selection
    )
    for category_i, category in enumerate(df_category_color.category):
        make_heatmap_ko_md_rank(
            df_feature_target_method_ko_selectionscore_ext,
            category, category_i, rank_threshold, feature_set, selection
        )

In [None]:
# Single run for the Xenobiotics category (gain / ANOVA); keeps the KO x module rank
# matrix returned by make_heatmap_ko_md_rank in df_ko_md_rank.
target = 'gain'
rank_threshold = 14
feature_set = 'md'
selection = 'ANOVA'

category = '09111 Xenobiotics biodegradation and metabolism'
# NOTE(review): 10 is presumably the position of this category in df_category_color -- confirm.
category_i = 10

df_feature_target_method_ko_selectionscore_ext = make_df_feature_target_method_ko_selectionscore_ext(
    target, rank_threshold, feature_set, selection
)
df_ko_md_rank = make_heatmap_ko_md_rank(
    df_feature_target_method_ko_selectionscore_ext,
    category, category_i, rank_threshold, feature_set, selection
)

In [None]:
# Enumerate pairs of KEGG Modules which share one or more reactions or contain adjacent reactions
network_M = nx.read_gml("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/SyntrophyExploration/NK_S0002/Single_Filter_N_CompressedNetwork.gml")

# Relabel every node with the part of its label after the first ":"
mapping = {}
for node_label in network_M.nodes:
    mapping[node_label] = node_label.split(":")[1]
network_M = nx.relabel_nodes(network_M, mapping)

# Restrict the network to the modules assigned to the Xenobiotics category and export it
is_xenobiotics_module = df_maxcategory_module['category'] == '09111 Xenobiotics biodegradation and metabolism'
network_M_Xenobiotics = network_M.subgraph(df_maxcategory_module[is_xenobiotics_module].Module)
nx.write_gml(network_M_Xenobiotics, "networks/network_module.xenobiotics.gml")