In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import sys
import os
import pandas as pd
import numpy as np
from Bio import Phylo
import seaborn as sns
from scipy.stats import t, ttest_1samp, wilcoxon, mannwhitneyu, ttest_rel, zscore, spearmanr
import json
from statsmodels.stats.multitest import multipletests
from scipy.stats import gaussian_kde
from sklearn import linear_model
import re
from matplotlib.colors import ListedColormap
import networkx as nx
import gzip

In [None]:
# Global matplotlib appearance shared by every figure in this notebook.
# All settings are applied in one validated update instead of one
# assignment per key.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial","DejaVu Sans","Lucida Grande","Verdana"],
    'figure.figsize':    [4,3],
    'font.size':         10,
    "axes.labelcolor":   "#000000",
    "axes.linewidth":    1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Two stock colormaps kept under short names for the cells below.
cmap1, cmap2 = plt.cm.tab20, plt.cm.Set3
#plt.style.use('default')

In [None]:
# Work inside the experiment directory so that the relative paths used
# below ("tables/...", "figures/...") resolve.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151")

# Create the output folders when they are missing.  `exist_ok=True` ignores
# only the "already exists" case; the previous bare `except:` also hid real
# failures (e.g. permission errors), and its loop variable `dir` shadowed
# the builtin `dir()` for the rest of the session.
for out_dir in ["figures", "tables", "networks"]:
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Classes of KOs

# Two-column, header-less, tab-separated membership tables.
df_path_ko = pd.read_table("tables/path_ko.txt", names = ['Pathway', 'KO'])
df_rn_ko = pd.read_table("tables/rn_ko.txt", names = ['Reaction','KO'])
df_md_ko = pd.read_table("tables/md_ko.txt", names = ['Module','KO'])
df_path_md = pd.read_table("tables/path_md.txt", names = ['Pathway','Module'])

# Nested {'name': ..., 'children': [...]} hierarchy; `with` closes the handle.
with open("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/json/ko00001.json") as ontology_file:
    ontology = json.load(ontology_file)

# Convert the nested JSON into a Bio.Phylo tree with an explicit stack
# (depth-first, no recursion): every JSON node becomes a Clade carrying
# the node's 'name'.
root_clade    = Phylo.BaseTree.Clade(name=ontology['name'])
stack = [(ontology, root_clade)]

while stack:
    term, clade = stack.pop()
    for child in term.get('children', []):
        child_clade = Phylo.BaseTree.Clade(name = child['name'])
        clade.clades.append(child_clade)
        stack.append((child, child_clade))

ontology_tree = Phylo.BaseTree.Tree(root_clade)

# (category, KO) pairs: only the first top-level branch of the hierarchy is
# used; each of its direct children is a "category", and every terminal node
# below it whose first whitespace-separated token starts with 'K' is taken
# as a KO identifier.
list_category_ko = []
for clade in ontology_tree.clade.clades[0].clades:
    for tip in clade.get_terminals():
        KO = tip.name.split()[0]
        if KO.startswith('K'):
            list_category_ko.append([clade.name, KO])
# A KO can appear several times under one category; keep one row per pair.
df_category_ko = pd.DataFrame(list_category_ko, columns = ['category', 'KO']).drop_duplicates()

# KOs that belong to exactly one category.
# The count column is renamed explicitly: pandas >= 2.0 calls it 'count',
# older versions named it after the source column ('KO').
df_ko_count = df_category_ko.KO.value_counts().rename('KO').to_frame()
set_ko_with_unique_category = set(df_ko_count[df_ko_count['KO']==1].index)
df_category_ko['unique'] = df_category_ko['KO'].isin(set_ko_with_unique_category)
df_uniquecategory_ko = df_category_ko[df_category_ko['unique']]

# color of function categories

colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3', '#555555', '#FC8D62', '#8DA0CB', '#E78AC3', '#66C2A5', '#FC8D62', '#000000']

cm_name = 'Set3' # qualitative matplotlib palette, indexed by integer below
cm = plt.get_cmap(cm_name)

# For every module, pick the category that contributes the most KOs.
df_category_ko_module = pd.merge(df_category_ko, df_md_ko, on = 'KO')
df_category_ko_module['Nko'] = 1
# numeric_only=True: sum the counters only.  With the pandas >= 2.0 default
# the 'KO' strings would be concatenated per group as well.
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum(numeric_only = True)
df_maxcategory_module = df_category_module_count.loc[df_category_module_count.groupby('Module')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_module = df_maxcategory_module.reset_index().loc[:, ['category', 'Module']]

# One integer id and one palette colour per category that is dominant in at
# least one module.
# NOTE(review): integer indices beyond the palette length are presumably
# clipped to the last colour -- confirm if the number of categories grows.
df_category_color = pd.DataFrame([[category, i] for i, category in enumerate(df_maxcategory_module.category.unique())], columns = ["category", 'category_id'])
df_category_color['color'] = [mcolor.rgb2hex(cm(i)) for i in df_category_color['category_id']]
#df_category_color

# Same "dominant category" assignment, this time per pathway.
df_category_ko_pathway = pd.merge(df_category_ko, df_path_ko, on = 'KO')
df_category_ko_pathway['Nko'] = 1
df_category_pathway_count = df_category_ko_pathway.groupby(['category', 'Pathway'], as_index = False).sum(numeric_only = True)
df_maxcategory_pathway = df_category_pathway_count.loc[df_category_pathway_count.groupby('Pathway')['Nko'].idxmax(),:].sort_values('category')
df_maxcategory_pathway = df_maxcategory_pathway.reset_index().loc[:, ['category', 'Pathway']]
df_maxcategory_pathway

#### Visualize ROC curve of cross validation

##### Open file

In [None]:
# Which reconstruction the ROC table belongs to; both values are part of
# the input file name.
tree="mlgtdb"
anc="MPPA"

# Header-less, gzip-compressed, tab-separated table with one ROC point per
# row.  The `with` block closes the gzip handle once pandas has read it
# (previously the handle was left open).
roc_path = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/result/ko_curve."+tree+"."+anc+".txt.gz"
with gzip.open(roc_path) as roc_file:
    df_roc = pd.read_table(roc_file, names=["KO", "type", "prediction", "CVnumber", "ROCx", "ROCy"])
df_roc

##### x = 0, 0.01, 0.02, ..., 1.00でのROC curveの値を出力 (x座標を揃えた) : 時間かかるので実行は１度だけ

In [None]:
def intermediate(x, ROCx, ROCy):
    """Evaluate a ROC step curve at one or many x positions.

    Parameters
    ----------
    x : float or array-like of float
        Position(s) at which the curve is read.
    ROCx : sequence of float
        Corner x-coordinates, sorted in ascending order.  The caller in this
        cell always prepends a 0.
    ROCy : sequence of float
        Corner y-coordinates, same length as ``ROCx``.

    Returns
    -------
    numpy.ndarray
        Same shape as ``x`` (0-d for a scalar ``x``).  For each position:

        * 0 if it lies before the first corner,
        * 1 if it equals 1 (and is not before the first corner),
        * NaN if no corner lies strictly to its right (the former scalar
          version returned ``None`` there) or if there are no corners,
        * otherwise the y of the last corner whose x is <= the position.
    """
    x = np.asarray(x, dtype=float)
    xs = np.asarray(ROCx, dtype=float)
    ys = np.asarray(ROCy, dtype=float)
    if xs.size == 0:
        return np.full(x.shape, np.nan)

    # Number of corners with ROCx <= x; the first corner to the right of x
    # therefore sits at this index, and the value to report at index - 1.
    n_le = np.searchsorted(xs, x, side="right")
    y = ys[np.maximum(n_le, 1) - 1]
    y = np.where(n_le == xs.size, np.nan, y)
    y = np.where(n_le == 0, 0.0, y)
    y = np.where((x == 1) & (n_le > 0), 1.0, y)
    return y


x_y_list = []

# Exact grid 0.00, 0.01, ..., 1.00.  np.arange(0, 1.01, 0.01) yields values
# such as 0.5700000000000001, which end up in the output table and make the
# step comparison inconsistent when a corner sits exactly on a grid point.
grid = np.arange(101) / 100
grid_values = grid.tolist()

for Type in ["gain", "loss"]:
    df_roc_type = df_roc[df_roc["type"] == Type]

    # Corner points only: highest ROCy for every ROCx of every curve.
    # Done once per type instead of re-filtering the whole table for each
    # KO / prediction / fold combination, which dominated the runtime.
    df_corners = df_roc_type.groupby(
        ["KO", "prediction", "CVnumber", "ROCx"], as_index = False
    )["ROCy"].max()
    curves = {
        key: (df_curve["ROCx"].to_numpy(), df_curve["ROCy"].to_numpy())
        for key, df_curve in df_corners.groupby(["KO", "prediction", "CVnumber"])
    }

    for i, KO in enumerate(df_roc_type["KO"].unique()):
        for prediction in ["LR", "RF"]:
            for CVnumber in [1,2,3,4,5]:
                # A combination that is absent from the table still gets its
                # 101 rows (NaN for x < 1, 1 at x == 1), as before.
                corner_x, corner_y = curves.get((KO, prediction, CVnumber), ((), ()))

                ROCx = [0] + list(corner_x)
                ROCy = [0] + list(corner_y)

                resampled = intermediate(grid, ROCx, ROCy).tolist()
                x_y_list.extend(
                    [KO, Type, prediction, CVnumber, x, y]
                    for x, y in zip(grid_values, resampled)
                )
        if (i%100==0): print(i)
    


In [None]:
# Turn the resampled points into a table, order it by curve and x position,
# and store it so the slow resampling step does not have to be repeated.
roc_columns = ["KO", "type", "prediction", "CVnumber", "ROCx", "ROCy"]
df_x_y_list = (
    pd.DataFrame(x_y_list, columns = roc_columns)
    .sort_values(roc_columns[:-1])
)
df_x_y_list.to_csv(
    "tables/ko_curve.definedX.txt",
    sep = "\t",
    header = True,
    index = False,
)
df_x_y_list

##### いよいよ描画

In [None]:
# Reload the stored resampled curves (tab-separated, with a header row).
df_x_y_list = pd.read_csv("tables/ko_curve.definedX.txt", sep = "\t")
df_x_y_list

In [None]:
# Average the remaining columns over the rows that share KO, type,
# prediction and x position, i.e. across the cross-validation folds.
mean_keys = ["KO", "type", "prediction", "ROCx"]
df_x_y_list_mean = (
    df_x_y_list
    .groupby(mean_keys, as_index=False)
    .mean()
)
df_x_y_list_mean

In [None]:
# black background
# One small figure per (type, prediction) pair: the fold-averaged ROC curve
# of every KO drawn as a faint yellow line on a black panel.
#
# The files get a "_blackbg" suffix: the white-background cell writes to
# "figures/NK_M0151_ROC_<type>_<prediction>.pdf/.png", and using the same
# names here meant these figures were overwritten whenever both cells ran.
for Type in ["gain", "loss"]:
    for prediction in ["RF", "LR"]:
        df_x_y_list_mean_ext = df_x_y_list_mean[
            (df_x_y_list_mean["type"]==Type) &
            (df_x_y_list_mean["prediction"]==prediction)
        ]

        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.set_facecolor("#000000")
        #ax.set_facecolor("#42125B")

        # Small margin so lines running along 0 or 1 are not clipped.
        ax.set_xlim(-0.01,1.01)
        ax.set_ylim(-0.01,1.01)

        ax.set_xticks([0.0,0.2,0.4,0.6,0.8,1.0])

        ax.set_xlabel("False positive rate")
        ax.set_ylabel("True positive rate")

        # units="KO" with estimator=None draws one line per KO instead of an
        # aggregate; the very low alpha turns the overlap into a density.
        sns.lineplot(
            data=df_x_y_list_mean_ext,
            x="ROCx", y="ROCy", units="KO",
            estimator = None, lw=0.6, alpha = 0.01,
            color = "#FFFF00",
            ax = ax,
        )

        ax.set_title(Type + " " + prediction)

        out_base = "figures/NK_M0151_ROC_"+Type+"_"+prediction+"_blackbg"
        fig.savefig(out_base+".pdf", bbox_inches = 'tight')
        fig.savefig(out_base+".png", dpi=300, bbox_inches = 'tight')

        # Release the figure; many are created in this loop.
        plt.close(fig)

In [None]:
# white background
# One small figure per (type, prediction) pair: the fold-averaged ROC curve
# of every KO drawn as a faint line, coloured by event type.
for Type, color in [("gain",'#9FA6F1'), ("loss", '#E1BB63')]:
    for prediction in ["RF", "LR"]:
        is_selected = (
            (df_x_y_list_mean["type"]==Type) &
            (df_x_y_list_mean["prediction"]==prediction)
        )
        df_x_y_list_mean_ext = df_x_y_list_mean[is_selected]

        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])

        # Small margin so lines running along 0 or 1 are not clipped.
        ax.set_xlim(-0.01,1.01)
        ax.set_ylim(-0.01,1.01)
        ax.set_xticks([0.0,0.2,0.4,0.6,0.8,1.0])
        ax.set_xlabel("False positive rate")
        ax.set_ylabel("True positive rate")

        # One line per KO (units="KO", no aggregation); the very low alpha
        # turns the overlap into a density.
        sns.lineplot(
            data=df_x_y_list_mean_ext,
            x="ROCx",
            y="ROCy",
            units="KO",
            estimator=None,
            lw=0.6,
            alpha=0.01,
            color=color,
            ax=ax,
        )

        ax.set_title(Type + " " + prediction)

        out_base = "figures/NK_M0151_ROC_"+Type+"_"+prediction
        fig.savefig(out_base+".pdf", bbox_inches = 'tight')
        fig.savefig(out_base+".png", dpi=300, bbox_inches = 'tight')

        plt.close(fig)