In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import os
import json
from Bio import Phylo
import numpy as np
import gzip
from sklearn.metrics import roc_auc_score
import seaborn as sns
import plotly.express as px
from ete3 import Tree
from scipy import stats
from sklearn.decomposition import PCA
import umap

In [None]:
# Global matplotlib styling applied to every figure drawn below.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    'figure.figsize':    [4, 3],
    'font.size':         10,
    "axes.labelcolor":   "#000000",
    "axes.linewidth":    1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Two qualitative colormaps kept at hand for categorical colouring.
cmap1 = plt.cm.tab20
cmap2 = plt.cm.Set3
#plt.style.use('default')

In [None]:
# Work from the experiment directory so that the relative paths used below
# ("result/...", "figures/...", "tables/...", "itol/...", "tree/...") resolve.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0153")

# Create the output directories. exist_ok=True keeps the cell re-runnable while
# still surfacing real failures (e.g. permission errors), which the previous
# bare `except:` silently swallowed. The loop variable is no longer called
# `dir`, so the builtin of that name is not shadowed for the rest of the notebook.
for out_dir in ["figures", "tables", "networks", "itol", "tree", "itol/ko_ex_prevanc", "itol/evo_pred"]:
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Classes of KOs

# Directory holding the KEGG-derived mapping tables and the ontology JSON.
table_dir = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151"

# Two-column, header-less, tab-separated mapping tables.
df_path_ko = pd.read_table(table_dir + "/tables/path_ko.txt", names = ['Pathway', 'KO'])
df_rn_ko = pd.read_table(table_dir + "/tables/rn_ko.txt", names = ['Reaction','KO'])
df_md_ko = pd.read_table(table_dir + "/tables/md_ko.txt", names = ['Module','KO'])
df_path_md = pd.read_table(table_dir + "/tables/path_md.txt", names = ['Pathway','Module'])

# Nested ontology (dicts with 'name' and optional 'children').
# Read through a context manager so the file handle is closed deterministically.
with open(table_dir + "/json/ko00001.json") as ontology_handle:
    ontology = json.load(ontology_handle)

# Convert the nested ontology dict into a Bio.Phylo tree so that the
# tree-traversal helpers (e.g. get_terminals) can be used on it.
# The previous version also built a throw-away Tree before the traversal that
# was overwritten afterwards; the tree is now built exactly once, at the end.
root_clade = Phylo.BaseTree.Clade(name=ontology['name'])

# Iterative depth-first traversal: each stack entry pairs an ontology term
# with the clade that was created for it.
stack = [(ontology, root_clade)]

while len(stack) > 0:
    term, clade = stack.pop()
    if ('children' in term.keys()):
        for child in term['children']:
            child_clade = Phylo.BaseTree.Clade(name = child['name'])
            clade.clades.append(child_clade)
            stack.append((child, child_clade))

ontology_tree = Phylo.BaseTree.Tree(root_clade)

# Collect (category, KO) pairs: for every child of the FIRST top-level ontology
# node, list the KOs found at its tips.
# NOTE(review): clades[0] is presumably the "Metabolism" branch -- confirm.
# The original ran this loop twice (copy-paste) and relied on de-duplication;
# a single pass yields the same table.
list_category_ko = []
for clade in ontology_tree.clade.clades[0].clades:
    for tip in clade.get_terminals():
        KO = tip.name.split()[0]
        # keep only tips whose first token looks like a KO identifier ("K...")
        if KO.startswith('K'):
            list_category_ko.append([clade.name, KO])

# A KO can be listed several times under the same category; keep one row each.
df_category_ko = pd.DataFrame(list_category_ko, columns = ['category', 'KO']).drop_duplicates()

# Number of distinct categories per KO. The Series is used directly because the
# column name produced by wrapping value_counts() in a DataFrame differs between
# pandas versions ('KO' before 2.0, 'count' from 2.0 on).
ko_counts = df_category_ko['KO'].value_counts()
df_ko_count = ko_counts.to_frame()
set_ko_with_unique_category = set(ko_counts[ko_counts == 1].index)

# Flag and extract the KOs that belong to exactly one category.
df_category_ko['unique'] = df_category_ko['KO'].isin(set_ko_with_unique_category)
df_uniquecategory_ko = df_category_ko[df_category_ko['unique']]

# color of function categories

colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3', '#555555', '#FC8D62', '#8DA0CB', '#E78AC3', '#66C2A5', '#FC8D62', '#000000']

cm_name = 'Set3' # B->G->R
cm = plt.get_cmap(cm_name)

# --- Modules: category contributing the most KOs to each module ---
df_category_ko_module = df_category_ko.merge(df_md_ko, on = 'KO').assign(Nko = 1)
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum()
idx_top_category_per_module = df_category_module_count.groupby('Module')['Nko'].idxmax()
df_maxcategory_module = (
    df_category_module_count
    .loc[idx_top_category_per_module, :]
    .sort_values('category')
    .reset_index()
    .loc[:, ['category', 'Module']]
)

# One integer id and one colormap colour per category that dominates a module.
df_category_color = pd.DataFrame(
    list(enumerate(df_maxcategory_module.category.unique())),
    columns = ['category_id', 'category'],
)[['category', 'category_id']]
df_category_color['color'] = [mcolor.rgb2hex(cm(category_id)) for category_id in df_category_color['category_id']]
#df_category_color

# --- Pathways: same computation, keyed by pathway instead of module ---
df_category_ko_pathway = df_category_ko.merge(df_path_ko, on = 'KO').assign(Nko = 1)
df_category_pathway_count = df_category_ko_pathway.groupby(['category', 'Pathway'], as_index = False).sum()
idx_top_category_per_pathway = df_category_pathway_count.groupby('Pathway')['Nko'].idxmax()
df_maxcategory_pathway = (
    df_category_pathway_count
    .loc[idx_top_category_per_pathway, :]
    .sort_values('category')
    .reset_index()
    .loc[:, ['category', 'Pathway']]
)

# KO descriptions (two-column, header-less table).
df_ko_desc = pd.read_table(table_dir + "/tables/ko_desc.txt", names = ['KO', 'Description'])
df_uniquecategory_ko

#### Open results of ongoing evolution prediction + reference phylogeny

In [None]:
# Predicted probabilities, one row per
# (KO, target, pred_method, selec_method, opt_mode, Nfeatures, species).
# The path is handed to pandas directly: it opens AND closes the gzip stream
# itself, whereas the previous gzip.open(...) handle was never closed.
df_future = pd.read_table(
    "result/sp_prob.test.txt.gz",
    compression = "gzip",
    names = ['KO', 'target', 'pred_method', 'selec_method', 'opt_mode', 'Nfeatures', 'species', 'prob'],
)
df_future

In [None]:
# all pairs of OG and extant species
# Taken from one fixed model configuration (LR / ANOVA / per-OG optimum) and
# reduced to the distinct (KO, species) combinations.
is_reference_config = (
    (df_future['pred_method'] == 'LR')
    & (df_future['selec_method'] == 'ANOVA')
    & (df_future['opt_mode'] == 'N_opt_for_AUC_of_the_OG')
)
df_ko_sp_enumerated = df_future.loc[is_reference_config, ['KO', 'species']].drop_duplicates()
df_ko_sp_enumerated

In [None]:
# Reference phylogeny (newick) read with Bio.Phylo.
tree = Phylo.read("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/tree/bac120_msa_r89.faa.mlgtdb.representative.renamed.rooted.nwk", "newick")

# Map every node name to the name of its direct parent node.
sp2ancsp = {
    child.name: internal.name
    for internal in tree.get_nonterminals()
    for child in internal.clades
}

#### possession ratio for each pair of KO and species

In [None]:
# Possession ratio per (species taxid, KO), plus the per-species genome table
# that carries the KEGG organism identifier.
df_gn_ko_posratio = pd.read_table("../NK_M0145/list/sp_gn_posratio.txt", names = ["sptaxid", "KO", "posratio"])
df_Ngenomes = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0145/list/analyzed_Ngenomes.txt",names=["#genomes","sptaxid","kegg","repgenome","name", '#genomes_up_to_100'])

# Attach the KEGG identifier; "sptaxid" is the only column the two frames share.
df_gn_ko_posratio = df_gn_ko_posratio.merge(df_Ngenomes[["sptaxid", "kegg"]], on = "sptaxid")

# The species code is the part of the KEGG identifier after the first ':'.
df_gn_ko_posratio['species'] = df_gn_ko_posratio['kegg'].map(lambda kegg_id: kegg_id.split(':')[1])
df_ko_sp_posratio = df_gn_ko_posratio[['KO', 'species', 'posratio']]
df_ko_sp_posratio

In [None]:
# Number of distinct species that have possession-ratio data.
len({*df_ko_sp_posratio['species']})

In [None]:
# Re-read the reference phylogeny with ete3 (format=1 keeps internal node
# names) and prune it to the species that have possession-ratio data.
# NOTE: this rebinds `tree`, which until here held the Bio.Phylo tree.
tree =  Tree("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/tree/bac120_msa_r89.faa.mlgtdb.representative.renamed.rooted.nwk", format = 1)
tree.prune(set(df_ko_sp_posratio['species']))

# Write through a context manager so the file is flushed and closed
# (the previous open(...).write(...) left the handle dangling).
# NOTE(review): the "114" in the file name is hard-coded; presumably it is the
# number of species kept -- confirm it still matches the data.
with open("tree/extracted_tree_114_species.nwk", 'w') as tree_handle:
    tree_handle.write(tree.write())

#### Presence/absence of each OG in each species (ancestral and extant)

In [None]:
# read as a dataframe
df_ko_sp_presence = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/result/ko_gn_weight.mlgtdb_MPPA.txt", names = ['KO', 'species', 'presence'])

# read as a dictionary
# Nested lookup: ko_sp2presence[KO][species] -> presence value from the table.
ko_sp2presence = {}
for ko, sp, presence in zip(df_ko_sp_presence['KO'], df_ko_sp_presence['species'], df_ko_sp_presence['presence']):
    ko_sp2presence.setdefault(ko, {})[sp] = presence

# Restrict to the enumerated (KO, species) pairs; pairs missing from the
# presence table get presence 0.
df_ko_sp_presence = df_ko_sp_enumerated.merge(df_ko_sp_presence, on = ['KO', 'species'], how = 'left')
df_ko_sp_presence['presence'] = df_ko_sp_presence['presence'].fillna(0)

df_ko_sp_presence


In [None]:
# Parent node of each species in the reference tree ('' for the node named 'Bacteria').
df_ko_sp_presence['anc_species'] = [
    '' if sp == 'Bacteria' else sp2ancsp[sp]
    for sp in df_ko_sp_presence['species']
]

# Presence value of the KO in that parent node; 0 when either the KO or the
# parent node is missing from the presence dictionary.
df_ko_sp_presence['anc_presence'] = [
    ko_sp2presence.get(ko, {}).get(anc, 0)
    for ko, anc in zip(df_ko_sp_presence['KO'], df_ko_sp_presence['anc_species'])
]
df_ko_sp_presence

In [None]:
# Species for which possession-ratio data exist.
analyzed_sp_set = set(df_ko_sp_posratio['species'])

# Flag those species and keep only their rows.
df_ko_sp_presence['pangenome_analyzed'] = df_ko_sp_presence['species'].isin(analyzed_sp_set)
df_ko_sp_presence_ext = df_ko_sp_presence.loc[df_ko_sp_presence['pangenome_analyzed']]
df_ko_sp_presence_ext

In [None]:
# iTOL colour-strip labels for presence/absence: one annotation file per KO.
# Each species is coloured by its own presence and the presence in its parent
# node: present / newly present / newly absent / absent.

color_absent = "#E5E5E5"
color_absent_newly = "#CCCCCC"

color_present = "#3E8D27"
color_present_newly = "#97F788"

# groupby visits each KO's rows once instead of re-filtering the whole frame
# for every KO (the files are independent, so the visiting order is irrelevant).
for KO, df_ko_sp_presence_ko in df_ko_sp_presence.groupby('KO', sort = False):

    with open("itol/ko_ex_prevanc/annotation.ex_prevanc."+KO+".txt", 'w') as handle:
        handle.write(
            "DATASET_COLORSTRIP\nSEPARATOR SPACE\nDATASET_LABEL "+KO+"\nCOLOR #ff0000\nDATA\n"
        )

        for species, presence, anc_presence in zip(df_ko_sp_presence_ko['species'], df_ko_sp_presence_ko['presence'], df_ko_sp_presence_ko['anc_presence']):
            is_present = abs(presence - 1) < 0.01
            is_absent = abs(presence - 0) < 0.01

            # The second branch of each pair uses <= so that an ancestral value
            # of exactly 0.9 (or 0.1) is still classified; with the former
            # strict < such species matched no branch and were silently dropped.
            # Rows whose presence is neither ~0 nor ~1 are still skipped.
            if is_present and anc_presence > 0.9:
                handle.write(
                    species + " " + color_present + " present_"+str(anc_presence)+"\n"
                )
            elif is_present and anc_presence <= 0.9:
                handle.write(
                    species + " " + color_present_newly + " newly_present_"+str(anc_presence)+"\n"
                )
            elif is_absent and anc_presence > 0.1:
                handle.write(
                    species + " " + color_absent_newly + " newly_absent_"+str(anc_presence)+"\n"
                )
            elif is_absent and anc_presence <= 0.1:
                handle.write(
                    species + " " + color_absent + " absent_"+str(anc_presence)+"\n"
                )

#### Combine dataframes

In [None]:
# Attach every prediction row to the presence table of the analysed species
# (left join: pairs without predictions are kept).
df_ko_sp_presence_ext_future = df_ko_sp_presence_ext.merge(
    df_future,
    how = 'left',
    on = ['KO', 'species'],
)
df_ko_sp_presence_ext_future

In [None]:
# Add the possession ratio of each (KO, species) pair (left join: pairs without
# a ratio get NaN at this point).
df_ko_sp_presence_ext_future_posratio = df_ko_sp_presence_ext_future.merge(
    df_ko_sp_posratio,
    how = 'left',
    on = ['KO', 'species'],
)
df_ko_sp_presence_ext_future_posratio

In [None]:
# Pairs with no possession-ratio entry are given a possession ratio of 0.
df_ko_sp_presence_ext_future_posratio = df_ko_sp_presence_ext_future_posratio.assign(
    posratio = df_ko_sp_presence_ext_future_posratio['posratio'].fillna(0)
)
df_ko_sp_presence_ext_future_posratio

#### Histogram of gene possession ratio

In [None]:
# Rows of one fixed model configuration. `target` is deliberately NOT filtered
# (#(...['target']=='gain') was left disabled), so rows of every target pass and
# the resulting repeats are removed by drop_duplicates() below.
is_reference_config = (
    (df_ko_sp_presence_ext_future_posratio['pred_method'] == 'LR')
    & (df_ko_sp_presence_ext_future_posratio['selec_method'] == 'ANOVA')
    & (df_ko_sp_presence_ext_future_posratio['opt_mode'] == 'N_opt_for_AUC_of_the_OG')
)
df_presence_posratio = df_ko_sp_presence_ext_future_posratio.loc[
    is_reference_config, ['KO', 'species', 'presence', 'posratio']
]

# Keep only KOs that belong to exactly one category and attach that category.
df_presence_posratio_category = df_presence_posratio.merge(df_uniquecategory_ko, on = 'KO').drop_duplicates()
df_presence_posratio_category

In [None]:
# (KO, species) pairs with presence 0 but a possession ratio above 0.
is_absent_but_in_pangenome = (
    (df_presence_posratio_category["presence"] == 0)
    & (df_presence_posratio_category["posratio"] > 0)
)
Nnot_possessing_in_KEGG_but_possessed_by_some_strain = len(df_presence_posratio_category[is_absent_but_in_pangenome])

# NOTE(review): 239058 is a hard-coded denominator -- presumably the total number
# of (KO, species) pairs; confirm it still matches the current data.
Nnot_possessing_in_KEGG_but_possessed_by_some_strain / 239058

In [None]:


# One possession-ratio histogram per category, plus one over all categories
# ('any'). Each figure has two panels: a narrow left panel for pairs with
# possession ratio exactly 0 and a main panel for ratios above 0. Bars are
# stacked by whether presence > 0.
for category in list(set(df_presence_posratio_category['category']))+['any']:

    #if(True):
    #category = '09105 Amino acid metabolism'

    if category != 'any':
        df_presence_posratio_category_ext = df_presence_posratio_category[df_presence_posratio_category['category'] == category]
    else:
        df_presence_posratio_category_ext = df_presence_posratio_category

    has_presence = df_presence_posratio_category_ext['presence'] > 0
    ratio_positive = df_presence_posratio_category_ext['posratio'] > 0
    ratio_zero = df_presence_posratio_category_ext['posratio'] == 0

    fig = plt.figure(figsize=(2.5,1.7))

    # main panel: possession ratio > 0
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.hist(
        [
            df_presence_posratio_category_ext.loc[ratio_positive & has_presence, 'posratio'],
            df_presence_posratio_category_ext.loc[ratio_positive & ~has_presence, 'posratio'],
        ],
        bins=40, color=['#75868C', "#DDE2E0"], stacked=True
    )

    #ax.set_yscale("log")
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.set_xlabel("Possesion ratio")
    ax.set_title(category)

    # left panel: possession ratio == 0, drawn as a single bin
    ax2 = fig.add_axes([-0.12,0.1,0.03,0.8])
    ax2.hist(
        [
            df_presence_posratio_category_ext.loc[ratio_zero & has_presence, 'posratio'],
            df_presence_posratio_category_ext.loc[ratio_zero & ~has_presence, 'posratio'],
        ],
        bins=1, color=['#75868C', "#DDE2E0"], stacked=True
    )

    for side in ('right', 'top'):
        ax2.spines[side].set_visible(False)
    ax2.set_ylabel("# of (OG, Species)")
    ax2.set_xticks([0])

    plt.savefig("figures/posratio_distribution."+category+".pdf",bbox_inches='tight')
    #plt.show()
    plt.close()

#### Visualize ongoing gene gain/loss prediction by scatter plots

In [None]:
target = 'gain'

# A species counts as "gaining" a KO when its possession ratio is >= threshold.
# NOTE(review): the AUC grid further below uses a strict `> threshold` for gain
# -- confirm which comparison is intended.
threshold = 0.10

# Predictions of the reference configuration (LR / ANOVA / per-OG optimum).
is_selected = (
    (df_ko_sp_presence_ext_future_posratio['target']==target) &
    (df_ko_sp_presence_ext_future_posratio['pred_method']=='LR') &
    (df_ko_sp_presence_ext_future_posratio['selec_method']=='ANOVA') &
    (df_ko_sp_presence_ext_future_posratio['opt_mode']=='N_opt_for_AUC_of_the_OG')
)
df_presence_posratio = df_ko_sp_presence_ext_future_posratio.loc[
    is_selected, ['KO', 'species', 'presence', 'anc_presence', 'posratio', 'prob']
]

# Only pairs where the KO is absent in both the species and its parent node.
df_presence_posratio_to_be_analyzed = df_presence_posratio[(df_presence_posratio['presence']==0) & (df_presence_posratio['anc_presence']==0)]

gaining_nongaining_list = []

all_count = 0   # KOs examined
count = 0       # KOs omitted for having too few species in either group

# groupby hands over each KO's rows in a single pass; the earlier version
# re-filtered the whole frame once per KO.
for ko, df_presence_posratio_to_be_analyzed_ko in df_presence_posratio_to_be_analyzed.groupby('KO', sort = False):

    all_count += 1

    df_doing = df_presence_posratio_to_be_analyzed_ko[df_presence_posratio_to_be_analyzed_ko['posratio'] >= threshold]
    df_nondoing = df_presence_posratio_to_be_analyzed_ko[df_presence_posratio_to_be_analyzed_ko['posratio'] < threshold]

    # require at least two species on each side
    if (len(df_doing) > 1 and len(df_nondoing) > 1):
        gaining_nongaining_list.append(
            [
                ko,
                np.median(df_doing.prob),
                np.median(df_nondoing.prob)
            ]
        )
    else:
        count += 1

print("Omitted "+str(count)+" OGs in "+str(all_count)+" OGs")
df_pred_result = pd.DataFrame(gaining_nongaining_list, columns = ['KO', 'doing_medprob', 'nondoing_medprob'])

In [None]:
# Scatter of median predicted gain probability: gaining vs non-gaining species,
# one point per KO, on log-log axes.
fig = plt.figure(figsize=(3,3))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

# y = x reference line
x = np.arange(0, 10)
y = x
ax.plot(x, y, color = "#AAAAAA", alpha = 0.5)

ax.set_xlabel("Median gain prob. predicted\nfor gaining species")#,color='#C0C0C0')
ax.set_ylabel("Median gain prob. predicted\nfor non-gaining species")#,color='#544DE6')

# Floor the probabilities at 0.0001 so every point fits on the log axes.
min_lim = 0.0001
for medprob_column in ('doing_medprob', 'nondoing_medprob'):
    df_pred_result[medprob_column] = df_pred_result[medprob_column].clip(lower = 0.0001)
ax.scatter(df_pred_result["doing_medprob"], df_pred_result["nondoing_medprob"],alpha=0.5, s = 5, color ="#645FCA")#color=df_testresult['color_0to50vs50to100'])

ax.set_xlim(min_lim,1.1)
ax.set_ylim(min_lim,1.1)
ax.set_xscale('log')
ax.set_yscale('log')
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

plt.savefig("figures/gaining.any.pdf",bbox_inches='tight')

In [None]:
target = 'loss'

gaining_nongaining_list = []

# The model-configuration filter and the presence filter do not depend on the
# threshold, so they are computed once here instead of once per threshold.
is_selected = (
    (df_ko_sp_presence_ext_future_posratio['target']==target) &
    (df_ko_sp_presence_ext_future_posratio['pred_method']=='LR') &
    (df_ko_sp_presence_ext_future_posratio['selec_method']=='ANOVA') &
    (df_ko_sp_presence_ext_future_posratio['opt_mode']=='N_opt_for_AUC_of_the_OG')
)
df_presence_posratio = df_ko_sp_presence_ext_future_posratio.loc[
    is_selected, ['KO', 'species', 'presence', 'anc_presence', 'posratio', 'prob']
]

# Only pairs where the KO is present in both the species and its parent node.
df_presence_posratio_to_be_analyzed = df_presence_posratio[(df_presence_posratio['presence']==1) & (df_presence_posratio['anc_presence']==1)]

# Split by KO once; the groups are reused for every threshold.
ko_groups = list(df_presence_posratio_to_be_analyzed.groupby('KO', sort = False))

for threshold in [0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]:

    all_count = 0   # KOs examined at this threshold
    count = 0       # KOs omitted for having too few species in either group

    for ko, df_presence_posratio_to_be_analyzed_ko in ko_groups:

        all_count += 1

        # A species counts as "losing" the KO when its possession ratio is
        # below the threshold.
        is_doing = df_presence_posratio_to_be_analyzed_ko['posratio'] < threshold

        df_doing = df_presence_posratio_to_be_analyzed_ko[is_doing]
        df_nondoing = df_presence_posratio_to_be_analyzed_ko[df_presence_posratio_to_be_analyzed_ko['posratio'] >= threshold]

        # require at least two species on each side
        if (len(df_doing) > 1 and len(df_nondoing) > 1):
            gaining_nongaining_list.append(
                [
                    threshold,
                    ko,
                    np.median(df_doing.prob),
                    np.median(df_nondoing.prob),
                    roc_auc_score(is_doing, df_presence_posratio_to_be_analyzed_ko["prob"])
                ]
            )
        else:
            count += 1

    print("Omitted "+str(count)+" OGs in "+str(all_count)+" OGs")

df_pred_result = pd.DataFrame(gaining_nongaining_list, columns = ['threshold', 'KO', 'doing_medprob', 'nondoing_medprob', 'auc'])

In [None]:
# Display the loss-prediction result table built in the previous cell.
df_pred_result

In [None]:
# Scatter of median predicted loss probability: losing vs non-losing species,
# one point per KO, at a single possession-ratio threshold.
threshold = 0.90
# .copy() so the column assignments below act on an independent frame rather
# than on a filtered view of df_pred_result (avoids SettingWithCopyWarning).
df_pred_result_ext = df_pred_result[df_pred_result['threshold'] == threshold].copy()

fig = plt.figure(figsize=(3,3))
ax = fig.add_axes([0.1,0.1,0.8,0.8])

# y = x reference line
x = np.arange(0, 10)
y = x
ax.plot(x, y, color = "#AAAAAA", alpha = 0.5)

# Labels previously read "gaining / non-gaining species" (copied from the gain
# plot), although this figure shows loss predictions.
ax.set_xlabel("Median loss prob. predicted\nfor losing species")#,color='#C0C0C0')
ax.set_ylabel("Median loss prob. predicted\nfor non-losing species")#,color='#544DE6')

# Floor the probabilities at min_lim so every point fits on the log axes.
min_lim = 0.0001
df_pred_result_ext['doing_medprob'] = df_pred_result_ext['doing_medprob'].clip(lower = min_lim)
df_pred_result_ext['nondoing_medprob'] = df_pred_result_ext['nondoing_medprob'].clip(lower = min_lim)
ax.scatter(df_pred_result_ext["doing_medprob"], df_pred_result_ext["nondoing_medprob"],alpha=0.5, s = 5, color ='#E1BB63')#color=df_testresult['color_0to50vs50to100'])

ax.set_xlim(min_lim,1.1)
ax.set_ylim(min_lim,1.1)
ax.set_xscale('log')
ax.set_yscale('log')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.savefig("figures/losing.any.pdf",bbox_inches='tight')

#### Draw histogram of AUCs by function, prediction method, and threshold

##### Time-consuming! calculate AUC for each threshold

In [None]:
# For every model configuration, target and possession-ratio threshold,
# compute per KO: the number of "doing" species, the median predicted
# probability in the doing / non-doing groups, and the ROC AUC of the
# predicted probability against the doing label.
pred_result_list = []

for pred_method in ['LR', 'RF']:
    for selec_method in ['ANOVA', 'RandomForest']:
        for opt_mode in ['N_opt_for_AUC_of_the_OG', 'N_opt_for_median_AUC']:

            for target, threshold_list in [('gain', [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]), ('loss', [0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99])]:

                # Both filters below are independent of the threshold, so they
                # are evaluated once per configuration, not once per threshold.
                is_selected = (
                    (df_ko_sp_presence_ext_future_posratio['target']==target) &
                    (df_ko_sp_presence_ext_future_posratio['pred_method']==pred_method) &
                    (df_ko_sp_presence_ext_future_posratio['selec_method']==selec_method) &
                    (df_ko_sp_presence_ext_future_posratio['opt_mode']==opt_mode)
                )
                df_presence_posratio = df_ko_sp_presence_ext_future_posratio.loc[
                    is_selected, ['KO', 'species', 'presence', 'anc_presence', 'posratio', 'prob']
                ]

                #########Difference between 'gain' and 'loss'#######
                # gain: KO absent in species and parent; loss: present in both
                if target == 'gain':
                    df_presence_posratio_to_be_analyzed = df_presence_posratio[(df_presence_posratio['presence']==0) & (df_presence_posratio['anc_presence']==0)]
                elif target == 'loss':
                    df_presence_posratio_to_be_analyzed = df_presence_posratio[(df_presence_posratio['presence']==1) & (df_presence_posratio['anc_presence']==1)]
                #########################################

                # Split by KO once and reuse the groups for every threshold
                # (replaces a full-frame scan per KO per threshold).
                ko_groups = list(df_presence_posratio_to_be_analyzed.groupby('KO', sort = False))

                for threshold in threshold_list:

                    all_count = 0   # KOs examined
                    count = 0       # KOs omitted for having too few species in either group

                    for ko, df_presence_posratio_to_be_analyzed_ko in ko_groups:

                        all_count += 1

                        #########Difference between 'gain' and 'loss'#######
                        if target == 'gain':
                            is_doing = df_presence_posratio_to_be_analyzed_ko['posratio'] > threshold
                        elif target == 'loss':
                            is_doing = df_presence_posratio_to_be_analyzed_ko['posratio'] < threshold
                        #########################################

                        df_doing = df_presence_posratio_to_be_analyzed_ko[is_doing]
                        df_nondoing = df_presence_posratio_to_be_analyzed_ko[~is_doing]

                        Ndoing = len(df_doing)

                        # require at least two species on each side
                        if (len(df_doing) > 1 and len(df_nondoing) > 1):
                            pred_result_list.append(
                                [
                                    target,
                                    pred_method,
                                    selec_method,
                                    opt_mode,
                                    threshold,
                                    ko,
                                    Ndoing,
                                    np.median(df_doing.prob),
                                    np.median(df_nondoing.prob),
                                    roc_auc_score(is_doing, df_presence_posratio_to_be_analyzed_ko["prob"])
                                ]
                            )
                        else:
                            count += 1

                    print("Omitted "+str(count)+" OGs in "+str(all_count)+" OGs")

# Persist the table so the plots below can be redrawn without recomputing.
df_pred_result = pd.DataFrame(pred_result_list, columns = ['target','pred_method','selec_method','opt_mode','threshold', 'KO', 'Ndoing', 'doing_medprob', 'nondoing_medprob', 'auc'])
df_pred_result.to_csv('tables/pred_result.txt',sep = '\t', index = False)

In [None]:
# Reload the cached AUC table written by the previous cell (tab-separated,
# with a header row).
df_pred_result = pd.read_table('tables/pred_result.txt')
df_pred_result

In [None]:
# AUC distribution per possession-ratio threshold: one figure for each
# (prediction method, selection method, optimisation mode, target).
for pred_method in ['LR', 'RF']:
    for selec_method in ['ANOVA', 'RandomForest']:
        for opt_mode in ['N_opt_for_AUC_of_the_OG', 'N_opt_for_median_AUC']:

            for target, color  in [('gain',"#645FCA"),('loss',  '#E1BB63')]:

                is_this_config = (
                    (df_pred_result['target']==target) &
                    (df_pred_result['pred_method']==pred_method) &
                    (df_pred_result['selec_method']==selec_method) &
                    (df_pred_result['opt_mode']==opt_mode)
                )
                df_pred_result_ext = df_pred_result[is_this_config]

                fig = plt.figure(figsize=(4,1.5))
                ax = fig.add_axes([0.1,0.1,0.8,0.8])
                #sns.violinplot(data = df_pred_result, y = 'auc', linewidth=0.5, color = '#FFFFFF', alpha = 0)
                #sns.stripplot (data = df_pred_result, y = 'auc', linewidth=0,    size = 1, alpha=.5,jitter=0.3, dodge=True, palette=[color])

                # violin outline plus the individual per-KO AUCs on top
                # (ax is the current axes, so ax=ax draws where it drew before)
                sns.violinplot(data = df_pred_result_ext, x = 'threshold', y = 'auc', linewidth=0.5, color = '#FFFFFF', alpha = 0, ax = ax)
                sns.stripplot (data = df_pred_result_ext, x = 'threshold', y = 'auc', linewidth=0, size = 1, alpha=.5,jitter=0.3, dodge=True, palette=[color], ax = ax)
                ax.tick_params(axis='x', labelrotation= 90)
                ax.set_title(" ".join([pred_method, selec_method, opt_mode, target]))
                #ax.get_legend().remove()
                ax.set_xlabel('Threshold')
                ax.set_ylabel('AUC')

                ax.set_ylim(-0.05, 1.05)

                plt.savefig(".".join(["figures/threshold_AUC", pred_method, selec_method, opt_mode, target, "pdf"]), bbox_inches = 'tight')
                #plt.show()
                plt.close()
                    

In [None]:
# AUC distribution per functional category (KOs with exactly one category only):
# one figure for each (prediction method, selection method, optimisation mode, target).
df_pred_result_category = pd.merge(df_pred_result, df_uniquecategory_ko, on = 'KO')

for pred_method in ['LR', 'RF']:
    for selec_method in ['ANOVA', 'RandomForest']:
        for opt_mode in ['N_opt_for_AUC_of_the_OG', 'N_opt_for_median_AUC']:

            is_this_config = (
                (df_pred_result_category['pred_method']==pred_method) &
                (df_pred_result_category['selec_method']==selec_method) &
                (df_pred_result_category['opt_mode']==opt_mode)
            )

            # Category order shared by the gain and loss figures: descending
            # median AUC of the gain prediction at threshold 0.05. It does not
            # depend on the target, so it is computed once per configuration.
            # The median is taken of the 'auc' column only: a frame-wide
            # .median() also hits the string columns and raises TypeError on
            # pandas >= 2.0.
            df_order_reference = df_pred_result_category[
                is_this_config &
                (df_pred_result_category['target']=="gain") &
                (df_pred_result_category['threshold']==0.05)
            ]
            category_order = list(reversed(list(
                df_order_reference.groupby("category")["auc"].median().sort_values().index
            )))

            for target, color, threshold  in [('gain',"#645FCA", 0.05),('loss',  '#E1BB63', 0.90)]:

                df_pred_result_ext = df_pred_result_category[
                    is_this_config &
                    (df_pred_result_category['target']==target) &
                    (df_pred_result_category['threshold']==threshold)
                ]

                fig = plt.figure(figsize=(1.8,3))
                ax = fig.add_axes([0.1,0.1,0.8,0.8])
                #sns.violinplot(data = df_pred_result, y = 'auc', linewidth=0.5, color = '#FFFFFF', alpha = 0)
                #sns.stripplot (data = df_pred_result, y = 'auc', linewidth=0,    size = 1, alpha=.5,jitter=0.3, dodge=True, palette=[color])

                sns.boxplot(data = df_pred_result_ext, y = 'category', x = 'auc', order = category_order, orient = "h", linewidth=0.5, color = '#FFFFFF',sym="", ax = ax) # does not show outliers
                #sns.violinplot(data = df_pred_result_ext, x = 'category', y = 'auc', order = sorted(list(set(df_pred_result_ext['category']))), linewidth=0.5, color = '#FFFFFF', alpha = 0)
                sns.stripplot (data = df_pred_result_ext, y = 'category', x = 'auc', order = category_order,orient = "h",  linewidth=0, size = 2, alpha=.5,jitter=0.3, dodge=True, palette=[color], ax = ax)
                ax.set_title(pred_method+" "+selec_method+" "+opt_mode+" "+target)
                #ax.get_legend().remove()
                ax.set_ylabel('Category')
                ax.set_xlabel('AUC')

                ax.set_xlim(-0.05, 1.05)

                plt.savefig("figures/category_AUC."+pred_method+"."+selec_method+"."+opt_mode+"."+target+".pdf", bbox_inches = 'tight')
                #plt.show()
                plt.close()

##### Gene gain and loss

In [None]:
# Model configuration shown in the figures below.
pred_method = "RF"
selec_method = "ANOVA"
opt_mode = 'N_opt_for_AUC_of_the_OG'

#target = "gain"
#threshold = 0.05

# NOTE(review): gain uses threshold 0 here while other cells use 0.05 for gain
# -- confirm this is intentional.
for target, threshold, color in [("gain", 0, "#645FCA"), ("loss", 0.90, '#E1BB63')]:

    is_this_config = (
        (df_pred_result['target']==target) &
        (df_pred_result['pred_method']==pred_method) &
        (df_pred_result['selec_method']==selec_method) &
        (df_pred_result['opt_mode']==opt_mode) &
        (df_pred_result['threshold']==threshold)
    )
    df_pred_result_ext = df_pred_result[is_this_config]

    fig = plt.figure(figsize=(2.3,1.2))

    # upper panel: density of the per-KO AUCs
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    for side in ('right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # lower panel: the individual AUC values as a jittered strip
    ax2 = fig.add_axes([0.1,-0.1,0.8,0.18])
    for side in ('right', 'top', 'left'):
        ax2.spines[side].set_visible(False)

    ax.tick_params(bottom = False, labelbottom = False)
    ax2.tick_params(left = False,)

    sns.kdeplot(df_pred_result_ext["auc"],color = color, ax = ax, fill=True)
    # ax2 is the current axes at this point, i.e. where the strip was drawn before
    sns.stripplot(data = df_pred_result_ext, x = "auc", color = color, size = 1, alpha=.5,jitter=0.4, ax = ax2)

    ax.set_xlim(-0.1,1.1)
    ax2.set_xlim(-0.1,1.1)
    ax2.set_xlabel("AUC of "+target+" prediction")

    ax.set_title(target)
    ax.set_ylim(0,3)

    # vertical reference line at AUC = 0.5
    ax.axvline(0.5, alpha = 0.5, lw = 0.75, color = "#000000")

    plt.savefig(".".join(["figures/kdeplot_AUC", pred_method, selec_method, opt_mode, target, "pdf"]), bbox_inches = 'tight')
    plt.show()
    plt.close()

    # one-sample t-test of the AUCs against 0.5
    print(target, threshold, stats.ttest_1samp(df_pred_result_ext["auc"], 0.5))

In [None]:
# for iTOL
# Write one DATASET_MULTIBAR annotation file per selected KO: the predicted
# probability goes into field 1 for "doing" species and field 2 for the others.

#target = 'gain'
#threshold = 0.05
#KO_list = ['K05898', 'K16842', 'K01214', 'K02805', 'K18824']

# Model configuration used BOTH for selecting rows and for naming the files.
# Previously the filter hard-coded these values while the file name read
# whatever pred_method / selec_method / opt_mode an earlier cell had left
# behind, so the name could disagree with the content.
pred_method = 'RF'
selec_method = 'ANOVA'
opt_mode = 'N_opt_for_AUC_of_the_OG'

for target, threshold, KO_list in [('gain', 0.05, ['K05898', 'K16842', 'K01214', 'K02805', 'K18824','K01601']), ('loss', 0.90, ['K01523', 'K19270', 'K01664', 'K00336', 'K00991'])]:

    for KO in KO_list:

        target_df = df_ko_sp_presence_ext_future_posratio[
            (df_ko_sp_presence_ext_future_posratio['target']==target) &
            (df_ko_sp_presence_ext_future_posratio['KO']==KO) &
            (df_ko_sp_presence_ext_future_posratio['pred_method']==pred_method) &
            (df_ko_sp_presence_ext_future_posratio['selec_method']==selec_method) &
            (df_ko_sp_presence_ext_future_posratio['opt_mode']==opt_mode)
        ]

        # gain: KO absent in species and parent, "doing" = posratio above threshold
        # loss: KO present in both,               "doing" = posratio below threshold
        if target == 'gain':
            target_df = target_df[(target_df['presence']==0)&(target_df['anc_presence']==0)]
            target_df = target_df.reset_index()
            target_df['doing'] = target_df['posratio'] > threshold
        elif target =='loss':
            target_df = target_df[(target_df['presence']==1)&(target_df['anc_presence']==1)]
            target_df = target_df.reset_index()
            target_df['doing'] = target_df['posratio'] < threshold

        with open("itol/evo_pred/bar_prob."+KO+"."+pred_method+"."+selec_method+"."+opt_mode+"."+target+".txt", 'w') as handle:
            handle.write(
                "DATASET_MULTIBAR\nSEPARATOR TAB\nDATASET_LABEL\t"+KO+" "+target+" prob\nCOLOR\t#A0A5EC\nFIELD_COLORS\t#8682E7\t#AAAAAA\nFIELD_LABELS\tf1\tf2\nALIGN_FIELDS\t0\nMARGIN\t0\nWIDTH\t100\nDATA\n"
            )
            # iterate the three columns in lock-step (the earlier version
            # rebuilt three full lists for every row)
            for name, prob, doing in zip(target_df["species"], target_df["prob"], target_df["doing"]):
                if doing:
                    handle.write(name+"\t"+str(prob)+"\t0\n")
                else:
                    handle.write(name+"\t0\t"+str(prob)+"\n")