In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolor
import sys
import os
import pandas as pd
import numpy as np
from Bio import Phylo
from scipy.stats import gaussian_kde, ttest_rel
import json

In [None]:
# Global matplotlib styling applied to every figure drawn in this notebook.
matplotlib.rcParams.update({
    'font.family':       'Arial',
    'font.sans-serif':   ["Arial","DejaVu Sans","Lucida Grande","Verdana"],
    'figure.figsize':    [4,3],
    'font.size':         10,
    'axes.labelcolor':   "#000000",
    'axes.linewidth':    1.0,
    'xtick.major.width': 1.0,
    'ytick.major.width': 1.0,
})

# Qualitative colormaps; cmap1 supplies the phylum palette built further down.
cmap1, cmap2 = plt.cm.tab20, plt.cm.Set3

In [None]:
# Work from the experiment directory so the relative output paths used below resolve.
os.chdir("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150")

# Create the output folders. `exist_ok=True` tolerates folders that are already
# present while still surfacing genuine failures (e.g. permission errors), which
# a bare `except` would silently hide. The loop variable is not called `dir`
# so the builtin of that name stays usable in later cells.
for out_dir in ["figures", "itol", 'table', "itol/md", "itol/md_loss", "itol/md_loss/blue", "itol/md_loss/red", "itol/md_loss/cyan"]:
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Classes of KOs

table_dir = "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151"

# Two-column, header-less link tables (pd.read_table defaults to tab-separated input).
df_path_ko = pd.read_table(table_dir + "/tables/path_ko.txt", names = ['Pathway', 'KO'])
df_rn_ko = pd.read_table(table_dir + "/tables/rn_ko.txt", names = ['Reaction','KO'])
df_md_ko = pd.read_table(table_dir + "/tables/md_ko.txt", names = ['Module','KO'])
df_path_md = pd.read_table(table_dir + "/tables/path_md.txt", names = ['Pathway','Module'])

# Nested ontology (dicts with 'name' and optional 'children', as consumed below).
# The context manager closes the file handle instead of leaving it to the
# garbage collector.
with open("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0151/json/ko00001.json") as ontology_file:
    ontology = json.load(ontology_file)

# Convert the nested ontology dict into a Bio.Phylo tree. An explicit work list
# of (dict node, matching clade) pairs is used instead of recursion; children
# are attached in the order in which they appear in the JSON.
root_clade = Phylo.BaseTree.Clade(name=ontology['name'])
stack = [(ontology, root_clade)]

while stack:
    term, clade = stack.pop()
    children = term.get('children', [])
    child_clades = [Phylo.BaseTree.Clade(name=child['name']) for child in children]
    clade.clades.extend(child_clades)
    stack.extend(zip(children, child_clades))

ontology_tree = Phylo.BaseTree.Tree(root_clade)

# Collect (category, KO) pairs. The categories are the children of the first
# top-level branch of the ontology; every terminal below a category whose first
# whitespace-separated token starts with 'K' is recorded as a KO of that category.
# (The collection loop used to be pasted twice, appending every pair a second
# time only for the duplicates to be removed again; one pass is sufficient.)
list_category_ko = []
for clade in ontology_tree.clade.clades[0].clades:
    for tip in clade.get_terminals():
        KO = tip.name.split()[0]
        if KO.startswith('K'):
            list_category_ko.append([clade.name, KO])

# A KO can be listed several times under the same category; keep the first
# occurrence of each (category, KO) pair.
df_category_ko = pd.DataFrame(list_category_ko, columns = ['category', 'KO']).drop_duplicates()

# Number of categories each KO is assigned to. The counts are kept as a Series
# because the column name produced by wrapping value_counts() in a DataFrame
# depends on the pandas version ('KO' before 2.0, 'count' from 2.0 on), which
# made the former `df_ko_count['KO']` lookup raise KeyError on current pandas.
ko_category_counts = df_category_ko['KO'].value_counts()
df_ko_count = pd.DataFrame({'KO': ko_category_counts})  # same 'KO' count column on every pandas version

# KOs that belong to exactly one category.
set_ko_with_unique_category = set(ko_category_counts[ko_category_counts == 1].index)
df_category_ko['unique'] = df_category_ko['KO'].isin(set_ko_with_unique_category)
df_uniquecategory_ko = df_category_ko[df_category_ko['unique']]

# Colours of the function categories

colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3', '#555555', '#FC8D62', '#8DA0CB', '#E78AC3', '#66C2A5', '#FC8D62', '#000000']

cm_name = 'Set3'
cm = plt.get_cmap(cm_name)

# For every module, pick the category that contributes the largest number of KOs.
df_category_ko_module = df_category_ko.merge(df_md_ko, on = 'KO')
df_category_ko_module['Nko'] = 1
df_category_module_count = df_category_ko_module.groupby(['category', 'Module'], as_index = False).sum()
top_module_rows = df_category_module_count.groupby('Module')['Nko'].idxmax()
df_maxcategory_module = (
    df_category_module_count.loc[top_module_rows, :]
    .sort_values('category')
    .reset_index()[['category', 'Module']]
)

# Number the categories in order of first appearance and give each the
# colormap entry at that integer index.
categories_in_order = df_maxcategory_module.category.unique()
df_category_color = pd.DataFrame(
    [[category, category_id] for category_id, category in enumerate(categories_in_order)],
    columns = ["category", 'category_id'],
)
df_category_color['color'] = [mcolor.rgb2hex(cm(category_id)) for category_id in df_category_color['category_id']]

# For every pathway, pick the category that contributes the largest number of KOs.
df_category_ko_pathway = df_category_ko.merge(df_path_ko, on = 'KO')
df_category_ko_pathway['Nko'] = 1
df_category_pathway_count = df_category_ko_pathway.groupby(['category', 'Pathway'], as_index = False).sum()
top_pathway_rows = df_category_pathway_count.groupby('Pathway')['Nko'].idxmax()
df_maxcategory_pathway = (
    df_category_pathway_count.loc[top_pathway_rows, :]
    .sort_values('category')
    .reset_index()[['category', 'Pathway']]
)

# KO descriptions (two-column, header-less table).
df_ko_desc = pd.read_table(table_dir + "/tables/ko_desc.txt", names = ['KO', 'Description'])
df_uniquecategory_ko

In [None]:
def rgb2html(R, G, B):
    """Return the HTML colour string '#rrggbb' for three integer channel values.

    Each channel is written as lowercase hexadecimal, zero-padded to two digits.
    """
    channels = (R, G, B)
    return '#%02x%02x%02x' % channels

# Palette of 50 colours: the even cmap1 indices (0, 2, ..., 18) first, then the
# odd ones (1, 3, ..., 19), followed by 30 light-grey entries.
color_list = []
for offset in (0, 1):
    for pair in range(10):
        # cmap1 returns floats in [0, 1]; scale to 0-255 and truncate to int.
        red, green, blue = (int(channel) for channel in np.array(cmap1(2*pair + offset))[:3] * 255)
        color_list.append(rgb2html(red, green, blue))
color_list.extend(["#D9D9D9"] * 30)

In [None]:
# Species count per phylum, largest phylum first. mergesort is a stable sort,
# so phyla with equal counts keep their order from the file.
df_phylum_Nspecies = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/list/phylum_Ntips.txt", names = ['Phylum', 'Nspecies'])
df_phylum_Nspecies = df_phylum_Nspecies.sort_values('Nspecies', ascending=False, kind = 'mergesort')

# One colour per phylum in rank order. The palette is padded with the same light
# grey used for its tail, or trimmed, to the number of phyla, so the assignment
# no longer fails when the table does not have exactly len(color_list) rows.
n_phyla = len(df_phylum_Nspecies)
grey_padding = ["#D9D9D9"] * max(0, n_phyla - len(color_list))
df_phylum_Nspecies['color'] = (color_list + grey_padding)[:n_phyla]

# Display name: the phylum label without its "p__" marker.
df_phylum_Nspecies['phylum_name'] = df_phylum_Nspecies['Phylum'].str.replace("p__", "", regex=False)
df_phylum_Nspecies.to_csv("table/phylum_Nspecies_color_name.txt", index = False)
df_phylum_Nspecies

In [None]:
# Number of species analyzed in this study, by phylum (log-scaled bar chart)
fig = plt.figure(figsize=(9,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.bar(
    x = df_phylum_Nspecies['phylum_name'],
    height = df_phylum_Nspecies['Nspecies'],
    color = df_phylum_Nspecies['color'],
)
ax.set_yscale("log")
ax.tick_params(axis='x', labelrotation= 90)
ax.set_xlim(-1,50)
ax.set_xlabel("Phylum")
ax.set_ylabel("#species")
fig.savefig("figures/NK_M0150_Nspecies_by_phlum.pdf",bbox_inches='tight')
plt.close(fig)

In [None]:
# Taxonomy table: one ID column followed by the seven ranks from Domain to Species.
taxonomy_columns = ["ID", "Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
df_taxonomy = pd.read_table(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/list/gn_taxgtdb2.txt",
    names = taxonomy_columns,
)

In [None]:
# Attach the per-phylum columns (species count, colour, display name) to every taxonomy row.
df_ID_tax_merge = df_taxonomy.merge(df_phylum_Nspecies, on = "Phylum")
df_ID_tax_merge.to_csv("table/sp_taxonomy.txt", sep="\t", index=False)
df_ID_tax_merge

In [None]:
# iTOL colour-strip dataset: a fixed header followed by one
# "<ID> <colour> <phylum>" line per row of the merged taxonomy table.
with open("itol/phylum_color.txt", 'w') as handle:
    handle.write(
        "DATASET_COLORSTRIP\nSEPARATOR SPACE\nDATASET_LABEL label1\nCOLOR #ff0000\nDATA\n"
    )

    rows = df_ID_tax_merge[["ID", "Phylum", "color"]].itertuples(index=False, name=None)
    handle.writelines(
        keggid + " " + color + " " + phylum + "\n"
        for keggid, phylum, color in rows
    )

In [None]:
# Newick trees of the representative genomes: the rooted 'mlgtdb' tree and the 'nj' tree.
tree_ml = Phylo.read(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/tree/bac120_msa_r89.faa.mlgtdb.representative.renamed.rooted.nwk",
    format='newick',
)
tree_nj = Phylo.read(
    "/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/tree/bac120_msa_r89.faa.nj.representative.renamed.nwk",
    format='newick',
)

In [None]:
# Tip names below the first two children of the 'mlgtdb' tree root.
ml_root_children = tree_ml.clade.clades
tip_set_ml0 = {tip.name for tip in ml_root_children[0].get_terminals()}
tip_set_ml1 = {tip.name for tip in ml_root_children[1].get_terminals()}
print(len(tip_set_ml0), len(tip_set_ml1))

In [None]:
# For every internal clade of the 'nj' tree, record how far its tip set is from
# either side of the 'mlgtdb' root split: the number of tips found in exactly
# one of the two sets, taking the closer of the two sides.
clade_overlap = []
for clade in tree_nj.get_nonterminals():
    tip_set_nj = {tip.name for tip in clade.get_terminals()}
    # |A| + |B| - 2*|A & B| is the size of the symmetric difference A ^ B.
    transfer_distance = min(
        len(tip_set_ml0 ^ tip_set_nj),
        len(tip_set_ml1 ^ tip_set_nj),
    )
    clade_overlap.append([clade.name, transfer_distance])

In [None]:
# Tabulate the distances and show the closest clades first.
overlap_columns = ["clade", "Transfer_distance"]
df_clade_overlap = pd.DataFrame(clade_overlap, columns = overlap_columns)
df_clade_overlap.sort_values(by = "Transfer_distance")

In [None]:
# Distribution of the ratio of undetermined states for each KO

tree_method = 'mlgtdb'
acr  = 'MPPA'

# Names of the tree tips; every other node name in the state table is treated as ancestral.
tree = Phylo.read('/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/tree/bac120_msa_r89.faa.'+tree_method+'.representative.renamed.rooted.nwk','newick')
list_extant_species = [tip.name for tip in tree.get_terminals()]
set_extant_species = set(list_extant_species)

# (KO, node, state) table for the chosen tree and reconstruction method.
df_KO_species_state = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/result/ko_gn_weight."+tree_method+"_"+acr+".txt", names = ['KO', 'species', 'state'])
df_KO_species_state['extant'] = df_KO_species_state['species'].isin(set_extant_species)

# Keep only KOs listed in the KO table; 'KO' is the only column the two frames share.
df_ko = pd.read_table("/Users/konnonaoki/Documents/backupped/Research/IwasakiLab/Data/MetabolicNetworkEvolution/experiment/NK_M0150/dataset/ko_ko.rn.Bacteria.txt", names = ['KO','metabolicKO'])
df_KO_species_state = df_ko.merge(df_KO_species_state, on = 'KO')
df_KO_species_state

In [None]:
# NOTE(review): hard-coded; presumably the number of tips of `tree`, so that
# Ntips - 1 is the number of internal nodes -- confirm against len(list_extant_species).
Ntips = 2894

# Rows for non-tip nodes only.
df_KO_species_state_anc = df_KO_species_state[~df_KO_species_state['extant']]

# Per KO: number of such rows with state 0.5, divided by Ntips - 1.
undetermined_rows = df_KO_species_state_anc[df_KO_species_state_anc['state']==0.5]
undetermined_per_ko = undetermined_rows.value_counts('KO')
list_undetermined_ratio = list(undetermined_per_ko/(Ntips - 1))
print(len(list_undetermined_ratio))

In [None]:
# KOs without any 0.5-state row are absent from list_undetermined_ratio;
# add a 0 for each of them so the average runs over all KOs.
n_total_kos = len(set(df_KO_species_state.KO))
n_kos_without_undetermined = n_total_kos - len(list_undetermined_ratio)
list_undetermined_ratio_including_zero = list_undetermined_ratio + [0]*n_kos_without_undetermined

average_undetermined_ratio = sum(list_undetermined_ratio_including_zero) / len(list_undetermined_ratio_including_zero)
print("#total KOs:", len(list_undetermined_ratio_including_zero), ", Average undetermined ratio:", average_undetermined_ratio)

In [None]:
# Histogram of the per-KO undetermined ratio (KOs without 0.5 states count as 0).
n_zero_ratio_kos = len(set(df_KO_species_state.KO))-len(list_undetermined_ratio)
ratios_with_zeros = list_undetermined_ratio + [0]*n_zero_ratio_kos

fig = plt.figure(figsize=(2,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.hist(ratios_with_zeros, range = (0,1), bins = 200, color = '#0055FF')
ax.set_xlim(-0.005,0.20)
ax.set_xlabel("Ratio of undetermined states")
ax.set_ylabel("#OGs")
fig.savefig("figures/NK_M0150_undetermined_ratio.pdf",bbox_inches = 'tight')
plt.close(fig)

In [None]:
# One [parent name, child name] row per edge of the 'mlgtdb' tree.
list_parent_child = [
    [node.name, child.name]
    for node in tree_ml.get_nonterminals()
    for child in node.clades
]
df_parent_child = pd.DataFrame(list_parent_child, columns = ['clade', 'child'])

In [None]:
# Count gain and loss events per KO along the parent -> child edges listed in
# df_parent_child. For each edge the parent's and the child's state are looked
# up in df_KO_species_state; an edge counts as a gain when parent == 0 and
# child == 1, and as a loss when parent == 1 and child == 0. Edges with any
# other state combination (e.g. a 0.5 state) are counted as neither.
# NOTE(review): the loop iterates over df_ko.KO values but selects rows through
# the 'metabolicKO' column -- confirm both columns hold interchangeable IDs.
# NOTE(review): iterating a set gives no guaranteed order, so the row order of
# the written table can differ between runs.
list_ko_Ngain_Nloss = []
for i, ko in enumerate(list(set(df_ko.KO))):
    if (i%100==0): print(i)  # progress indicator every 100 KOs
    # State rows belonging to the current KO.
    df_KO_species_state_ext = df_KO_species_state[df_KO_species_state['metabolicKO']==ko]
    # Right-merge onto the edge list so every edge is kept; a child without a row
    # for this KO yields NaN, which fillna(0) turns into state 0. The result
    # keeps 'clade', 'child' and the child's state renamed to 'child_state'.
    df_KO_species_state_ext_merged = pd.merge(df_KO_species_state_ext,df_parent_child, left_on = 'species', right_on = 'child', how = 'right').fillna(0).drop(columns = ['KO', 'metabolicKO', 'species', 'extant']).rename(columns  = {'state':'child_state'})
    # Same lookup for the parent end of each edge; 'state' is now the parent's state.
    df_KO_species_state_ext_merged = pd.merge(df_KO_species_state_ext, df_KO_species_state_ext_merged, left_on = 'species', right_on = 'clade', how = 'right').fillna(0).drop(columns=['metabolicKO', 'extant', 'species'])
    # fillna(0) also zeroed 'KO' on edges whose parent had no row; restore it.
    df_KO_species_state_ext_merged['KO'] = ko
    gain_count = len(df_KO_species_state_ext_merged[(df_KO_species_state_ext_merged['state'] == 0) & (df_KO_species_state_ext_merged['child_state'] == 1)])
    loss_count = len(df_KO_species_state_ext_merged[(df_KO_species_state_ext_merged['state'] == 1) & (df_KO_species_state_ext_merged['child_state'] == 0)])
    list_ko_Ngain_Nloss.append([ko, gain_count, loss_count])

# One row per KO, written as a tab-separated table with a header line.
df_ko_Ngain_Nloss = pd.DataFrame(list_ko_Ngain_Nloss, columns = ['KO','Ngain', 'Nloss'])
df_ko_Ngain_Nloss.to_csv("table/ko_Ngain_Nloss.mlgtdb_MPPA.txt", sep = '\t', index= False)

#### Count number of gains and losses

In [None]:
# Reload the per-KO gain/loss counts and attach the category of every KO that
# has exactly one; the left join keeps all KOs, leaving the others without a category.
df_ko_Ngain_Nloss = (
    pd.read_table("table/ko_Ngain_Nloss.mlgtdb_MPPA.txt")
    .merge(df_uniquecategory_ko, on = "KO", how = "left")
)
df_ko_Ngain_Nloss

In [None]:
# KOs with at least five losses.
has_five_or_more_losses = df_ko_Ngain_Nloss["Nloss"] >= 5
df_ko_Ngain_Nloss.loc[has_five_or_more_losses]

In [None]:
# Gains vs. losses per KO, coloured by local point density.
x = df_ko_Ngain_Nloss.Ngain
y = df_ko_Ngain_Nloss.Nloss
xy = np.vstack([x,y])
density = gaussian_kde(xy)
z = density(xy)

# Order the points by ascending density so the densest ones are drawn last.
idx = np.argsort(z)
x, y, z = x.iloc[idx], y.iloc[idx], z[idx]

fig = plt.figure(figsize=(2,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
# Diagonal reference line (#gains == #losses).
ax.plot([-200, 500], [-200, 500], '-', lw=0.5, alpha = 0.5, color = '#555555')
ax.scatter(x = x, y = y, c = z, s = 1,alpha = 0.5,cmap = 'jet')
ax.set_xlim(-10,320)
ax.set_ylim(-10,320)
ax.set_xlabel("#gains")
ax.set_ylabel("#losses")
ax.set_xticks([0,100,200,300])
fig.savefig("figures/NK_M0150_Ngain_Nloss.pdf",bbox_inches = 'tight')

# Paired t-test between the per-KO gain and loss counts.
ttest_rel(x, y)

In [None]:
# by functional category
# Categories are visited in sorted order so that the printed counts and the
# figures are produced in the same order on every run (a set has no fixed order).
for category in sorted(set(df_uniquecategory_ko["category"])):
    df_ko_Ngain_Nloss_ext = df_ko_Ngain_Nloss[df_ko_Ngain_Nloss["category"] == category].reset_index()

    print(len(df_ko_Ngain_Nloss_ext))

    # A category without any KO in the gain/loss table has nothing to plot, and
    # gaussian_kde cannot be fitted to an empty sample.
    if df_ko_Ngain_Nloss_ext.empty:
        continue

    # Plain NumPy arrays make the density-based reordering below purely
    # positional, independent of the frame's index labels.
    x = df_ko_Ngain_Nloss_ext.Ngain.to_numpy()
    y = df_ko_Ngain_Nloss_ext.Nloss.to_numpy()
    xy = np.vstack([x,y])
    try:
        z = gaussian_kde(xy)(xy)
    except (ValueError, np.linalg.LinAlgError):
        # Too few or degenerate points (e.g. all identical) give a singular
        # covariance; draw them with a uniform colour instead of aborting the loop.
        z = np.zeros(len(x))

    # Draw the densest points last.
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]

    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    # Diagonal reference line (#gains == #losses).
    ax.plot([-200, 500], [-200, 500], '-', lw=0.5, alpha = 0.5, color = '#555555')
    ax.scatter(x = x, y = y, c = z, s = 1,alpha = 0.5,cmap = 'jet')
    ax.set_xlim(-10,320)
    ax.set_ylim(-10,320)
    ax.set_title(category)
    ax.set_xlabel("#gains")
    ax.set_ylabel("#losses")
    ax.set_xticks([0,100,200,300])
    fig.savefig("figures/NK_M0150_Ngain_Nloss_"+category+".pdf",bbox_inches = 'tight')
    # Close each figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Distinct category labels present in the gain/loss table.
{category_label for category_label in df_ko_Ngain_Nloss["category"]}

In [None]:
# Gain/loss table ordered by ascending number of gains.
df_ko_Ngain_Nloss.sort_values(by = 'Ngain')

In [None]:
# Map each ID in the merged taxonomy table to its phylum (a repeated ID keeps its last phylum).
sp2phylum = dict(zip(df_ID_tax_merge.ID, df_ID_tax_merge.Phylum))

In [None]:
list_internal_node = tree_ml.get_nonterminals()

# Label every internal node of the 'mlgtdb' tree. The reversed list is walked so
# that a node's children are labelled before the node itself (this relies on
# get_nonterminals() listing a parent before its descendants). A node inherits
# the label of its children when they all agree, otherwise it is marked
# 'upstream'. All children are compared, not only the first two, so
# multifurcating and single-child nodes are handled as well.
for node in reversed(list_internal_node):
    child_phyla = {sp2phylum[child.name] for child in node.clades}
    if len(child_phyla) == 1:
        sp2phylum[node.name] = child_phyla.pop()
    else:
        sp2phylum[node.name] = 'upstream'

# Flatten the mapping (tips and internal nodes) into a two-column table.
list_sp_phylum = [[sp, phylum] for sp, phylum in sp2phylum.items()]
df_sp_phylum = pd.DataFrame(list_sp_phylum, columns = ['species', 'phylum'])

In [None]:
# Write the node -> phylum table as tab-separated text without header or index.
df_sp_phylum.to_csv(
    "table/node_phylum.mlgtdb_MPPA.txt",
    sep = '\t',
    header = False,
    index = False,
)

In [None]:
# Rows of the state table that belong to tips of the tree.
is_extant_row = df_KO_species_state['extant']
df_KO_species_state_extant = df_KO_species_state.loc[is_extant_row]
df_KO_species_state_extant


#### Presence/Absence iTOL label

In [None]:
# Write one iTOL colour-strip file per KO marking every tip as present or absent.
# The files go into itol/ko_ex, which is not created anywhere earlier in this
# notebook, so make sure it exists before the first open() call.
os.makedirs("itol/ko_ex", exist_ok=True)

color_absent = "#FEF9B9"
color_present = "#4C4393"

# A single groupby pass yields the tip names per KO, replacing a boolean filter
# over the whole table for every KO.
# NOTE(review): a tip counts as "present" when it has any row for the KO,
# whatever the value of 'state' -- confirm the table only lists tips that carry the KO.
for KO, species_with_ko in df_KO_species_state_extant.groupby('KO')['species']:
    possessing_species_set = set(species_with_ko)

    with open("itol/ko_ex/annotation.ex."+KO+".txt", 'w') as handle:
        handle.write(
            "DATASET_COLORSTRIP\nSEPARATOR SPACE\nDATASET_LABEL "+KO+"\nCOLOR #ff0000\nDATA\n"
        )

        # One line per tip, in tree order.
        for species in list_extant_species:
            if species in possessing_species_set:
                handle.write(species + " " + color_present + " present\n")
            else:
                handle.write(species + " " + color_absent + " absent\n")