# Configure KBase Jupyter Dev Environment
<sub><sup>(contact chenry@anl.gov with questions)</sup></sub>

In [1]:
import platform
print("python version " + platform.python_version())
import sys
import json
from json import dump
import os
import re
import pandas as pd
from pandas import DataFrame, read_csv, concat, set_option
from os.path import exists
from pathlib import Path
import logging
import shutil
import requests
from configparser import ConfigParser

# Per-user KBase settings live in ~/.kbase/config; seed the file from the shared
# copy when the user does not have one yet.
config_path = Path.home() / ".kbase" / "config"
shared_config_path = Path("/scratch/shared/code/sharedconfig.cfg")

config = ConfigParser()
if not config_path.exists():
    if shared_config_path.exists():
        # shutil.copyfile does not create missing parent directories, so make
        # sure ~/.kbase exists before copying into it.
        config_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(shared_config_path, config_path)
    else:
        print("You must create a config file in ~/.kbase/config before running this notebook. See instructions: https://docs.google.com/document/d/1fQ6iS_uaaZKbjWtw1MgzqilklttIibNO9XIIJWgxWKo/edit")
        sys.exit(1)
config.read(config_path)

# "syspaths" is a ';'-separated list; entries that do not start with '/' are
# resolved against the configured codebase directory.
paths = config.get("DevEnv","syspaths").split(";")
codebase = config.get("DevEnv","codebase",fallback="")
for i,filepath in enumerate(paths):
    if filepath[0:1] != "/":
        paths[i] = codebase+"/"+filepath
# Prepend the configured paths, dropping copies that are already present so
# that re-running this cell does not keep growing sys.path.
sys.path = paths + [entry for entry in sys.path if entry not in paths]

from chenry_utility_module.kbdevutils import KBDevUtils
kbdevutil = KBDevUtils("Ontology")
from modelseedpy import MSPackageManager, MSModelUtil, MSBuilder, MSATPCorrection, MSGapfill, MSGrowthPhenotype, MSGrowthPhenotypes, ModelSEEDBiochem
from modelseedpy.core.annotationontology import convert_to_search_role, split_role
from modelseedpy.core.mstemplate import MSTemplateBuilder
from modelseedpy.helpers import get_template
# Notebook-level logger at INFO verbosity.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Helper objects obtained from KBDevUtils and reused by later cells.
# (The logger setup and the msseedrecon() call used to be repeated twice in a
# row; one copy is enough.)
msrecon = kbdevutil.msseedrecon()
annoapi = kbdevutil.anno_client(native_python_api=True)

# Lookup table used by trans_ec: each key is an EC number that has been
# replaced, and its value is the EC number to use instead.
obsolete_ec_path = kbdevutil.codebase+"/cb_annotation_ontology_api/data/obsolete_ec.json"
with open(obsolete_ec_path) as obsolete_ec_file:
    obs_ec = json.load(obsolete_ec_file)

def trans_ec(ec, obsolete_map=None, max_hops=20):
    """Follow the obsolete-EC table until a current EC number is reached.

    Parameters
    ----------
    ec : the EC number to translate.
    obsolete_map : optional mapping of obsolete EC -> replacement EC. When
        omitted, the notebook-level ``obs_ec`` table is used.
    max_hops : number of loop passes allowed before the chain is treated as
        circular (default 20, i.e. at most 19 replacements are followed).

    Returns
    -------
    The final replacement EC number, ``ec`` itself when it is not listed in
    the table, or the ORIGINAL ``ec`` when the chain does not terminate
    within ``max_hops`` passes (circular reference).
    """
    if obsolete_map is None:
        obsolete_map = obs_ec
    original_ec = ec
    hops = 0
    while ec in obsolete_map:
        hops += 1
        if hops >= max_hops:
            # Most likely a circular reference: give up and hand back the input.
            return original_ec
        ec = obsolete_map[ec]
    return ec

python version 3.9.13
KBBaseModules 0.0.1
Output files printed to:/Users/chenry/workspace/Notebooks//Ontology//sessions/default/output when using KBDevUtils.output_dir
modelseedpy 0.3.3
cobrakbase 0.3.1


# Managing sessions

In [None]:
# Session-management cheat sheet: uncomment the call you need.
# Listing all sessions
#print(kbdevutil.list_sessions())
# Changing the current session
#kbdevutil.set_session("published_biolog")
# Printing the current session
#print(kbdevutil.session)
# Printing whatever kbdevutil.list() returns
# (presumably the names of the objects saved in the current session - confirm)
print(kbdevutil.list())
# Loading and printing the saved "model_mapping" object
# NOTE(review): no cell visible in this notebook saves "model_mapping";
# presumably it was written by another notebook or session - confirm.
print(kbdevutil.load("model_mapping"))

# Loading reaction data

In [27]:

# Reference biochemistry whose reactions are summarised below.
biochem = ModelSEEDBiochem.get()

# reaction id -> reason the reaction was filtered
filtered_reactions = pd.read_csv(kbdevutil.codebase+"/cb_annotation_ontology_api/data/FilteredReactions.csv",sep="\t")
filtered_reaction_hash = dict(zip(filtered_reactions["id"], filtered_reactions["reason"]))

# One summary record per reaction; EC numbers are translated through trans_ec
# and stored without duplicates.
msrxn_data = {}
for rxn in biochem.reactions:
    rxn_record = {
        "id":rxn.id,
        "name":rxn.name,
        "equation":rxn.build_reaction_string(use_metabolite_names=True),
        "ec":[],
        "filtered":filtered_reaction_hash.get(rxn.id)
    }
    for raw_ec in rxn.ec_numbers:
        current_ec = trans_ec(raw_ec)
        if current_ec not in rxn_record["ec"]:
            rxn_record["ec"].append(current_ec)
    msrxn_data[rxn.id] = rxn_record

# Add the EC numbers listed in the reaction/EC alias file for known reactions.
reaction_ec = pd.read_csv(kbdevutil.config["data"]+"/ModelSEEDDatabase/Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt",sep="\t")
for ms_id, external_ec in zip(reaction_ec["ModelSEED ID"], reaction_ec["External ID"]):
    if ms_id not in msrxn_data:
        continue
    current_ec = trans_ec(external_ec)
    if current_ec not in msrxn_data[ms_id]["ec"]:
        msrxn_data[ms_id]["ec"].append(current_ec)

kbdevutil.save("msrxn_data",msrxn_data)

# Loading Rhea data

In [28]:
rhea_data = {}
msrxn_data = kbdevutil.load("msrxn_data")

def _rhea_entry(raw_id):
    """Return the rhea_data record for raw_id, creating an empty one if needed.

    The ID is converted with str() before it is used as the dictionary key.
    """
    rhea_id = str(raw_id)
    if rhea_id not in rhea_data:
        rhea_data[rhea_id] = {
            "id":rhea_id,
            "ec":[],
            "name":None,
            "genes":[],
            "msrxn":[]
        }
    return rhea_data[rhea_id]

def _add_translated_ec(entry, ec):
    """Translate ec with trans_ec and add it to entry["ec"] unless already listed."""
    ec = trans_ec(ec)
    if ec not in entry["ec"]:
        entry["ec"].append(ec)

#Loading GO terms to get names for rhea IDs because GO has a single reaction resolution that corresponds to Rhea
#A problem with this is that not every Rhea ID has a GO term; also, we are assuming the GO mappings are right, and they may not be
#(the path separator after codebase was missing here, unlike every other data path in this notebook)
with open(kbdevutil.codebase+'/cb_annotation_ontology_api/data/GO_dictionary.json') as json_file:
    go_dictionary = json.load(json_file)

#Here I'm also reading in the EC mappings for Rhea so I can get good EC numbers for the Rhea IDs
#The translated EC is what gets stored (previously the untranslated value was appended after the check)
ec_trans = pd.read_csv(kbdevutil.config["data"]+"/TemplateFunctions/rhea2ec.tsv",sep="\t")
for [i,row] in ec_trans.iterrows():
    _add_translated_ec(_rhea_entry(row["RHEA_ID"]), row["ID"])

#Here I load alternative EC mappings for Rhea so I can get good EC numbers for the Rhea IDs
ec_trans = pd.read_csv(kbdevutil.config["data"]+"/TemplateFunctions/rhea-ec-iubmb.tsv",sep="\t")
for [i,row] in ec_trans.iterrows():
    _add_translated_ec(_rhea_entry(row["RHEA_ID"]), row["EC"])

#Here I use the GO mappings to assign names to the Rhea IDs
#If a Rhea ID appears on several rows, the last row determines the name
go_trans = pd.read_csv(kbdevutil.config["data"]+"/TemplateFunctions/rhea2go.tsv",sep="\t")
for [i,row] in go_trans.iterrows():
    entry = _rhea_entry(row["RHEA_ID"])
    entry["name"] = row["ID"]
    if row["ID"] in go_dictionary["term_hash"]:
        entry["name"] += ":"+go_dictionary["term_hash"][row["ID"]]["name"]

#Linking Rhea IDs to ModelSEED reactions through the alias table, and borrowing the
#ModelSEED name (when the Rhea record has none) and EC numbers
ms_aliases = pd.read_csv(kbdevutil.codebase+"/cb_annotation_ontology_api/data/ModelSEED_Reaction_Aliases.txt",sep="\t")
for [i,row] in ms_aliases.iterrows():
    #NOTE(review): this only tests that "Source" is truthy; if the intent was to
    #restrict to Rhea aliases, an explicit comparison is needed - confirm
    if row["Source"]:
        if row["External ID"] in rhea_data:
            entry = rhea_data[row["External ID"]]
            msrxn = row["ModelSEED ID"]
            if msrxn not in entry["msrxn"]:
                entry["msrxn"].append(msrxn)
            if msrxn in msrxn_data:
                if msrxn_data[msrxn]["name"] and not entry["name"]:
                    entry["name"] = msrxn_data[msrxn]["name"]
                for ec in msrxn_data[msrxn]["ec"]:
                    _add_translated_ec(entry, ec)

#TODO: We need a new more robust naming procedure for Rhea reactions
#Step one, if the Rhea is mapped to a ModelSEED reaction with a name, use that name
#Step two, if there is no MS rxn or the MS rxn has no name or just an rxn ID or Rhea ID for a name, do the following:
#Take the "activity" from the first number in the EC number (e.g. 1 = oxidoreductase, 2 = transferase, etc.)
#Then print the reactant list and product list with the activity of the first EC (e.g. Glucose-6-phosphate hydrolase)
#Possibly explore filtering out obvious cofactors from the lists above (e.g. O2, H+, H2O)

kbdevutil.save("rhea_data",rhea_data)

# Loading SSO

In [29]:
sso_data = {}
msrxn_data = kbdevutil.load("msrxn_data")

sso_data_dir = kbdevutil.codebase+"/cb_annotation_ontology_api/data"
with open(sso_data_dir+"/SSO_dictionary.json") as dictionary_file:
    sso = json.load(dictionary_file)

with open(sso_data_dir+"/SSO_reactions.json") as reactions_file:
    sso_rxns = json.load(reactions_file)

# One record per SSO term, keyed by the term ID with its first four characters
# (the "SSO:" prefix) removed.
for full_id in sso["term_hash"]:
    short_id = full_id[4:]
    term_name = sso["term_hash"]["SSO:"+short_id]["name"]
    term_record = {
        "id":short_id,
        "name":term_name,
        "ec":[],
        "genes":[],
        "msrxn":[],
        "class":None
    }
    if short_id == "000009137":
        term_record["class"] = "hypothetical"
    # An EC number embedded in the term name (if any) becomes the first EC entry.
    name_ec = re.search(r'(\d+\.[\d-]+\.[\d-]+\.[\d-]+)',term_name)
    if name_ec:
        term_record["ec"].append(trans_ec(name_ec.group(0)))
    sso_data[short_id] = term_record

# Attach the mapped reactions and inherit the EC numbers of those reactions.
for sso_id, mapped_rxns in sso_rxns.items():
    if sso_id not in sso_data:
        continue
    term_record = sso_data[sso_id]
    for rxn in mapped_rxns:
        if rxn not in term_record["msrxn"]:
            term_record["msrxn"].append(rxn)
        if rxn not in msrxn_data:
            continue
        for ec in msrxn_data[rxn]["ec"]:
            if ec not in term_record["ec"]:
                term_record["ec"].append(ec)

kbdevutil.save("sso_data",sso_data)

# Build KO hash

In [30]:
ko_data = {}

with open(kbdevutil.codebase+"/cb_annotation_ontology_api/data/KO_dictionary.json") as json_file:
    kodict = json.load(json_file)

# One record per KO term, keyed by the term's "id" field.
for ko in kodict["term_hash"]:
    ko_id = kodict["term_hash"][ko]["id"]
    # Test the key that is actually written (the term id), not the dictionary
    # key, so the existence check and the assignment agree.
    if ko_id not in ko_data:
        ko_data[ko_id] = {
            "id": ko_id,
            "name": kodict["term_hash"][ko]["name"],
            "ec":[],
            "genes":[],
            "msrxn":[]
        }

korxn = pd.read_csv(kbdevutil.codebase+"/cb_annotation_ontology_api/data/kegg_95_0_ko_seed.tsv",sep="\t")

# Attach the reaction mapping to each known KO.
# NOTE(review): the column is called "seed_ids" (plural); if a cell can hold
# several IDs in one string it should be split before being stored - confirm
# against the file format.
for index, row in korxn.iterrows():
    if row["ko_id"] in ko_data:
        seed_ids = row["seed_ids"]
        # Skip missing values and, as in the other ontology cells, avoid
        # storing the same mapping twice.
        if pd.isna(seed_ids):
            continue
        if seed_ids not in ko_data[row["ko_id"]]["msrxn"]:
            ko_data[row["ko_id"]]["msrxn"].append(seed_ids)

kbdevutil.save("ko_data",ko_data)

# Build EC hash

In [31]:
ec_data = {}

with open(kbdevutil.codebase+"/cb_annotation_ontology_api/data/EC_dictionary.json") as json_file:
    ecdict = json.load(json_file)

# One record per term in the EC dictionary, keyed by the term's "id" field.
for ec in ecdict["term_hash"]:
    ec_id = ecdict["term_hash"][ec]["id"]
    # Test the key that is actually written (the term id), not the dictionary key.
    if ec_id not in ec_data:
        ec_data[ec_id] = {
            "id": ec_id,
            "name": ecdict["term_hash"][ec]["name"],
            # NOTE(review): this stores the term_hash key, which is only the
            # same string as "id" if the dictionary keys carry no prefix - confirm.
            "ec":[ec],
            "dram_genes":[],
            "prokka_genes":[],
            "msrxn":[],
            "rhea":[],
            "sso":[],
            "ko":[]
        }

rhea_data = kbdevutil.load("rhea_data")
sso_data = kbdevutil.load("sso_data")
msrxn_data = kbdevutil.load("msrxn_data")
ko_data = kbdevutil.load("ko_data")

all_data = {
    "rhea":rhea_data,
    "sso":sso_data,
    "msrxn":msrxn_data,
    "ko":ko_data
}

# Cross-reference: for every EC number listed on a term of any ontology, record
# that term's ID under the EC entry (creating the EC entry when it is unknown).
# The loop variables are deliberately NOT called "type"/"id": in a notebook those
# names would shadow the builtins for every cell executed afterwards.
for ontology in all_data:
    for term_id in all_data[ontology]:
        for ec in all_data[ontology][term_id]["ec"]:
            if ec not in ec_data:
                ec_data[ec] = {"id":ec,"name":None,"ec":[ec],"rhea":[],"sso":[],"msrxn":[],"ko":[],"dram_genes":[],"prokka_genes":[]}
            ec_data[ec][ontology].append(term_id)

#TODO: confirm that EC terms now have ec lists in them

kbdevutil.save("ec_data",ec_data)

# Pulling and printing ontology terms

In [32]:
#See this object in this narrative: https://narrative.kbase.us/narrative/154984
output = annoapi.get_annotation_ontology_events({
    "input_ref" : "154984/SwissProtCuratedProteins.RAST.Prokka.DRAM.Rhea"
})
kbdevutil.save("swiss_prot_anno",output)

# Ordered rules for recognising an event: (event field, required prefix, key
# under which the event's terms are stored for each gene). The first matching
# rule wins. Note that the Rhea rule inspects "id" while the others inspect
# "event_id".
event_rules = (
    ("event_id","RAST","sso"),
    ("event_id","DRAM:KO","ko"),
    ("event_id","DRAM:EC","dec"),
    ("id","RHEA","rhea"),
    ("event_id","Prokka","pec"),
)

annotations_by_gene = {}
for event in output["events"]:
    for field, prefix, source_key in event_rules:
        if event[field][0:len(prefix)] == prefix:
            for gene, gene_terms in event["ontology_terms"].items():
                annotations_by_gene.setdefault(gene, {})[source_key] = gene_terms
            break

rhea_data = kbdevutil.load("rhea_data")
sso_data = kbdevutil.load("sso_data")
msrxn_data = kbdevutil.load("msrxn_data")
ko_data = kbdevutil.load("ko_data")
ec_data = kbdevutil.load("ec_data")
all_data = {
    "rhea":rhea_data,
    "sso":sso_data,
    "msrxn":msrxn_data,
    "ko":ko_data,
    "ec":ec_data
}

# Record, on every ontology term, the genes annotated with it. DRAM ("dec") and
# Prokka ("pec") EC annotations both land in the EC table, in separate gene lists.
ec_gene_fields = {"dec":"dram_genes","pec":"prokka_genes"}
for gene, gene_sources in annotations_by_gene.items():
    for source, source_items in gene_sources.items():
        for item in source_items:
            # Keep only the part between the first and second ':' of the term.
            term = item["term"].split(":")[1]
            if source in all_data:
                term_table = all_data[source]
                gene_field = "genes"
            elif source in ec_gene_fields:
                term_table = all_data["ec"]
                gene_field = ec_gene_fields[source]
            else:
                print("Unknown source:",source)
                continue
            if term not in term_table:
                print("Term not founds:",term,"in",source)
            elif gene not in term_table[term][gene_field]:
                term_table[term][gene_field].append(gene)

for object_name, object_data in (
    ("rhea_data",rhea_data),
    ("sso_data",sso_data),
    ("msrxn_data",msrxn_data),
    ("ko_data",ko_data),
    ("ec_data",ec_data),
):
    kbdevutil.save(object_name,object_data)

kbdevutil.save("annotations_by_gene",annotations_by_gene)

Term not founds: 19849 in rhea
Term not founds: 75299 in rhea
Term not founds: 73767 in rhea
Term not founds: 73775 in rhea
Term not founds: 73779 in rhea
Term not founds: 73783 in rhea
Term not founds: 73787 in rhea
Term not founds: 73791 in rhea
Term not founds: 73795 in rhea
Term not founds: 73799 in rhea
Term not founds: 73803 in rhea
Term not founds: 73771 in rhea
Term not founds: K24570 in ko
Term not founds: 55932 in rhea
Term not founds: 64636 in rhea
Term not founds: 64672 in rhea
Term not founds: 64676 in rhea
Term not founds: 64680 in rhea
Term not founds: 64632 in rhea
Term not founds: 64644 in rhea
Term not founds: 64672 in rhea
Term not founds: 74867 in rhea
Term not founds: 46976 in rhea
Term not founds: 69488 in rhea
Term not founds: 69492 in rhea
Term not founds: 69496 in rhea
Term not founds: 72751 in rhea
Term not founds: 62224 in rhea
Term not founds: 62212 in rhea
Term not founds: 62220 in rhea
Term not founds: 62216 in rhea
Term not founds: 62228 in rhea
Term not 

# Creating domain specific gene lists

In [4]:
# Workspace reference of the protein set for each taxonomic domain.
domain_specific_lists = {
    "Fungi" : "154984/SwissProt_Rhea_Fungi",
    "Other" : "154984/SwissProt_Rhea_Other",
    "Viridiplantae" : "154984/SwissProt_Rhea_Viridiplantae",
    "Archaea" : "154984/SwissProt_Rhea_Archaea",
    "Bacteria" : "154984/SwissProt_Rhea_Bacteria",
    "Metazoa" : "154984/SwissProt_Rhea_Metazoa"
}

# protein id -> domain; when an id appears in several sets, the set listed last wins.
domain_proteins = {}
for domain, object_ref in domain_specific_lists.items():
    sequences = kbdevutil.get_object(object_ref)["data"]["sequences"]
    domain_proteins.update({sequence["id"]: domain for sequence in sequences})

kbdevutil.save("domain_proteins",domain_proteins)

# Print ontology names for semantic comparison

In [16]:
annotations_by_gene = kbdevutil.load("annotations_by_gene")

# DRAM ("dec") and Prokka ("pec") EC annotations are both looked up in the EC
# table; it is only read in this cell, so load it once and share it.
ec_lookup = kbdevutil.load("ec_data")
all_data = {
    "rhea":kbdevutil.load("rhea_data"),
    "sso":kbdevutil.load("sso_data"),
    "msrxn":kbdevutil.load("msrxn_data"),
    "ko":kbdevutil.load("ko_data"),
    "dec":ec_lookup,
    "pec":ec_lookup
}

#TODO: print the domain of each gene (maybe)
#TODO: currently this prints a row for each gene - perhaps change to printing a row for each unique combination of two ontology names and list genes instead of a single name

def _display_name(entry, term):
    """Return the printable name of an ontology record.

    Uses the record's "name" (or the bare term when the name is empty) and
    appends the record's EC numbers in parentheses when it has any.
    """
    name = entry["name"]
    if not name:
        name = term
    if "ec" in entry and len(entry["ec"]) > 0:
        name += " ("+";".join(entry["ec"])+")"
    return name

def _overlap_status(entry, other, key, noun, term, oterm):
    """Describe whether two ontology records share a value in their `key` list.

    Returns "Yes" when any value of entry[key] is also in other[key], "No" when
    both lists are populated but disjoint, and a "No <noun> associated with
    <term>" message naming whichever record has no values for `key`.
    """
    if key not in entry or len(entry[key]) == 0:
        return "No "+noun+" associated with "+term
    found = "No"
    for value in entry[key]:
        if key in other and value in other[key]:
            found = "Yes"
        elif key not in other or len(other[key]) == 0:
            found = "No "+noun+" associated with "+oterm
    return found

# One output row per gene and per pair of terms coming from two different sources.
records = {"Term1":[],"Term2":[],"Name1":[],"Name2":[],"Source1":[],"Source2":[],"Gene":[],"ReactionMatch":[],"ECMatch":[]}
for gene in annotations_by_gene:
    sources = list(annotations_by_gene[gene].keys())
    for i in range(len(sources)):
        for j in range(i+1,len(sources)):
            for item in annotations_by_gene[gene][sources[i]]:
                for oitem in annotations_by_gene[gene][sources[j]]:
                    # Keep only the part between the first and second ':' of each term.
                    term = item["term"].split(":")[1]
                    oterm = oitem["term"].split(":")[1]
                    entry = all_data[sources[i]].get(term)
                    other = all_data[sources[j]].get(oterm)

                    if entry is None:
                        # First term unknown: nothing can be compared.
                        name = term
                        rxn_status = term+" not found"
                        ec_status = term+" not found"
                    else:
                        name = _display_name(entry, term)
                        if other is None:
                            rxn_status = oterm+" not found"
                            ec_status = oterm+" not found"
                        else:
                            rxn_status = _overlap_status(entry, other, "msrxn", "rxn", term, oterm)
                            ec_status = _overlap_status(entry, other, "ec", "EC", term, oterm)
                    if other is None:
                        oname = oterm
                    else:
                        oname = _display_name(other, oterm)

                    records["Term1"].append(item["term"])
                    records["Term2"].append(oitem["term"])
                    records["Source1"].append(sources[i])
                    records["Source2"].append(sources[j])
                    records["Gene"].append(gene)
                    records["Name1"].append(name)
                    records["Name2"].append(oname)
                    records["ReactionMatch"].append(rxn_status)
                    records["ECMatch"].append(ec_status)

df = pd.DataFrame.from_dict(records)
df.to_csv(kbdevutil.output_dir+"/annotation_pairs.csv",index=False)

# Printing annotation comparison table

In [7]:
from tqdm import tqdm

# Builds, for each annotation source, a per-gene table comparing that source's
# annotations with the Rhea annotations, and tallies a match class per domain.
#
# NOTE(review): class_counts, filtered_reaction_count (including its "Rhea"
# key), ec_hash, get_rhea_data and get_ms_data are not defined in any cell
# visible above; this cell relies on them already existing in the kernel.
# NOTE(review): the source names used here ("Rhea", "RAST", "DRAM_KO", ...)
# differ from the keys the "Pulling and printing ontology terms" cell writes
# into annotations_by_gene ("rhea", "sso", "ko", "dec", "pec") - confirm which
# annotations_by_gene this cell is meant to read.
#
# The source list is deliberately not called "list": that would shadow the
# builtin that the annotation-pairs cell calls.
annotation_sources = ["RAST", "DRAM_KO", "DRAM_EC", "Prokka"]
all_data = {"Rhea":rhea_data}
for current in annotation_sources:
    data = []
    class_counts[current] = {}
    filtered_reaction_count[current] = 0
    all_data[current] = {}
    for gene in tqdm(annotations_by_gene):
        if gene[-4:] == ".CDS":
            continue
        current_row = {"Gene": gene,"Domain": "None","Class":"Missmatch","Extra_SP":0,"Extra_"+current:0,"SP_RHID": None,"SP_EC":None,"SP_RHGO":None,"SP_MSID": None,"SP_MSName":None,"SP_MSEquation":None,current: None,current+"_MSID": None,current+"_MSName": None,current+"_MSEquation": None,current+"_MSEC": None}
        data.append(current_row)
        spmsid = {}       # Rhea-side ModelSEED reaction id -> matched by `current`?
        sp_ec = []        # EC numbers coming from the Rhea annotations of this gene
        current_ec = []   # EC numbers coming from the `current` annotations of this gene
        if gene in domain_proteins:
            current_row["Domain"] = domain_proteins[gene]
        if current_row["Domain"] not in class_counts[current]:
            class_counts[current][current_row["Domain"]] = {}
        if "Rhea" in annotations_by_gene[gene]:
            for item in annotations_by_gene[gene]["Rhea"]:
                current_rhea = get_rhea_data(item["term"])
                for i,ec in enumerate(current_rhea["ec"]):
                    current_rhea["ec"][i] = trans_ec(ec)
                    # sp_ec was previously never filled, so "ec match" could
                    # never be assigned and every mismatch became "No rhea EC".
                    if current_rhea["ec"][i] not in sp_ec:
                        sp_ec.append(current_rhea["ec"][i])
                for column in ["GO","ID"]:
                    label = "SP_RH"+column
                    if current_row[label] and len(current_row[label]) > 0:
                        current_row[label] += "\n"
                    else:
                        current_row[label] = ""
                    if column.lower() in current_rhea:
                        if current_rhea[column.lower()]:
                            current_row[label] += current_rhea[column.lower()]
                current_ms = None
                current_rhea["genes"].append(gene)
                if "modelseed_ids" in item:
                    current_rhea["msrxn"].append(item["modelseed_ids"][0])
                    current_ms = get_ms_data(item["modelseed_ids"][0])
                    if item["modelseed_ids"][0] in filtered_reaction_hash and current == "RAST":
                        filtered_reaction_count["Rhea"] += 1
                    # NOTE(review): this loop discards its result; presumably the
                    # translated ModelSEED ECs were meant to be stored somewhere - confirm.
                    for ec in current_ms["ec"]:
                        ec = trans_ec(ec)
                for column in ["Name","Equation","ID"]:
                    label = "SP_MS"+column
                    if current_row[label] and len(current_row[label]) > 0:
                        current_row[label] += "\n"
                    else:
                        current_row[label] = ""
                    if current_ms:
                        spmsid[item["modelseed_ids"][0]] = False
                        if column.lower() in current_ms:
                            current_row[label] += current_ms[column.lower()]
                if current_row["SP_EC"] and len(current_row["SP_EC"]) > 0:
                    current_row["SP_EC"] += "\n"
                else:
                    current_row["SP_EC"] = ""
                # Append (not overwrite) so ECs of earlier Rhea terms of the
                # same gene are kept, like the other multi-line columns.
                current_row["SP_EC"] += "/".join(current_rhea["ec"])
        if current in annotations_by_gene[gene]:
            for item in annotations_by_gene[gene][current]:
                if item["term"] not in all_data[current]:
                    all_data[current][item["term"]] = {"name":None,"ec":[],"msrxn":[],"id":item["term"],"genes":[]}
                all_data[current][item["term"]]["genes"].append(gene)
                if current_row[current] and len(current_row[current]) > 0:
                    current_row[current] += "\n"
                else:
                    current_row[current] = ""
                current_row[current] += item["term"]
                if current_row[current] == "SSO:000009137":
                    current_row["Class"] = "hypothetical"
                if item["term"] in sso["term_hash"]:
                    all_data[current][item["term"]]["name"] = sso["term_hash"][item["term"]]["name"]
                    current_row[current] += ":"+sso["term_hash"][item["term"]]["name"]
                # First EC-number-like substring of the accumulated cell text.
                match = re.search(r'(\d+\.[\d-]+\.[\d-]+\.[\d-]+)',current_row[current])
                if match:
                    ec = match.group(0)
                    ec = trans_ec(ec)
                    current_ec.append(ec)
                    if ec not in all_data[current][item["term"]]["ec"]:
                        all_data[current][item["term"]]["ec"].append(ec)
                    if ec not in ec_hash:
                        ec_hash[ec] = {}
                    if current not in ec_hash[ec]:
                        ec_hash[ec][current] = {}
                    if item["term"] not in ec_hash[ec][current]:
                        ec_hash[ec][current][item["term"]] = True
                    if ec in sp_ec:
                        if current_row["Class"] == "Missmatch":
                            current_row["Class"] = "ec match"
                for column in ["Name","Equation","ID","EC"]:
                    label = current+"_MS"+column
                    if current_row[label] and len(current_row[label]) > 0:
                        current_row[label] += "\n"
                    else:
                        current_row[label] = ""
                    if "modelseed_ids" in item:
                        first = True
                        for msrxn in item["modelseed_ids"]:
                            all_data[current][item["term"]]["msrxn"].append(msrxn)
                            if msrxn in filtered_reaction_hash:
                                filtered_reaction_count[current] += 1
                            if msrxn in spmsid:
                                current_row["Class"] = "full match"
                                spmsid[msrxn] = True
                            elif column == "Name":
                                current_row["Extra_"+current] += 1
                            current_ms = get_ms_data(msrxn)
                            if column.lower() in current_ms:
                                if column == "EC":
                                    current_row[label] += "/".join(current_ms[column.lower()])
                                    for ec in current_ms[column.lower()]:
                                        ec = trans_ec(ec)
                                        if ec not in current_ec:
                                            current_ec.append(ec)
                                        if ec in sp_ec:
                                            if current_row["Class"] == "Missmatch":
                                                current_row["Class"] = "ec match"
                                else:
                                    if column == "Name":
                                        all_data[current][item["term"]]["name"] = current_ms["name"]
                                    current_row[label] += current_ms[column.lower()]
                            if not first:
                                current_row[current] += "\n"#Making sure SSOs align with MSRXNs
                            first = False
        else:
            current_row["Class"] = "Unannotated"
        if current_row["Class"] == "Missmatch":
            if len(sp_ec) == 0:
                current_row["Class"] = "No rhea EC"
            elif len(current_ec) == 0:
                current_row["Class"] = "No other EC"
        for msid in spmsid:
            if not spmsid[msid]:
                current_row["Extra_SP"] += 1
        if current_row["Class"] not in class_counts[current][current_row["Domain"]]:
            class_counts[current][current_row["Domain"]][current_row["Class"]] = 0
        class_counts[current][current_row["Domain"]][current_row["Class"]] += 1
    df = pd.DataFrame.from_records(data)
    df.to_csv(kbdevutil.out_dir()+current+"_annotations.tsv", sep='\t', index=False)
    # The class-count files are rewritten after every source, so they always
    # reflect the sources processed so far.
    with open(kbdevutil.out_dir()+"ClassCounts.json", 'w') as outfile:
        json.dump(class_counts, outfile)
    #Converting class_counts into dataframe for printing as a table
    all_records = []
    for anno in class_counts:
        for domain in class_counts[anno]:
            record = {"Domain":domain,"Algorithm":anno}
            for class_type in class_counts[anno][domain]:
                record[class_type] = class_counts[anno][domain][class_type]
            all_records.append(record)
    df = pd.DataFrame.from_records(all_records)
    df.to_csv(kbdevutil.out_dir()+"ClassCounts.tsv", sep='\t', index=False)

print("Filtered reaction counts:",str(filtered_reaction_count))

list = ["Rhea","RAST", "DRAM_KO", "DRAM_EC", "Prokka"]
def _strip_msrxn_prefix(rxn_id):
    """Return `rxn_id` without a leading "MSRXN:" prefix, if it has one."""
    return rxn_id[6:] if rxn_id[0:6] == "MSRXN:" else rxn_id


def _add_hit(rxn_candidates, msrxn, score, evidence, source=None):
    """Credit `score` to candidate reaction `msrxn` and count one `evidence` hit.

    rxn_candidates -- dict of reaction ID -> {"score", "hits", "genes"}; updated in place.
    source         -- None for evidence from the term itself (hits[evidence] is an int counter);
                      otherwise an "annotation|term" key (hits[evidence] is a dict of counters).
    """
    candidate = rxn_candidates.setdefault(msrxn, {"score":0,"hits":{},"genes":{}})
    candidate["score"] += score
    if source is None:
        candidate["hits"][evidence] = candidate["hits"].get(evidence, 0) + 1
    else:
        per_source = candidate["hits"].setdefault(evidence, {})
        # Test and store under the same key so repeated hits accumulate.
        per_source[source] = per_source.get(source, 0) + 1


def _append_unique(items, value):
    """Append `value` to the list `items` unless it is already present."""
    if value not in items:
        items.append(value)


# Build a scored table of candidate ModelSEED reactions for every term of the
# RAST annotation source and write it to <out_dir>/<source>_mappings.tsv.
# Evidence codes and the score each hit contributes:
#   D    (16) reaction listed directly on the term
#   E    (12) reaction reached through one of the term's EC numbers
#   Re   (10) reaction sharing an EC number with a direct reaction
#   Gd   (12) / Gnd  (3) reaction of a co-annotated term (same gene, other source),
#                        with / without an EC number shared with this term
#   Gre   (8) / Gnre (1) reaction sharing an EC with a co-annotated term's reaction,
#                        where that EC is / is not one of this term's EC numbers
#   Ge   (10) / Gne  (2) reaction reached through a co-annotated term's EC number
#                        that is / is not one of this term's EC numbers
for current in list:
    mapping_records = []
    if current != "RAST":
        continue
    for term in tqdm(all_data[current]):
        if term == "SSO:000009137":
            continue
        term_data = all_data[current][term]
        # Terms without EC numbers are treated as having none (avoids a KeyError below).
        term_ecs = term_data["ec"] if "ec" in term_data else ()
        rxn_candidates = {}
        rxn_list = []   # primary candidates, in discovery order
        next_list = []  # EC-related candidates, appended after the primary ones
        if "msrxn" in term_data:
            for msrxn in term_data["msrxn"]:
                msrxn = _strip_msrxn_prefix(msrxn)
                _add_hit(rxn_candidates, msrxn, 16, "D")
                _append_unique(rxn_list, msrxn)
                if msrxn in rxn_ec:
                    for ec in rxn_ec[msrxn]:
                        if ec in ec_hash and "MS" in ec_hash[ec]:
                            for related in ec_hash[ec]["MS"]:
                                related = _strip_msrxn_prefix(related)
                                _add_hit(rxn_candidates, related, 10, "Re")
                                _append_unique(next_list, related)
        for ec in term_ecs:
            if ec in ec_hash and "MS" in ec_hash[ec]:
                for msrxn in ec_hash[ec]["MS"]:
                    msrxn = _strip_msrxn_prefix(msrxn)
                    _add_hit(rxn_candidates, msrxn, 12, "E")
                    _append_unique(rxn_list, msrxn)
        for rxn in next_list:
            _append_unique(rxn_list, rxn)
        next_list = []
        if "genes" in term_data:
            for gene in term_data["genes"]:
                if gene not in annotations_by_gene:
                    continue
                for anno in annotations_by_gene[gene]:
                    if anno == current:
                        continue
                    for item in annotations_by_gene[gene][anno]:
                        newterm = item["term"]
                        if newterm[0:5] == "RHEA:":
                            newterm = newterm[5:]
                        # Terms unknown to the other annotation source contribute nothing.
                        if newterm not in all_data[anno]:
                            continue
                        other = all_data[anno][newterm]
                        other_ecs = other["ec"] if "ec" in other else ()
                        source = anno+"|"+newterm
                        # True when this term and the co-annotated term share an EC number.
                        ec_match = any(ec in other_ecs for ec in term_ecs)
                        if "msrxn" in other:
                            for msrxn in other["msrxn"]:
                                msrxn = _strip_msrxn_prefix(msrxn)
                                if ec_match:
                                    _add_hit(rxn_candidates, msrxn, 12, "Gd", source)
                                else:
                                    _add_hit(rxn_candidates, msrxn, 3, "Gnd", source)
                                _append_unique(rxn_list, msrxn)
                                if msrxn in rxn_ec:
                                    for ec in rxn_ec[msrxn]:
                                        if ec in ec_hash and "MS" in ec_hash[ec]:
                                            for related in ec_hash[ec]["MS"]:
                                                related = _strip_msrxn_prefix(related)
                                                if ec in term_ecs:
                                                    _add_hit(rxn_candidates, related, 8, "Gre", source)
                                                else:
                                                    _add_hit(rxn_candidates, related, 1, "Gnre", source)
                                                _append_unique(next_list, related)
                        for ec in other_ecs:
                            if ec in ec_hash and "MS" in ec_hash[ec]:
                                for msrxn in ec_hash[ec]["MS"]:
                                    msrxn = _strip_msrxn_prefix(msrxn)
                                    if ec in term_ecs:
                                        _add_hit(rxn_candidates, msrxn, 10, "Ge", source)
                                    else:
                                        _add_hit(rxn_candidates, msrxn, 2, "Gne", source)
                                    _append_unique(rxn_list, msrxn)
        for rxn in next_list:
            _append_unique(rxn_list, rxn)
        # One output row per non-obsolete candidate reaction.
        for msrxn in rxn_list:
            if msrxn in biochem.reactions and biochem.reactions.get_by_id(msrxn).is_obsolete:
                continue
            hits = rxn_candidates[msrxn]["hits"]
            record = {"Term":term,"Name":None,"EC":None,"Genes":None,"Gene count":0,"MSRXN":msrxn,"MSEC":None,"MSName":None,"Equation":None,"Score":rxn_candidates[msrxn]["score"],"Rhea":"","RAST":"", "DRAM_KO":"", "DRAM_EC":"", "Prokka":"","Rhea-M":"","RAST-M":"", "DRAM_KO-M":"", "DRAM_EC-M":"", "Prokka-M":""}
            evidence_list = ["D","E","Re","Gd","Ge","Gre","Gnd","Gne","Gnre"]
            if "D" in hits:
                record[current+"-M"] = 1
            # termhash: "annotation|term" -> {gene-based evidence code -> count}
            termhash = {}
            for evidence in evidence_list:
                if evidence not in hits:
                    continue
                if evidence[0:1] != "G":
                    record[current] += evidence+str(hits[evidence])
                else:
                    # NOTE(review): each source key occurs once per evidence code, so this
                    # count is always 1; the accumulated hits[evidence][source] value is
                    # not reported. Confirm whether the hit count was intended here.
                    for iterm in hits[evidence]:
                        by_evidence = termhash.setdefault(iterm, {})
                        by_evidence[evidence] = by_evidence.get(evidence, 0) + 1
            for fullterm in termhash:
                # Split on the first "|" only, in case the term itself contains one.
                (anno,iterm) = fullterm.split("|", 1)
                if record[anno] != "":
                    record[anno] += "/"
                if "Gd" in termhash[fullterm] or "Gnd" in termhash[fullterm]:
                    record[anno+"-M"] = 1
                record[anno] += iterm+":"+";".join(
                    evidence+str(n) for evidence, n in termhash[fullterm].items()
                )
            if "genes" in term_data:
                record["Gene count"] = len(term_data["genes"])
                record["Genes"] = ", ".join(term_data["genes"])
            if "name" in term_data and term_data["name"]:
                record["Name"] = term_data["name"]
            if "ec" in term_data:
                # Translate obsolete EC numbers, keeping first-seen order without duplicates.
                record["EC"] = ", ".join(dict.fromkeys(trans_ec(ec) for ec in term_data["ec"]))
            if msrxn in rxn_ec:
                record["MSEC"] = ", ".join(dict.fromkeys(trans_ec(ec) for ec in rxn_ec[msrxn].keys()))
            current_ms = get_ms_data(msrxn)
            if "name" in current_ms and current_ms["name"]:
                record["MSName"] = current_ms["name"]
            if "equation" in current_ms and current_ms["equation"]:
                record["Equation"] = current_ms["equation"]
            # Only terms that are actually assigned to genes are reported.
            if record["Gene count"] > 0:
                mapping_records.append(record)
    df = pd.DataFrame.from_records(mapping_records)
    df.to_csv(kbdevutil.out_dir()+current+"_mappings.tsv", sep='\t', index=False)

100%|██████████| 32100/32100 [00:01<00:00, 29979.48it/s]
100%|██████████| 32100/32100 [00:01<00:00, 31226.10it/s]
100%|██████████| 32100/32100 [00:00<00:00, 56964.56it/s]
100%|██████████| 32100/32100 [00:01<00:00, 18808.61it/s]


Filtered reaction counts: {'RAST': 0, 'DRAM_KO': 0, 'DRAM_EC': 0, 'Prokka': 0}


100%|██████████| 3076/3076 [1:21:35<00:00,  1.59s/it]  


# Testing the annotation ontology API

In [None]:
# Exercise add/get_annotation_ontology_events through a client built for the
# "appdev" workspace version. NOTE: this rebinds the notebook-wide `kbdevutil`.
kbdevutil = KBDevUtils("Ontology",ws_version="appdev")
appdev_annoapi = kbdevutil.anno_client(native_python_api=True)
# Every call below goes through `anno_api`; bind it to the client created above
# so the cell does not rely on a variable left over from another cell.
anno_api = appdev_annoapi
with open('debug.json') as json_file:
    input_data = json.load(json_file)
output = anno_api.add_annotation_ontology_events(input_data)
output = anno_api.get_annotation_ontology_events({
    "input_ref" : "102004/Methanosarcina_acetivorans_C2A_DRAM_RAST"
#    "input_ref" : "93487/Ruepo_2orMoreRKM"
#    "input_ref" : "77537/Sco_RAST_Prokka_BlastKOALA_PTools_DeepEC_DeepGO"
#    "input_ref" : "77537/Sco_Union_BestUnion_2plus_Best2plus_RASTKEGG"
#    "input_ref" : "77925/Pf5.6"#,
#    "input_workspace" : 
})
with open('output.json', 'w') as outfile:
    json.dump(output, outfile, indent=2)

# NOTE(review): `ontology` is read here but only assigned further down in this
# cell, so on a fresh kernel this raises NameError. Presumably it is expected
# from an earlier run or should be `output` - confirm before relying on it.
terms = ontology["events"][0]["ontology_terms"]
ontology["events"][0]["ontology_id"] = "SEED"
for gene in terms:
    # Only the first term of each gene is rewritten: tag it and drop the "PREFIX:" part.
    terms[gene][0]["evidence"] = "test"
    terms[gene][0]["term"] = terms[gene][0]["term"].split(":")[1]
    
# Save the rewritten events onto a copy of the genome in another workspace.
output = anno_api.add_annotation_ontology_events({
    "input_ref" : "GCF_000012265.1",
    "input_workspace" : 77925,
    "output_name" : "TestOntologyOutput",
    "events" : ontology["events"],
    "output_workspace": "kimbrel1:narrative_1606152384556",
    "save" : 1
})

# Read the saved object back and keep a JSON snapshot for inspection.
ontology = anno_api.get_annotation_ontology_events({
    "input_ref" : "TestOntologyOutput",
    "input_workspace" : "kimbrel1:narrative_1606152384556"
})

with open('/Users/chenry/output.json', 'w') as outfile:
    json.dump(ontology, outfile, indent=2)

#Escherichia_coli_K-12_MG1655
#Synechocystis_PCC_6803
#Methanosarcina_barkeri_Fusaro
#Clostridium_beijerinckii_NCIMB_8052
#Streptomyces_coelicolor_A3_2

# Minimal hand-written request: a single SSO event with one gene/term pair.
ontology_input = {
    "input_ref":"Streptomyces_coelicolor_A3_2",
    "input_workspace":"chenry:narrative_1612295985064",
    "output_name":"test",
    "output_workspace":"chenry:narrative_1612295985064",
    "clear_existing":0,
    "overwrite_matching":1,
    "save":1,
    "events":[
        {
            "event_id": "annotate_genome:1.8.1:SSO:2020-11-23T17:51:18",
            "original_description": "annotate_genome:2020-11-23T17:51:18:2020-11-23T17:51:18",
            "description": "annotate_genome:2020-11-23T17:51:18:2020-11-23T17:51:18:2020-11-23T17:51:18",
            "ontology_id": "SSO",
            "method": "annotate_genome",
            "method_version": "1.8.1",
            "timestamp": "2020-11-23T17:51:18",
            "ontology_terms":{"sgl0001": [{"term": "SSO:000001563"}]}
        }
    ]
}
#with open('/Users/chenry/ontology_api_input.json') as json_file:
#    ontology_input = json.load(json_file)
#print("Loading ontology terms to genome!")
output = anno_api.add_annotation_ontology_events(ontology_input)

# Comparing Published Models

In [None]:
import sys
import json
import cobra
import cobrakbase
# Client used to fetch model, FBA and phenotype-simulation objects.
kbase_api = cobrakbase.KBaseAPI()

genome_list = ["Sco","Eco","Cbe","Mba"]
# Names of the published model / FBA / phenotype objects for each genome.
pub_model_hash = {"Sco" : "iMK1208", "Eco" : "iML1515.kb", "Cbe" : "iCM925_GF", "Mba" : "iMG746_GF"}
pub_fba_hash = {"Sco" : "iMK1208_FBA", "Eco" : "iML1515.kb_FBA", "Cbe" : "iCM925_FBA", "Mba" : "iMG746_FBA"}
pub_pheno_hash = {"Sco" : "iMK1208_Pheno", "Eco" : "iML1515.kb_Pheno", "Cbe" : "iCM925_Pheno", "Mba" : "iMG746_Pheno"}
# stats[genome][type] holds the counters and ID sets gathered below.
stats = {"Sco":{},"Eco":{},"Cbe":{},"Mba":{}}
# Position in `types` matches position in each per-genome `models` list; index 3 is the published model.
types = ["Best","Union","RAST","Published"]
entities = ["gene","reaction","pospheno"]
print("Species\tType\tReactions\tGenes\tGapfilled\tBlocked\tPospheno\tGene match\tReaction match\tPheno match")
for genome in genome_list:
    #Get:gene associated reactions;genes;gapfilled
    models = [genome+"_Best",genome+"_Union",genome+"_StdRAST_Mdl",pub_model_hash[genome]]
    for count, model in enumerate(models):
        current_object = kbase_api.get_object(model,"patrikd:narrative_1605639637696")
        entry = {
            "reactions":0,
            "gapfilled":0,
            "blocked":0,
            "genes":0,
            "gene_hash":{},
            "reaction_hash":{},
            "pospheno":0,
            "pospheno_hash":{},
            "match_reaction":0,
            "match_gene":0,
            "match_pospheno":0
        }
        stats[genome][types[count]] = entry
        for rxn in current_object["modelreactions"]:
            rxn["id"] = rxn["id"].replace("_z0","_c0")
            proteins = rxn["modelReactionProteins"]
            has_gapfill_data = "gapfill_data" in rxn and len(rxn["gapfill_data"]) > 0
            # Published models carry no gapfill data; there a reaction without proteins counts as gapfilled.
            if has_gapfill_data or (count == 3 and len(proteins) == 0):
                entry["gapfilled"] += 1
            if len(proteins) > 0:
                entry["reactions"] += 1
                entry["reaction_hash"][rxn["id"]] = 1
                for prot in proteins:
                    for subunit in prot["modelReactionProteinSubunits"]:
                        for ftr in subunit["feature_refs"]:
                            # Keep only the feature ID at the end of the reference path.
                            entry["gene_hash"][ftr.split("/")[-1]] = 1
        entry["genes"] = len(entry["gene_hash"])

    #Get:blocked
    models = [genome+"_Best_FBA",genome+"_Union_FBA",genome+"_StdRAST_FBA",pub_fba_hash[genome]]
    for count, model in enumerate(models):
        current_object = kbase_api.get_object(model,"patrikd:narrative_1605639637696")
        stats[genome][types[count]]["blocked"] += sum(
            1 for var in current_object["FBAReactionVariables"] if var["class"] == "Blocked"
        )
    #Get:Neg;Pos
    models = [genome+"_Best_Pheno",genome+"_Union_Pheno",genome+"_StdRAST_Pheno",pub_pheno_hash[genome]]
    for count, model in enumerate(models):
        # The published Sco phenotype object is not fetched.
        if count == 3 and genome == "Sco":
            continue
        current_object = kbase_api.get_object(model,"patrikd:narrative_1605639637696")
        entry = stats[genome][types[count]]
        for pheno in current_object["phenotypeSimulations"]:
            if pheno["simulatedGrowth"] > 0:
                entry["pospheno_hash"][pheno["id"]] = 1
                entry["pospheno"] += 1
    #Computing matches
    for entity in entities:
        published_ids = stats[genome]["Published"][entity+"_hash"]
        for count in range(0,3):
            entry = stats[genome][types[count]]
            entry["match_"+entity] += sum(1 for entid in published_ids if entid in entry[entity+"_hash"])
    #Printing results
    for currtype in types:
        d = stats[genome][currtype]
        columns = ["reactions","genes","gapfilled","blocked","pospheno","match_gene","match_reaction","match_pospheno"]
        print("\t".join([genome, currtype] + [str(d[column]) for column in columns]))

# Testing Ontology API Against Gold Standard Genomes

In [None]:
import sys
import json
import cobra
import cobrakbase
sys.path.append("/Users/chenry/code/MetabolicModelGapfilling/lib/")
#sys.path.append("/Users/chenry/code/annotation_ontology_api/lib")
from annotation_ontology_api.annotation_ontology_apiServiceClient import annotation_ontology_api
#from annotation_ontology_api.annotation_ontology_api import AnnotationOntologyAPI

#Test for ontology API
kbase_api = cobrakbase.KBaseAPI()
#anno_api = AnnotationOntologyAPI({"data_directory" : "/Users/chenry/code/annotation_ontology_api/data/"},kbase_api.ws_client,None)
anno_api = annotation_ontology_api()
genome_list = ["Ani_RAST"]
#genome_list = ["Sco_RAST","Eco_RAST","Cbe_RAST","Syn_RAST","Mba_RAST"]
# Source genome -> name of the multi-annotation genome object that receives the event.
genome_hash = {
    "Eco_RAST": "Eco_RAST_Prokka",
    "Cbe_RAST": "Cbe_RAST_Prokka",
    "Syn_RAST": "Syn_RAST_Prokka",
    "Mba_RAST": "Mba_RAST_Prokka",
    "Sco_RAST": "Sco_RAST_Prokka_BlastKOALA_PTools_DeepEC_DeepGO",
    "Ani_RAST": "Ani_RAST_Prokka"
}
# For each genome: take its first SSO event as a template, rebuild the event's
# terms from the function strings stored on the genome's features and CDSs, and
# save the event onto the target genome object.
for genome in genome_list:
    print(genome)
    ontology_output = anno_api.get_annotation_ontology_events({
        "input_ref" : "patrikd:narrative_1605639637696/"+genome,
    })
    genome_object = kbase_api.get_object(genome,"patrikd:narrative_1605639637696")
    ontology_input = {
        "input_ref":genome_hash[genome],
        "input_workspace":"patrikd:narrative_1605639637696",
        "output_name":genome_hash[genome],
        "output_workspace":"patrikd:narrative_1605639637696",        
        "save":1,
#        "type":"KBaseGenomes.Genome",
#        "object":genome,
        "clear_existing":0,
        "overwrite_matching":1,
        "events":[]
    }
    # Keep only the first SSO event.
    for event in ontology_output["events"]:
        print(event["ontology_id"])
        if event["ontology_id"] == "SSO":
            ontology_input["events"].append(event)
            break
    
    # Snapshot of the events as retrieved, for debugging.
    with open('/Users/chenry/output.json', 'w') as outfile:
        json.dump(ontology_output, outfile, indent=2)
    
    if len(ontology_input["events"]) == 1:
        # Edit the event that is actually submitted; the SSO event is not
        # necessarily the first event of the retrieved list.
        sso_event = ontology_input["events"][0]
        print(str(len(ontology_input["events"])))
        print(sso_event["ontology_id"])
        sso_event["method"] = "RAST annotation"
        sso_event["description"] = "RAST annotation:"+sso_event["ontology_id"]+":"+sso_event["timestamp"]
        sso_event["ontology_terms"] = {}
        # Features and CDSs are handled identically: one term per function string.
        for feature_key in ("features","cdss"):
            for ftr in genome_object[feature_key]:
                if "functions" in ftr:
                    for func in ftr["functions"]:
                        sso_event["ontology_terms"].setdefault(ftr["id"], []).append({
                            "term": "SSO:"+func
                        })
        ontology_output = anno_api.add_annotation_ontology_events(ontology_input)

# Printing SSO reactions

# Printing Super Annotated E. coli

In [None]:
import sys
sys.path.append("/Users/chenry/code/cb_annotation_ontology_api/lib")
import os
import cobra
import cobrakbase
import json
import csv
import logging
import cplex
import optlang
import re
import pandas as pd
from optlang.symbolics import Zero, add
import cobra.util.solver as sutil
from cobrakbase.core.converters import KBaseFBAModelToCobraBuilder
from cobrakbase.Workspace.WorkspaceClient import Workspace as WorkspaceClient
from annotation_ontology_api.annotation_ontology_api import AnnotationOntologyAPI
from cobra.core.dictlist import DictList
from cobra.core import Gene, Metabolite, Model, Reaction
from IPython.core.display import HTML
#Test for ontology API
kbase_api = cobrakbase.KBaseAPI()
anno_api = AnnotationOntologyAPI(
    {"data_directory" : "/Users/chenry/code/cb_annotation_ontology_api/data/"},
    kbase_api.ws_client,
    None,
)

# Fetch every annotation event on the super-annotated E. coli genome and keep a raw JSON copy.
output = anno_api.get_annotation_ontology_events({
    "input_ref" : "Eco_Union_BestUnion_2plus_Best2plus_RASTKEGG.pdb",
    "input_workspace" : 133085
})
with open('EcoliSuperAnnotation', 'w') as outfile:
    json.dump(output, outfile, indent=2)
#Print annotations in tabular form
# Prefix of an event's original description -> column name; checked in this order.
description_prefixes = [
    ("RAST", "RAST"),
    ("Prokka", "Prokka"),
    ("Blast", "Koala"),
    ("Pathway", "PathwayTools"),
    ("DeepEC", "DeepEC"),
    ("DeepGO", "DeepGO"),
    ("KBA", "PDB"),
]
# annotations[gene][modelseed_id][source] -> list of distinct terms
annotations = {}
for event in output["events"]:
    name = None
    for prefix, label in description_prefixes:
        if event["original_description"].startswith(prefix):
            name = label
            break
    if not name:
        continue
    for gene in event["ontology_terms"]:
        for item in event["ontology_terms"][gene]:
            if "modelseed_ids" not in item:
                continue
            by_reaction = annotations.setdefault(gene, {})
            for msid in item["modelseed_ids"]:
                term_list = by_reaction.setdefault(msid, {}).setdefault(name, [])
                if item["term"] not in term_list:
                    term_list.append(item["term"])
#Loading and saving dataframe
annos = ["RAST","Prokka","Koala","PathwayTools","DeepEC","DeepGO","PDB"]
data = {column: [] for column in ["Gene","Reactions"] + annos}
# One row per (gene, reaction); a source column is None when that source has no term.
for gene, by_reaction in annotations.items():
    for rxn, by_source in by_reaction.items():
        data["Gene"].append(gene)
        data["Reactions"].append(rxn)
        for anno in annos:
            data[anno].append(",".join(by_source[anno]) if anno in by_source else None)
df = pd.DataFrame(data)
df.to_csv("EcoliSuperAnnotated.csv")

In [16]:
# Retrieve the ontology events stored on genome Pf5.6.
ontology = anno_api.get_annotation_ontology_events(
    {"input_ref" : "Pf5.6", "input_workspace" : 77925}
)
# Keep JSON copies of the API's alias table and of the unmodified events.
with open('/Users/chenry/translation.json', 'w') as outfile:
    json.dump(anno_api.alias_hash, outfile, indent=2)
with open('/Users/chenry/output.json', 'w') as outfile:
    json.dump(ontology, outfile, indent=2)

# Relabel the first event as "SEED"; for each gene, tag its first term and drop the "PREFIX:" part.
first_event = ontology["events"][0]
terms = first_event["ontology_terms"]
first_event["ontology_id"] = "SEED"
for gene in terms:
    leading_term = terms[gene][0]
    leading_term["evidence"] = "test"
    leading_term["term"] = leading_term["term"].split(":")[1]

# JSON copy of the events after the rewrite.
with open('/Users/chenry/output2.json', 'w') as outfile:
    json.dump(ontology, outfile, indent=2)

# Save the rewritten events onto another genome object.
add_request = {
    "input_ref" : "GCF_000012265.1",
    "input_workspace" : 77925,
    "output_name" : "TestOntologyOutput",
    "events" : ontology["events"],
    "output_workspace": "kimbrel1:narrative_1606152384556",
    "save" : 1
}
output = anno_api.add_annotation_ontology_events(add_request)

#with open('/Users/chenry/genome.json', 'w') as outfile:
#    json.dump(output["object"], outfile, indent=2)

NameError: name 'anno_api' is not defined

# Comparing SSO-template reactions and genes against model reactions per genome

In [None]:
# Inputs (structure as used below):
#   sso_hash:      genome -> gene -> iterable of SSO terms
#   sso_template:  SSO term -> iterable of reactions
#   reaction_hash: genome -> reaction -> iterable of genes
with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/genome_sso.json') as json_file:
    sso_hash = json.load(json_file)

with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/SSO_reactions.json') as json_file:
    sso_template = json.load(json_file)

with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/genome_reactions.json') as json_file:
    reaction_hash = json.load(json_file)

# NOTE(review): loaded but not used anywhere in this cell.
with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/genome_functions.json') as json_file:
    function_hash = json.load(json_file)

# NOTE(review): never populated, so problem_functions.json is written as an empty object.
functions = dict()
# Per genome: reactions/genes implied by the SSO template ("SS") versus those
# present in the genome's reaction table ("MS"), and what each side has extra.
comparison = dict()
for genome in sso_hash:
    if genome not in reaction_hash:
        continue
    # Reactions and genes implied by mapping the genome's SSO terms through the template.
    sso_based_reactions = dict()
    sso_based_genes = dict()
    for gene in sso_hash[genome]:
        for sso in sso_hash[genome][gene]:
            if sso in sso_template:
                for reaction in sso_template[sso]:
                    sso_based_reactions.setdefault(reaction, dict())[gene] = 1
                    sso_based_genes.setdefault(gene, dict())[reaction] = 1
    # Genes referenced by the genome's reaction table.
    ms_based_genes = dict()
    for reaction in reaction_hash[genome]:
        for gene in reaction_hash[genome][reaction]:
            ms_based_genes.setdefault(gene, dict())[reaction] = 1
    extra_ms_reactions = [rxn for rxn in reaction_hash[genome] if rxn not in sso_based_reactions]
    extra_ss_reactions = [rxn for rxn in sso_based_reactions if rxn not in reaction_hash[genome]]
    extra_ms_genes = [gene for gene in ms_based_genes if gene not in sso_based_genes]
    extra_ss_genes = [gene for gene in sso_based_genes if gene not in ms_based_genes]
    comparison[genome] = {
        "SSO_reactions": len(sso_based_reactions),
        "SSO_genes": len(sso_based_genes),
        "Extra_SS_reactions": extra_ss_reactions,
        "Extra_SS_genes": extra_ss_genes,
        "Extra_MS_reactions": extra_ms_reactions,
        "Extra_MS_genes": extra_ms_genes,
        "Extra_SS_reactions_counts": len(extra_ss_reactions),
        "Extra_SS_genes_counts": len(extra_ss_genes),
        "Extra_MS_reactions_counts": len(extra_ms_reactions),
        "Extra_MS_genes_counts": len(extra_ms_genes),
        "MS_reactions": len(reaction_hash[genome]),
        "MS_genes": len(ms_based_genes),
    }

with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/comparison.json', 'w') as outfile:
    json.dump(comparison, outfile)
    
with open('/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/problem_functions.json', 'w') as outfile:
    json.dump(functions, outfile)

# Computing reaction gene associations from all models in workspace

In [None]:
# Collect reaction -> gene associations from every model in the workspace whose
# name ends in ".RAST.mdl.base" and write them to <out_dir>/genome_reactions.json.
# Result shape: reaction_hash[genome_id][base_reaction_id][feature_id] = 0
objects = msrecon.kbase_api.list_objects("chenry:narrative_1581959452634")
reaction_hash = dict()
count = 0  # number of models processed
for obj in objects:
    # obj[1] is used as the object name; the genome ID is the name minus the 14-character suffix.
    if obj[1][-14:] == ".RAST.mdl.base":
        count += 1
        genomeid = obj[1][0:-14]
        reaction_hash[genomeid] = dict()
        # Fetch through the same client that produced the listing; no `kbase`
        # name is defined in this cell.
        model = msrecon.kbase_api.get_from_ws(obj[1],"chenry:narrative_1581959452634")
        for rxn in model.reactions:
            # Reaction IDs are reduced to the part before the first "_".
            # setdefault keeps genes already recorded when several reactions
            # share the same base ID.
            genes = reaction_hash[genomeid].setdefault(rxn.id.split("_")[0], dict())
            for prot in rxn.data["modelReactionProteins"]:
                for subunit in prot["modelReactionProteinSubunits"]:
                    for ftr in subunit["feature_refs"]:
                        # Keep only the feature ID at the end of the reference path.
                        ftrid = ftr.split("/").pop()
                        genes[ftrid] = 0

with open(kbdevutil.out_dir()+"genome_reactions.json", 'w') as outfile:
    json.dump(reaction_hash, outfile)