# Configure KBase Jupyter Dev Environment
<sub><sup>(contact chenry@anl.gov with questions)</sup></sub>

In [None]:
import platform
print("python version " + platform.python_version())
import sys
import json
import os
import pandas as pd
from os.path import exists
from pathlib import Path
import logging

# Put the shared utility checkout at the FRONT of the module search path so the
# import on the next line resolves to it ahead of any other installed copy.
sys.path = ["/scratch/shared/code/chenry_utility_module/lib"] + sys.path
from chenry_utility_module.kbdevutils import KBDevUtils
# Helper object used throughout this notebook (object fetches, output directory,
# API clients). NOTE(review): "Ontology" looks like a project/notebook name — confirm.
kbdevutil = KBDevUtils("Ontology")

from modelseedpy import ModelSEEDBiochem
from modelseedpy.core.mstemplate import MSTemplateBuilder
from modelseedpy.core.annotationontology import convert_to_search_role,split_role
from modelseedpy.helpers import get_template
import cobra
import cobrakbase

# Notebook-level logger, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Clients obtained from the dev utilities; `msrecon.kbase_api` and `annoapi`
# are used by later cells.
msrecon = kbdevutil.msseedrecon()
annoapi = kbdevutil.anno_client(native_python_api=True)

# Printing annotation comparison table

In [None]:
# Fetch every annotation-ontology event stored on the curated SwissProt protein
# set and cache the raw response as JSON in the notebook's output directory.
swissprot_ref = "154984/SwissProtCuratedProteins.RAST.Prokka.DRAM.Rhea"
output = annoapi.get_annotation_ontology_events({"input_ref": swissprot_ref})
swissprot_json_path = kbdevutil.out_dir() + "SwissProtAnnot.json"
with open(swissprot_json_path, 'w') as annot_file:
    json.dump(output, annot_file)

In [None]:
# Handle on the ModelSEED biochemistry data.
biochem = ModelSEEDBiochem.get()

# Workspace reference of the protein list for each taxonomic group.
domain_specific_lists = {
    "Fungi": "154984/SwissProt_Rhea_Fungi",
    "Other": "154984/SwissProt_Rhea_Other",
    "Viridiplantae": "154984/SwissProt_Rhea_Viridiplantae",
    "Archaea": "154984/SwissProt_Rhea_Archaea",
    "Bacteria": "154984/SwissProt_Rhea_Bacteria",
    "Metazoa": "154984/SwissProt_Rhea_Metazoa",
}

# Invert the lists into a flat lookup: sequence id -> group name.
# A sequence id that appears in several lists keeps the group seen last.
domain_proteins = {}
for domain, list_ref in domain_specific_lists.items():
    data = kbdevutil.get_object(list_ref)
    domain_proteins.update({entry["id"]: domain for entry in data["sequences"]})

# Testing the annotation ontology API

In [None]:
# Scratch test of the annotation ontology API:
#   1. replay a saved request from debug.json,
#   2. fetch the events of a genome and dump them to output.json,
#   3. rewrite the first event as a "SEED" test event and save it on another genome,
#   4. read the saved object back and dump it,
#   5. submit a small hand-written SSO event.
# NOTE: this rebinds the notebook-wide `kbdevutil`, so later cells use this instance.
# NOTE(review): ws_version="appdev" presumably selects the appdev environment — confirm.
kbdevutil = KBDevUtils("Ontology",ws_version="appdev")
appdev_annoapi = kbdevutil.anno_client(native_python_api=True)
# Every call below goes through `anno_api`. Bind it to the client created above;
# otherwise the name is undefined here (or a stale client from a previously run cell).
anno_api = appdev_annoapi

with open('debug.json') as json_file:
    input_data = json.load(json_file)
output = anno_api.add_annotation_ontology_events(input_data)

# The fetched events are the ones rewritten below, so keep them under `ontology`
# (`output` is kept as an alias because that name is the one dumped to disk).
ontology = anno_api.get_annotation_ontology_events({
    "input_ref" : "102004/Methanosarcina_acetivorans_C2A_DRAM_RAST"
#    "input_ref" : "93487/Ruepo_2orMoreRKM"
#    "input_ref" : "77537/Sco_RAST_Prokka_BlastKOALA_PTools_DeepEC_DeepGO"
#    "input_ref" : "77537/Sco_Union_BestUnion_2plus_Best2plus_RASTKEGG"
#    "input_ref" : "77925/Pf5.6"#,
#    "input_workspace" : 
})
output = ontology
with open('output.json', 'w') as outfile:
    json.dump(output, outfile, indent=2)

# Turn the first event into a test event: relabel its ontology as SEED, tag the
# first term of every gene with test evidence, and strip the "<prefix>:" from
# that term (assumes every such term contains a ':').
terms = ontology["events"][0]["ontology_terms"]
ontology["events"][0]["ontology_id"] = "SEED"
for gene in terms:
    first_term = terms[gene][0]
    first_term["evidence"] = "test"
    first_term["term"] = first_term["term"].split(":")[1]

# Save the modified events onto a different genome object.
output = anno_api.add_annotation_ontology_events({
    "input_ref" : "GCF_000012265.1",
    "input_workspace" : 77925,
    "output_name" : "TestOntologyOutput",
    "events" : ontology["events"],
    "output_workspace": "kimbrel1:narrative_1606152384556",
    "save" : 1
})

# Read the saved object back to check the round trip.
ontology = anno_api.get_annotation_ontology_events({
    "input_ref" : "TestOntologyOutput",
    "input_workspace" : "kimbrel1:narrative_1606152384556"
})

with open('/Users/chenry/output.json', 'w') as outfile:
    json.dump(ontology, outfile, indent=2)

# Other genome names usable as "input_ref" below:
#Escherichia_coli_K-12_MG1655
#Synechocystis_PCC_6803
#Methanosarcina_barkeri_Fusaro
#Clostridium_beijerinckii_NCIMB_8052
#Streptomyces_coelicolor_A3_2

# Minimal hand-written request: one SSO event carrying a single term on one gene.
ontology_input = {
    "input_ref":"Streptomyces_coelicolor_A3_2",
    "input_workspace":"chenry:narrative_1612295985064",
    "output_name":"test",
    "output_workspace":"chenry:narrative_1612295985064",
    "clear_existing":0,
    "overwrite_matching":1,
    "save":1,
    "events":[
        {
            "event_id": "annotate_genome:1.8.1:SSO:2020-11-23T17:51:18",
            "original_description": "annotate_genome:2020-11-23T17:51:18:2020-11-23T17:51:18",
            "description": "annotate_genome:2020-11-23T17:51:18:2020-11-23T17:51:18:2020-11-23T17:51:18",
            "ontology_id": "SSO",
            "method": "annotate_genome",
            "method_version": "1.8.1",
            "timestamp": "2020-11-23T17:51:18",
            "ontology_terms":{"sgl0001": [{"term": "SSO:000001563"}]}
        }
    ]
}
#with open('/Users/chenry/ontology_api_input.json') as json_file:
#    ontology_input = json.load(json_file)
#print("Loading ontology terms to genome!")
output = anno_api.add_annotation_ontology_events(ontology_input)

# Comparing Published Models

In [None]:
import sys
import json
import cobra
import cobrakbase
# For each reference organism, compare three reconstructed models (Best, Union,
# standard RAST) against the published model and print one tab-separated row per
# model: size statistics plus how many genes / reactions / positive phenotypes
# each reconstruction shares with the published model.
kbase_api = cobrakbase.KBaseAPI()

genome_list = ["Sco", "Eco", "Cbe", "Mba"]
# Object names of the published model, its FBA result and its phenotype
# simulation for every organism.
pub_model_hash = {
    "Sco": "iMK1208",
    "Eco": "iML1515.kb",
    "Cbe": "iCM925_GF",
    "Mba": "iMG746_GF",
}
pub_fba_hash = {
    "Sco": "iMK1208_FBA",
    "Eco": "iML1515.kb_FBA",
    "Cbe": "iCM925_FBA",
    "Mba": "iMG746_FBA",
}
pub_pheno_hash = {
    "Sco": "iMK1208_Pheno",
    "Eco": "iML1515.kb_Pheno",
    "Cbe": "iCM925_Pheno",
    "Mba": "iMG746_Pheno",
}
stats = {"Sco": {}, "Eco": {}, "Cbe": {}, "Mba": {}}
types = ["Best", "Union", "RAST", "Published"]
entities = ["gene", "reaction", "pospheno"]
print("Species\tType\tReactions\tGenes\tGapfilled\tBlocked\tPospheno\tGene match\tReaction match\tPheno match")

source_workspace = "patrikd:narrative_1605639637696"
# Order of the numeric columns in each printed row (after species and type).
report_columns = [
    "reactions", "genes", "gapfilled", "blocked", "pospheno",
    "match_gene", "match_reaction", "match_pospheno",
]

for genome in genome_list:
    genome_stats = stats[genome]

    # Models: gene-associated reactions, genes, gapfilled reactions.
    model_names = [genome + "_Best", genome + "_Union", genome + "_StdRAST_Mdl", pub_model_hash[genome]]
    for position, (model_type, model_name) in enumerate(zip(types, model_names)):
        model_object = kbase_api.get_object(model_name, source_workspace)
        tally = {
            "reactions": 0,
            "gapfilled": 0,
            "blocked": 0,
            "genes": 0,
            "gene_hash": {},
            "reaction_hash": {},
            "pospheno": 0,
            "pospheno_hash": {},
            "match_reaction": 0,
            "match_gene": 0,
            "match_pospheno": 0,
        }
        genome_stats[model_type] = tally
        is_published = position == 3
        for rxn in model_object["modelreactions"]:
            # Normalise the "_z0" suffix to "_c0" so ids line up across models.
            rxn["id"] = rxn["id"].replace("_z0", "_c0")
            has_gapfill_data = "gapfill_data" in rxn and len(rxn["gapfill_data"]) > 0
            proteins = rxn["modelReactionProteins"]
            # For the published model, a reaction without any protein is
            # counted as gapfilled as well.
            if has_gapfill_data or (is_published and len(proteins) == 0):
                tally["gapfilled"] += 1
            if len(proteins) == 0:
                continue
            tally["reactions"] += 1
            tally["reaction_hash"][rxn["id"]] = 1
            for protein in proteins:
                for subunit in protein["modelReactionProteinSubunits"]:
                    for feature_ref in subunit["feature_refs"]:
                        # Keep only the last path component of the reference.
                        tally["gene_hash"][feature_ref.rsplit("/", 1)[-1]] = 1
        tally["genes"] = len(tally["gene_hash"])

    # FBA results: blocked reactions.
    fba_names = [genome + "_Best_FBA", genome + "_Union_FBA", genome + "_StdRAST_FBA", pub_fba_hash[genome]]
    for model_type, fba_name in zip(types, fba_names):
        fba_object = kbase_api.get_object(fba_name, source_workspace)
        genome_stats[model_type]["blocked"] += sum(
            1 for variable in fba_object["FBAReactionVariables"] if variable["class"] == "Blocked"
        )

    # Phenotype simulations: conditions with simulated growth.
    pheno_names = [genome + "_Best_Pheno", genome + "_Union_Pheno", genome + "_StdRAST_Pheno", pub_pheno_hash[genome]]
    for model_type, pheno_name in zip(types, pheno_names):
        # The published Sco phenotype object is deliberately not fetched.
        if model_type == "Published" and genome == "Sco":
            continue
        pheno_object = kbase_api.get_object(pheno_name, source_workspace)
        for pheno in pheno_object["phenotypeSimulations"]:
            if pheno["simulatedGrowth"] > 0:
                genome_stats[model_type]["pospheno_hash"][pheno["id"]] = 1
                genome_stats[model_type]["pospheno"] += 1

    # Matches: ids in the published model that a reconstruction also contains.
    published_stats = genome_stats["Published"]
    for entity in entities:
        hash_key = entity + "_hash"
        for model_type in types[:3]:
            candidate_ids = genome_stats[model_type][hash_key]
            genome_stats[model_type]["match_" + entity] += sum(
                1 for entid in published_stats[hash_key] if entid in candidate_ids
            )

    # Report one row per model type.
    for currtype in types:
        d = genome_stats[currtype]
        print("\t".join([genome, currtype] + [str(d[column]) for column in report_columns]))

# Testing Ontology API Against Gold Standard Genomes

In [None]:
import sys
import json
import cobra
import cobrakbase
sys.path.append("/Users/chenry/code/MetabolicModelGapfilling/lib/")
#sys.path.append("/Users/chenry/code/annotation_ontology_api/lib")
from annotation_ontology_api.annotation_ontology_apiServiceClient import annotation_ontology_api
#from annotation_ontology_api.annotation_ontology_api import AnnotationOntologyAPI

#Test for ontology API
# For each gold-standard genome: fetch its annotation events, pick the SSO event,
# rebuild that event's terms from the functions stored on the genome's features
# and CDSs, and save the result onto the matching "<genome>_Prokka..." object.
#Test for ontology API
kbase_api = cobrakbase.KBaseAPI()
#anno_api = AnnotationOntologyAPI({"data_directory" : "/Users/chenry/code/annotation_ontology_api/data/"},kbase_api.ws_client,None)
anno_api = annotation_ontology_api()
genome_list = ["Ani_RAST"]
#genome_list = ["Sco_RAST","Eco_RAST","Cbe_RAST","Syn_RAST","Mba_RAST"]
# Source genome -> name of the object the rebuilt event is written to.
genome_hash = {
    "Eco_RAST": "Eco_RAST_Prokka",
    "Cbe_RAST": "Cbe_RAST_Prokka",
    "Syn_RAST": "Syn_RAST_Prokka",
    "Mba_RAST": "Mba_RAST_Prokka",
    "Sco_RAST": "Sco_RAST_Prokka_BlastKOALA_PTools_DeepEC_DeepGO",
    "Ani_RAST": "Ani_RAST_Prokka"
}
for genome in genome_list:
    print(genome)
    ontology_output = anno_api.get_annotation_ontology_events({
        "input_ref" : "patrikd:narrative_1605639637696/"+genome,
    })
    genome_object = kbase_api.get_object(genome,"patrikd:narrative_1605639637696")
    ontology_input = {
        "input_ref":genome_hash[genome],
        "input_workspace":"patrikd:narrative_1605639637696",
        "output_name":genome_hash[genome],
        "output_workspace":"patrikd:narrative_1605639637696",
        "save":1,
#        "type":"KBaseGenomes.Genome",
#        "object":genome,
        "clear_existing":0,
        "overwrite_matching":1,
        "events":[]
    }
    # Submit only the first SSO event found on the source genome.
    for event in ontology_output["events"]:
        print(event["ontology_id"])
        if event["ontology_id"] == "SSO":
            ontology_input["events"].append(event)
            break

    # Keep a copy of the unmodified response for inspection.
    with open('/Users/chenry/output.json', 'w') as outfile:
        json.dump(ontology_output, outfile, indent=2)

    if len(ontology_input["events"]) == 1:
        print(str(len(ontology_input["events"])))
        print(ontology_input["events"][0]["ontology_id"])
        # Edit the event that is actually submitted. The SSO event is not
        # necessarily the first event of the response, so addressing
        # ontology_output["events"][0] could relabel and empty an unrelated
        # event while leaving the submitted one with its old terms.
        sso_event = ontology_input["events"][0]
        sso_event["method"] = "RAST annotation"
        sso_event["description"] = "RAST annotation:"+sso_event["ontology_id"]+":"+sso_event["timestamp"]
        sso_event["ontology_terms"] = {}
        # Features and CDSs are handled identically: every function string on
        # an entry becomes one "SSO:<function>" term under that entry's id.
        for collection in ("features", "cdss"):
            for ftr in genome_object[collection]:
                if "functions" in ftr:
                    for func in ftr["functions"]:
                        sso_event["ontology_terms"].setdefault(ftr["id"], []).append({
                            "term": "SSO:"+func
                        })
        ontology_output = anno_api.add_annotation_ontology_events(ontology_input)

# Printing SSO reactions

# Printing Super Annotated E. coli

In [None]:
import sys
sys.path.append("/Users/chenry/code/cb_annotation_ontology_api/lib")
import os
import cobra
import cobrakbase
import json
import csv
import logging
import cplex
import optlang
import re
import pandas as pd
from optlang.symbolics import Zero, add
import cobra.util.solver as sutil
from cobrakbase.core.converters import KBaseFBAModelToCobraBuilder
from cobrakbase.Workspace.WorkspaceClient import Workspace as WorkspaceClient
from annotation_ontology_api.annotation_ontology_api import AnnotationOntologyAPI
from cobra.core.dictlist import DictList
from cobra.core import Gene, Metabolite, Model, Reaction
from IPython.core.display import HTML
#Test for ontology API
# Pull all annotation events of the "super annotated" E. coli genome, keep the
# raw response on disk, and write a CSV with one row per (gene, reaction) pair
# listing which annotation source proposed the reaction and through which terms.
#Test for ontology API
kbase_api = cobrakbase.KBaseAPI()
anno_api = AnnotationOntologyAPI(
    {"data_directory" : "/Users/chenry/code/cb_annotation_ontology_api/data/"},
    kbase_api.ws_client,
    None,
)

output = anno_api.get_annotation_ontology_events({
    "input_ref" : "Eco_Union_BestUnion_2plus_Best2plus_RASTKEGG.pdb",
    "input_workspace" : 133085
})
with open('EcoliSuperAnnotation', 'w') as raw_file:
    json.dump(output, raw_file, indent=2)

# (prefix of the event's original description, column name used in the table)
source_by_prefix = (
    ("RAST", "RAST"),
    ("Prokka", "Prokka"),
    ("Blast", "Koala"),
    ("Pathway", "PathwayTools"),
    ("DeepEC", "DeepEC"),
    ("DeepGO", "DeepGO"),
    ("KBA", "PDB"),
)

# annotations[gene][modelseed id][source] -> list of distinct terms
annotations = {}
for event in output["events"]:
    description = event["original_description"]
    name = next(
        (source for prefix, source in source_by_prefix if description.startswith(prefix)),
        None,
    )
    if name is None:
        # Events from any other source are left out of the table.
        continue
    for gene, gene_terms in event["ontology_terms"].items():
        for item in gene_terms:
            if "modelseed_ids" not in item:
                continue
            per_gene = annotations.setdefault(gene, {})
            for msid in item["modelseed_ids"]:
                source_terms = per_gene.setdefault(msid, {}).setdefault(name, [])
                if item["term"] not in source_terms:
                    source_terms.append(item["term"])

#Loading and saving dataframe
annos = ["RAST","Prokka","Koala","PathwayTools","DeepEC","DeepGO","PDB"]
data = {column: [] for column in ["Gene", "Reactions"] + annos}
for gene, per_reaction in annotations.items():
    for rxn, per_source in per_reaction.items():
        data["Gene"].append(gene)
        data["Reactions"].append(rxn)
        for anno in annos:
            source_terms = per_source.get(anno)
            # None marks a source that did not propose this reaction for the gene.
            data[anno].append(None if source_terms is None else ",".join(source_terms))
df = pd.DataFrame(data)
df.to_csv("EcoliSuperAnnotated.csv")

In [None]:
# Fetch the events of genome Pf5.6, snapshot the API's alias table and the raw
# events, rewrite the first event as a "SEED" test event, snapshot again, then
# save the events onto another genome as "TestOntologyOutput".
pf5_query = {"input_ref": "Pf5.6", "input_workspace": 77925}
ontology = anno_api.get_annotation_ontology_events(pf5_query)
with open('/Users/chenry/translation.json', 'w') as alias_file:
    json.dump(anno_api.alias_hash, alias_file, indent=2)
with open('/Users/chenry/output.json', 'w') as events_file:
    json.dump(ontology, events_file, indent=2)

first_event = ontology["events"][0]
terms = first_event["ontology_terms"]
first_event["ontology_id"] = "SEED"
for gene_terms in terms.values():
    # Only the first term of each gene is touched: it gets test evidence and
    # loses its "<prefix>:" (the piece right after the first ':' is kept).
    leading_term = gene_terms[0]
    leading_term["evidence"] = "test"
    leading_term["term"] = leading_term["term"].split(":")[1]

with open('/Users/chenry/output2.json', 'w') as modified_file:
    json.dump(ontology, modified_file, indent=2)

save_request = {
    "input_ref": "GCF_000012265.1",
    "input_workspace": 77925,
    "output_name": "TestOntologyOutput",
    "events": ontology["events"],
    "output_workspace": "kimbrel1:narrative_1606152384556",
    "save": 1,
}
output = anno_api.add_annotation_ontology_events(save_request)

#with open('/Users/chenry/genome.json', 'w') as outfile:
#    json.dump(output["object"], outfile, indent=2)

# Not sure what this code is doing

In [None]:
# Per genome, compare the reactions and genes implied by SSO annotations (via
# the SSO -> reactions template) with the reactions and genes stored in
# genome_reactions.json, then write the differences to comparison.json.
#
# Input shapes, as used below:
#   sso_hash:      genome -> gene -> iterable of SSO ids
#   sso_template:  SSO id -> iterable of reactions
#   reaction_hash: genome -> reaction -> iterable of genes
# In the result, "SS" entries come from the SSO side and "MS" entries from
# reaction_hash (presumably ModelSEED models — confirm). "Extra_X_*" lists what
# side X has and the other side lacks.
template_dir = '/Users/chenry/Dropbox/workspace/KBase Project/TemplateFunctions/'

with open(template_dir+'genome_sso.json') as json_file:
    sso_hash = json.load(json_file)

with open(template_dir+'SSO_reactions.json') as json_file:
    sso_template = json.load(json_file)

with open(template_dir+'genome_reactions.json') as json_file:
    reaction_hash = json.load(json_file)

# Loaded but not used by the comparison below.
with open(template_dir+'genome_functions.json') as json_file:
    function_hash = json.load(json_file)

# Never populated in this cell, so problem_functions.json is written as "{}".
functions = dict()
comparison = dict()
for genome in sso_hash:
    # Only genomes present on both sides can be compared.
    if genome not in reaction_hash:
        continue

    # SSO side: reaction -> genes and gene -> reactions, through the template.
    sso_based_reactions = dict()
    sso_based_genes = dict()
    for gene in sso_hash[genome]:
        for sso in sso_hash[genome][gene]:
            if sso in sso_template:
                for reaction in sso_template[sso]:
                    sso_based_reactions.setdefault(reaction, dict())[gene] = 1
                    sso_based_genes.setdefault(gene, dict())[reaction] = 1

    comparison[genome] = {
        "SSO_reactions": len(sso_based_reactions),
        "SSO_genes": len(sso_based_genes),
        "Extra_SS_reactions": [],
        "Extra_SS_genes": [],
        "Extra_MS_reactions": [],
        "Extra_MS_genes": [],
        "Extra_SS_reactions_counts": 0,
        "Extra_SS_genes_counts": 0,
        "Extra_MS_reactions_counts": 0,
        "Extra_MS_genes_counts": 0,
        "MS_reactions": len(reaction_hash[genome]),
        # Placeholder; set once ms_based_genes has been collected below.
        # (The original line lacked the ':' and was a syntax error.)
        "MS_genes": 0,
    }
    result = comparison[genome]

    # Model side: collect genes, and note reactions the SSO side lacks.
    ms_based_genes = dict()
    for reaction in reaction_hash[genome]:
        if reaction not in sso_based_reactions:
            result["Extra_MS_reactions"].append(reaction)
            result["Extra_MS_reactions_counts"] += 1
        for gene in reaction_hash[genome][reaction]:
            ms_based_genes.setdefault(gene, dict())[reaction] = 1
    for reaction in sso_based_reactions:
        if reaction not in reaction_hash[genome]:
            result["Extra_SS_reactions"].append(reaction)
            result["Extra_SS_reactions_counts"] += 1

    result["MS_genes"] = len(ms_based_genes)
    for gene in ms_based_genes:
        if gene not in sso_based_genes:
            result["Extra_MS_genes"].append(gene)
            result["Extra_MS_genes_counts"] += 1
    for gene in sso_based_genes:
        if gene not in ms_based_genes:
            result["Extra_SS_genes"].append(gene)
            result["Extra_SS_genes_counts"] += 1

with open(template_dir+'comparison.json', 'w') as outfile:
    json.dump(comparison, outfile)

with open(template_dir+'problem_functions.json', 'w') as outfile:
    json.dump(functions, outfile)

# Computing reaction gene associations from all models in workspace

In [None]:
# Build genome -> reaction -> {gene: 0} from every "<genome>.RAST.mdl.base"
# model in the workspace and write it to genome_reactions.json in the
# notebook's output directory.
model_workspace = "chenry:narrative_1581959452634"
model_suffix = ".RAST.mdl.base"

objects = msrecon.kbase_api.list_objects(model_workspace)
reaction_hash = dict()
count = 0  # number of models processed
for obj in objects:
    # obj[1] is used as the object name — presumably the KBase object-info
    # tuple layout; confirm against list_objects.
    if obj[1].endswith(model_suffix):
        count += 1
        genomeid = obj[1][0:-len(model_suffix)]
        reaction_hash[genomeid] = dict()
        # Use the same client that produced the listing; the bare name `kbase`
        # is not defined anywhere in this notebook.
        model = msrecon.kbase_api.get_from_ws(obj[1], model_workspace)
        for rxn in model.reactions:
            # Key by the id part before the first "_". Several model reactions
            # can share that part, so merge their genes instead of starting a
            # fresh dict (which dropped the genes collected earlier).
            base_id = rxn.id.split("_")[0]
            gene_map = reaction_hash[genomeid].setdefault(base_id, dict())
            for prot in rxn.data["modelReactionProteins"]:
                for subunit in prot["modelReactionProteinSubunits"]:
                    for ftr in subunit["feature_refs"]:
                        # Keep only the last path component of the reference.
                        ftrid = ftr.split("/").pop()
                        gene_map[ftrid] = 0

with open(kbdevutil.out_dir()+"genome_reactions.json", 'w') as outfile:
    json.dump(reaction_hash, outfile)