# TODO - assemble a representative pangenome model for all models that map to a 16S sequence

In [1]:
# determine the frequency of reactions in all models that are mapped to a single 16S genome
# filter reactions that are below a frequency threshold

In [1]:
from json import load, dump
from pandas import read_excel

spreadsheet = read_excel(r"MicrobeID_to_GCF_mapping_AGORA2.xlsx", "Sheet1")
model_gcf_mapping = dict(zip(spreadsheet["GCF ID"].to_numpy(), spreadsheet["MicrobeID"].to_numpy()))
print(model_gcf_mapping)
with open("model_gcf_mapping.json", "w") as jsonOut: dump(model_gcf_mapping, jsonOut, indent=3)

with open("AGORA2 to ASV Exact Matches/AGORA2_6_14_23_16S_ASV_to_Genome.json", "r") as jsonIn:
    asv_to_genomes = load(jsonIn)
    
unique_asv_mappings = {k: list(set(v)) for k,v in asv_to_genomes.items()}
print(unique_asv_mappings)
with open("unique_asv_mappings.json", "w") as jsonOut:
    dump(unique_asv_mappings, jsonOut, indent=3)

{'GCF_000160075.2': 'Abiotrophia_defectiva_ATCC_49176', 'GCF_000376245.1': 'Acaricomes_phytoseiuli_DSM_14247', 'GCF_000018105.1': 'Acaryochloris_marina_MBIC11017', 'GCF_900103835.1': 'Acetanaerobacterium_elongatum_CGMCC_1_5012', 'GCF_900248245.1': 'Acetatifactor_muris_GP69', 'GCF_000179595.2': 'Acetivibrio_cellulolyticus_CD2', 'GCF_001461035.1': 'Acetivibrio_ethanolgignens_ACET_33324', 'GCF_001766835.1': 'Acetobacterium_wieringae_DSM_1911', 'GCF_000219125.1': 'Acetonema_longum_DSM_6540', 'GCF_000219745.1': 'Achromobacter_insuavis_AXX_A', 'GCF_000165835.1': 'Achromobacter_xylosoxidans_A8', 'GCF_001598595.1': 'Achromobacter_xylosoxidans_NBRC_15126', 'GCF_001457475.1': 'Achromobacter_xylosoxidans_NCTC10807', 'GCF_900103005.1': 'Acidaminobacter_hydrogenoformans_DSM_2784', 'GCF_000025305.1': 'Acidaminococcus_fermentans_DSM_20731', 'GCF_000230275.1': 'Acidaminococcus_intestini_RyC_MR95', 'GCF_900095825.1': 'Acidaminococcus_massiliensis_Marseille_P2828', 'GCF_000468835.1': 'Acidaminococcus_sp

In [3]:
from cobra.io import read_sbml_model
from json import dump

broken_model_gcfs = []
for asv, gcfs in unique_asv_mappings.items():
    reaction_counts_per_asv = {}
    print("\n", asv)
    for gcf in gcfs:
        try:
            model = read_sbml_model("/vol/ml/AGORA2/" + f"{model_gcf_mapping[gcf]}.xml")
            for rxn in model.reactions:
                if rxn.id in reaction_counts_per_asv:  reaction_counts_per_asv[rxn.id] += 1
                else: reaction_counts_per_asv[rxn.id] = 1
            print(model.id)
        except Exception as e:
            print("broken", gcf, model_gcf_mapping[gcf])
            broken_model_gcfs.append(gcf)
    if reaction_counts_per_asv == {}:  continue
    reaction_counts_per_asv = {rxnID:(count/len(gcfs)) for rxnID,count in reaction_counts_per_asv.items()}
    reaction_counts_per_asv["numGCFs"] = len(gcfs)
    with open(f"reaction_counts_per_asv/{asv}.json", "w") as jsonOut:
        dump(reaction_counts_per_asv, jsonOut, indent=3)


 ASV00001;seqs=10697401;samples=1978
M_Blautia_coccoides_YL58
M_Blautia_sp_YL58

 ASV00001;seqs=15073372;samples=2336
M_Blautia_coccoides_YL58
M_Blautia_sp_YL58

 ASV00002;seqs=7303020;samples=1403
M_Enterococcus_faecium_VAN_476
M_Enterococcus_faecium_VRE_110
M_Enterococcus_faecium_7330519_3
M_Enterococcus_faecium_HF50215
M_Enterococcus_faecium_TC_6
M_Enterococcus_faecium_7330446_2
M_Enterococcus_faecium_EnGen0318
M_Enterococcus_faecium_E2620
broken GCF_000322445.1 Enterococcus_faecium_E6012
M_Enterococcus_faecium_E1162
M_Enterococcus_faecium_UAA721
M_Enterococcus_faecium_109_A1
M_Enterococcus_faecium_E1904
M_Enterococcus_faecium_UAA945
M_Enterococcus_faecium_E1050
M_Enterococcus_faecium_U0317
M_Enterococcus_faecium_E3548
M_Enterococcus_faecium_E0045
M_Enterococcus_faecium_VAN_335
M_Enterococcus_faecium_UAA951
M_Enterococcus_faecium_LCT_EF90
M_Enterococcus_faecium_7330381_1
M_Enterococcus_faecium_E2039
M_Enterococcus_faecium_S658_3
M_Enterococcus_faecium_E0680
M_Enterococcus_faecium_E

In [1]:
with open("asv_model_print_output.txt", "r") as models_output:
    asv_model_print_output = models_output.readlines()

In [2]:
broken_model_names = [line.split(" ")[2].strip() for line in asv_model_print_output if "broken" in line]
print(broken_model_names)

['Enterococcus_faecium_E6012', 'Enterococcus_faecium_E6045', 'Enterococcus_faecium_VRE_108', 'Enterococcus_faecium_VRE_84', 'Escherichia_coli_O104_H4_str_11_4632_C5', 'Escherichia_coli_DEC14A', 'Shigella_flexneri_2850_71', 'Escherichia_coli_KTE100', 'Escherichia_coli_KTE205', 'Escherichia_coli_STEC_MHI813', 'Escherichia_coli_O104_H4_str_11_4632_C1', 'Escherichia_coli_ED1a', 'Escherichia_coli_MS_119_7', 'Escherichia_coli_DH1', 'Escherichia_coli_KTE6', 'Escherichia_coli_O111_H_str_11128', 'Escherichia_coli_DEC12B', 'Escherichia_coli_DEC4B', 'Escherichia_coli_KTE26', 'Escherichia_coli_DEC6D', 'Escherichia_coli_DEC11B', 'Escherichia_coli_UM146', 'Escherichia_coli_DEC1A', 'Escherichia_coli_O25b_H4_ST131_str_EC958', 'Escherichia_coli_LF82', 'Escherichia_coli_KTE221', 'Escherichia_coli_KTE39', 'Escherichia_coli_KTE233', 'Escherichia_coli_3003', 'Escherichia_coli_DEC12A', 'Escherichia_coli_KTE209', 'Escherichia_coli_DEC8E', 'Escherichia_coli_O157_H7_str_EC1212', 'Escherichia_coli_2720900', 'Es

In [23]:
from numpy import save, array
save("broken_model_gcfs.npy", array(broken_model_gcfs))

NameError: name 'broken_model_gcfs' is not defined

In [25]:
for modelName in broken_model_names:
    print("\n\n", modelName)
    with open(f'/vol/ml/AGORA2/{modelName}.xml', 'rb') as file:  sbytes=file.read()
    for i in range(len(sbytes)):
        try:
            s = sbytes[i:i + 1].decode('utf-8')
        except Exception as e:
            print(sbytes[i-10:i + 10], end="\t\t")
            sbytes = b"".join(sbytes.split(sbytes[i:i + 1], 1))
            print(sbytes[i-10:i + 10])
    with open(f'/vol/ml/AGORA2/{modelName.strip()}.xml', 'w') as file:  file.write(sbytes.decode("utf-8"))



 Enterococcus_faecium_E6012
b'rafenib-N-\xdf-glucuron'		b'rafenib-N--glucuroni'
b'rafenib-N-\xdf-glucuron'		b'rafenib-N--glucuroni'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'Tolcapone?\xe8 exchange'		b'Tolcapone? exchange"'
b'Tolcapone?\xe8" reversi'		b'Tolcapone?" reversib'


 Enterococcus_faecium_E6045
b'rafenib-N-\xdf-glucuron'		b'rafenib-N--glucuroni'
b'rafenib-N-\xdf-glucuron'		b'rafenib-N--glucuroni'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'Tolcapone?\xe8 exchange'		b'Tolcapone? exchange"'
b'Tolcapone?\xe8" reversi'		b'Tolcapone?" reversib'


 Enterococcus_faecium_VRE_108
b'rafenib-N-\xdf-glucuron'		b'rafenib-N--glucuroni'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'"Tolcapone\xa0" compart'		b'"Tolcapone" compartm'
b'Tolcapone?\xe8 exchange'		b'Tolcapone? exchange"'
b'Tolcapone?\xe8" reversi'		b'Tolcapone?" reversib'


 Enterococcus_faecium_

In [None]:
from cobra.io import read_sbml_model
for modelName in broken_model_names:
    try:
        read_sbml_model(f'/vol/ml/AGORA2/{modelName}.xml')
        print(modelName, "\tgood")
    except:
        print(modelName, "\tbad")

Enterococcus_faecium_E6012 	good
Enterococcus_faecium_E6045 	good
Enterococcus_faecium_VRE_108 	good
Enterococcus_faecium_VRE_84 	good
Escherichia_coli_O104_H4_str_11_4632_C5 	good
Escherichia_coli_DEC14A 	good
Shigella_flexneri_2850_71 	good
Escherichia_coli_KTE100 	good
Escherichia_coli_KTE205 	good
Escherichia_coli_STEC_MHI813 	good
Escherichia_coli_O104_H4_str_11_4632_C1 	good
Escherichia_coli_ED1a 	good
Escherichia_coli_MS_119_7 	good
Escherichia_coli_DH1 	good
Escherichia_coli_KTE6 	good
Escherichia_coli_O111_H_str_11128 	good
Escherichia_coli_DEC12B 	good
Escherichia_coli_DEC4B 	good
Escherichia_coli_KTE26 	good
Escherichia_coli_DEC6D 	good
Escherichia_coli_DEC11B 	good
Escherichia_coli_UM146 	good
Escherichia_coli_DEC1A 	good
Escherichia_coli_O25b_H4_ST131_str_EC958 	good
Escherichia_coli_LF82 	good
Escherichia_coli_KTE221 	good
Escherichia_coli_KTE39 	good
Escherichia_coli_KTE233 	good
Escherichia_coli_3003 	good
Escherichia_coli_DEC12A 	good
Escherichia_coli_KTE209 	good
Esch

In [4]:
%run ../../../andrew_MSpy/modelseedpy/community/mscompatibility.py

from cobra.io import write_sbml_model, read_sbml_model
from cobra import Model, Reaction
from numpy import load as npload
from glob import glob
from json import load
from os import path, environ
import re

# environ["HOME"] = "/home/afreiburger/shared/code/cobrakbase"
import cobrakbase
with open("/home/afreiburger/token") as token_file:
    kbase_api = cobrakbase.KBaseAPI(token_file.readline())
from cobrakbase.core.kbasefba.fbamodel_from_cobra import CobraModelConverter


with open("model_gcf_mapping.json", "r") as jsonIn: model_gcf_mapping = load(jsonIn)
with open("unique_asv_mappings.json", "r") as jsonIn:  unique_asv_mappings = load(jsonIn)
# with open("broken_model_gcfs.npy", "r") as aryIn:  broken_model_gcfs = npload(aryIn)

threshold = .5
for asv, gcfs in unique_asv_mappings.items():
    if not path.exists(f"reaction_counts_per_asv/{asv}.json"):
        print(f"The {asv} has no matching AGORA2 model") ; continue
    with open(f"reaction_counts_per_asv/{asv}.json", "r") as jsonIn:  asv_mappings = load(jsonIn)
    # asv = re.search(r"(?<=asv\/)(.+)(?=\.json)", asv).group()
    asv_name = asv.split(";")[0]
    reactions = [rxnID for rxnID, proportion in asv_mappings.items()] # if proportion > threshold and rxnID != "numGCFs"]
    megaModel = CobraModelConverter(Model(asv_name, f"MegaModel for {asv_name} from {asv_mappings['numGCFs']} GCFs")).build()
    captured_reactions, captured_rxnIDs = set(), set()
    print("\n", asv)
    for gcf in gcfs:
        # if gcf in broken_model_gcfs:  continue
        try:
            model = read_sbml_model("/vol/ml/AGORA2/" + f"{model_gcf_mapping[gcf]}.xml")
            # model = MSCompatibility.standardize(model, exchanges=False)
            print(model.id)
        except Exception as e:
            # print(e)
            print(f"Broken: {model_gcf_mapping[gcf]}") ; continue
        captured_reactions.update([rxn for rxn in model.reactions if rxn.id in reactions and rxn.id not in captured_rxnIDs])
        captured_rxnIDs.update([rxn.id for rxn in model.reactions])
    if captured_reactions == []:
        print(f"No models for {asv_name} are defined.")
        continue
    megaModel.add_reactions(list(captured_reactions))
    for rxn in megaModel.reactions:
        # print(rxn.reaction)
        rxn.probability = asv_mappings[rxn.id]
        # break
    print(len(megaModel.reactions), len(reactions), megaModel.reactions[0].probability)
    # compatibilize the model
    
    # write_sbml_model(megaModel, f"reaction_counts_per_asv/{asv}.xml")
    write_sbml_model(megaModel, f"/vol/ml/AGORA2/MegaModels/{asv_name}.xml")
    # megaModel = read_sbml_model(f"{model_gcf_mapping[gcf]}.xml")
    # megaModel.notes['kbase_genome_ref'] = ''
    # megaModel.notes['kbase_template_refs'] = ''
    # megaModel.notes['kbase_compartment_data_c'] = json.dumps({
    #     'compartmentIndex': 0,
    #     'pH': 7,
    #     'potential': 0,
    # })
    # megaModel.notes['kbase_compartment_data_e'] = json.dumps({
    #     'compartmentIndex': 0,
    #     'pH': 7,
    #     'potential': 0,
    # })
    # kbase_api.save_object(f"{asv_name}_pangenome", 155148, 'KBaseFBA.FBAModel', megaModel, meta=None)


 ASV00001;seqs=10697401;samples=1978
M_Blautia_coccoides_YL58
M_Blautia_sp_YL58
1132 1133 1.0

 ASV00001;seqs=15073372;samples=2336
M_Blautia_coccoides_YL58
M_Blautia_sp_YL58
1132 1133 1.0

 ASV00002;seqs=7303020;samples=1403
M_Enterococcus_faecium_VAN_476
M_Enterococcus_faecium_VRE_110
M_Enterococcus_faecium_7330519_3
M_Enterococcus_faecium_HF50215
M_Enterococcus_faecium_TC_6
M_Enterococcus_faecium_7330446_2
M_Enterococcus_faecium_EnGen0318
M_Enterococcus_faecium_E2620
M_Enterococcus_faecium_E6012
M_Enterococcus_faecium_E1162
M_Enterococcus_faecium_UAA721
M_Enterococcus_faecium_109_A1
M_Enterococcus_faecium_E1904
M_Enterococcus_faecium_UAA945
M_Enterococcus_faecium_E1050
M_Enterococcus_faecium_U0317
M_Enterococcus_faecium_E3548
M_Enterococcus_faecium_E0045
M_Enterococcus_faecium_VAN_335
M_Enterococcus_faecium_UAA951
M_Enterococcus_faecium_LCT_EF90
M_Enterococcus_faecium_7330381_1
M_Enterococcus_faecium_E2039
M_Enterococcus_faecium_S658_3
M_Enterococcus_faecium_E0680
M_Enterococcus_fa

In [7]:
from json import dump
reaction_counts_per_asv = {}
models_count = 0
for model_path in glob("/vol/ml/AGORA2/*.xml"):
    try:
        model = read_sbml_model(model_path)
    except:
        print("broken:\t", model_path.split("/")[-1])
    for rxn in model.reactions:
        if rxn.id in reaction_counts_per_asv:  reaction_counts_per_asv[rxn.id] += 1
        else: reaction_counts_per_asv[rxn.id] = 1
        # TODO storing a list of the rxn objects will save computational effort in the subsequent step
    print(model.id)
    models_count += 1
reaction_counts_per_asv = {rxnID:(count/models_count) for rxnID,count in reaction_counts_per_asv.items()}
reaction_counts_per_asv["numGCFs"] = models_count
with open(f"reaction_counts_per_asv/all_models.json", "w") as jsonOut:
    dump(reaction_counts_per_asv, jsonOut, indent=3)

M_Abiotrophia_defectiva_ATCC_49176
M_Acaricomes_phytoseiuli_DSM_14247
M_Acaryochloris_marina_MBIC11017
M_Acetanaerobacterium_elongatum_CGMCC_1_5012
M_Acetatifactor_muris_GP69
M_Acetivibrio_cellulolyticus_CD2
M_Acetivibrio_ethanolgignens_ACET_33324
M_Acetobacterium_wieringae_DSM_1911
M_Acetonema_longum_DSM_6540
M_Achromobacter_insuavis_AXX_A
M_Achromobacter_xylosoxidans_A8
M_Achromobacter_xylosoxidans_ERR2221244
M_Achromobacter_xylosoxidans_ERR2221245
M_Achromobacter_xylosoxidans_ERR2221246
M_Achromobacter_xylosoxidans_ERR2221247
M_Achromobacter_xylosoxidans_NBRC_15126
M_Achromobacter_xylosoxidans_NCTC10807
M_Acidaminobacter_hydrogenoformans_DSM_2784
M_Acidaminococcus_fermentans_DSM_20731
M_Acidaminococcus_intestini_RyC_MR95
M_Acidaminococcus_massiliensis_Marseille_P2828
M_Acidaminococcus_sp_BV3L6
M_Acidaminococcus_sp_D21
M_Acidaminococcus_sp_HPA0509
M_Acidobacterium_ailaaui_PMMR2
M_Acidobacterium_capsulatum_ATCC_51196
M_Acidovorax_caeni_R_24608
M_Acinetobacter_baumannii_1656_2
M_Acinet

NameError: name 'dump' is not defined

In [10]:
set1 = set([1,2,3,4])
set2 = set([3,4,5])
set1 -= set2
print(set1)

{1, 2}


In [8]:
from json import dump
with open(f"reaction_counts_per_asv/all_models.json", "w") as jsonOut:
    dump(reaction_counts_per_asv, jsonOut, indent=3)

In [1]:
from json import load
with open(f"reaction_counts_per_asv/all_models.json", "r") as jsonIn:
    reaction_counts_per_asv = load(jsonIn)
    
import cobrakbase
with open("/home/afreiburger/token") as token_file:
    kbase_api = cobrakbase.KBaseAPI(token_file.readline())
from cobrakbase.core.kbasefba.fbamodel_from_cobra import CobraModelConverter

from cobra.io import write_sbml_model, read_sbml_model
from cobra import Model, Reaction
from glob import glob
    
reactions = [rxnID for rxnID, proportion in reaction_counts_per_asv.items()] # if proportion > threshold and rxnID != "numGCFs"]
megaModel = CobraModelConverter(Model("AGORA2_model", f"MegaModel from all AGORA2 models")).build()
remaining_rxnIDs = set(list(reaction_counts_per_asv.keys()))
captured_reactions, captured_rxnIDs = [], set()
for model_path in glob("/vol/ml/AGORA2/*.xml"):
    if remaining_rxnIDs == set():  continue
    try:
        model = read_sbml_model(model_path)
    except Exception as e:
        print("broken:\t", model_path.split("/")[-1])
        continue
    captured_reactions.extend([rxn for rxn in model.reactions if rxn.id not in captured_rxnIDs])
    captured_rxnIDs.update([rxn.id for rxn in model.reactions])
    remaining_rxnIDs -= captured_rxnIDs
megaModel.add_reactions(list(captured_reactions))
for rxn in megaModel.reactions:
    rxn.probability = reaction_counts_per_asv[rxn.id]
print(len(megaModel.reactions), len(reactions), megaModel.reactions[0].probability)
write_sbml_model(megaModel, f"/vol/ml/AGORA2/MegaModels/all_models.xml")

modelseedpy 0.2.2
cobrakbase 0.3.1
broken:	 Something went wrong reading the SBML model. Most likely the SBML model is not valid. Please check that your model is valid using the `cobra.io.sbml.validate_sbml_model` function or via the online validator at https://sbml.org/validator_servlet/ .
	`(model, errors) = validate_sbml_model(filename)`
If the model is valid and cannot be read please open an issue at https://github.com/opencobra/cobrapy/issues . 	 Acinetobacter_pittii_ANC_4050.xml
broken:	 Something went wrong reading the SBML model. Most likely the SBML model is not valid. Please check that your model is valid using the `cobra.io.sbml.validate_sbml_model` function or via the online validator at https://sbml.org/validator_servlet/ .
	`(model, errors) = validate_sbml_model(filename)`
If the model is valid and cannot be read please open an issue at https://github.com/opencobra/cobrapy/issues . 	 Aeromicrobium_massiliense_JC14.xml
broken:	 Something went wrong reading the SBML model. 

In [None]:
from cobra.io import write_sbml_model
write_sbml_model(megaModel, f"/vol/ml/AGORA2/MegaModels/all_models.xml")

In [None]:
from cobra.io.sbml import validate_sbml_model
print(validate_sbml_model("/vol/ml/AGORA2/Shigella_sonnei_Ss046.xml"))

In [None]:
%run ../../../andrew_MSpy/modelseedpy/community/mscompatibility.py

model = read_sbml_model("/vol/ml/AGORA2/M_Enterococcus_faecium_UAA720.xml")
standardized_model = MSCompatibility.standardize(model)

