# Important links
* https://github.com/ModelSEED/MicrobiomeNotebooks.git
* Workspace with Cliff's MAGs: https://narrative.kbase.us/narrative/155805
* Workspace with additional redundant MAGs: https://narrative.kbase.us/narrative/163264
* FAA of representative sequences: /scratch/fliu/data/cliff/mmseqs_ani_prob_rep_genome.faa  
* Cluster genome mapping: /scratch/fliu/data/cliff/mmseqs_ani_prob.json  

# Loading hit data from json file

In [1]:
%run cliffcommutil.py
with open("/scratch/fliu/data/cliff/mmseqs_ani_prob_v2.json", 'r') as f:
    hit_data = json.load(f)

python version 3.11.1
KBBaseModules 0.0.1
modelseedpy 0.3.3
cobrakbase 0.3.1
Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


# Parsing hits for close genome list and establishing thresholds

In [2]:
%run cliffcommutil.py
mag_ani_data = {}
mag_thresholds = {}
mag_hit_distribution = {}
for protein in hit_data:
    for mag in hit_data[protein]:
        if mag not in mag_ani_data:
            mag_ani_data[mag] = {}
            mag_hit_distribution[mag] = [0] * 21
        for hit in hit_data[protein][mag]:
            genome = hit.split(":")[0]
            if hit_data[protein][mag][hit] < 100:
                if genome not in mag_ani_data[mag]:
                    mag_hit_distribution[mag][int(hit_data[protein][mag][hit]-80)] += 1
                mag_ani_data[mag][genome] = hit_data[protein][mag][hit]
util.save("mag_ani_data",mag_ani_data)
util.save("mag_hit_distribution",mag_hit_distribution)
records = []
for mag in mag_hit_distribution:
    if mag not in mag_thresholds:
        mag_thresholds[mag] = {"threshold":0,"threshold_count":0}
    record = {"mag":mag}
    threshold = 0
    for i in range(21):
        record["ani_" + str(80+i)] = mag_hit_distribution[mag][i]
    threshold_count = 0
    for i in range(21):
        index = 20-i  
        if mag_hit_distribution[mag][index] > 0 and (threshold == 0 or threshold == 80+index+1):
            threshold_count += mag_hit_distribution[mag][index]
            threshold = 80+index
    mag_thresholds[mag]["threshold"] = threshold
    mag_thresholds[mag]["threshold_count"] = threshold_count
    if threshold_count == 1:
        new_threshold = 0
        additional_count = 0
        start = 21-(threshold-80)
        for i in range(start,21,1):
            index = 20-i
            if mag_hit_distribution[mag][index] > 0 and (new_threshold == 0 or new_threshold == 80+index+1):
                additional_count += mag_hit_distribution[mag][index]
                new_threshold = 80+index
        if new_threshold != 0:
            record["new_threshold"] = new_threshold
            record["new_threshold_count"] = additional_count
            if (threshold-new_threshold)<=10 or threshold != 100:
                mag_thresholds[mag]["threshold"] = new_threshold
                mag_thresholds[mag]["threshold_count"] = threshold_count+additional_count
    record["threshold"] = threshold
    record["threshold_count"] = threshold_count
    records.append(record)
df = pd.DataFrame.from_records(records)
df.to_csv(util.output_dir+"/mag_hit_distribution.csv",index=False)
util.save("mag_thresholds",mag_thresholds)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


# Getting workspace IDs for MAGs

In [8]:
%run cliffcommutil.py
mag_wsids = {}
mag_list = []
mags = util.msrecon.kbase_api.list_objects(155805, object_type="KBaseGenomes.Genome", include_metadata=False)
mag_list = mags
for item in mags:
    mag_wsids[item[1]] = item[7]
mags = util.msrecon.kbase_api.list_objects(163264, object_type="KBaseGenomes.Genome", include_metadata=False)
for item in mags:
    mag_wsids[item[1]] = item[7]
util.save("mag_wsids",mag_wsids)
util.save("mag_list",mag_list)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


# Identifying supplemental proteins for MAGs based on thresholds and hits

In [5]:
%run cliffcommutil.py
mag_protein_supplements = {}
mag_thresholds = util.load("mag_thresholds")
mag_list = util.load("mag_list")
mag_wsids = util.load("mag_wsids")
all_mag_deep_data = {}
for mag in mag_wsids:
    all_mag_deep_data[mag] = {}
for protein in hit_data:
    for mag in hit_data[protein]:
        nr_mags_found = {}
        rejected_mags_found = {}
        genomes_found = {}
        all_mag_deep_data[mag][protein] = {
            "self":False,
            "total":[0,0],
            "nr_mags":[0,0],
            "rejected_mags":[0,0],
            "genomes":[0,0]
        }
        for hit in hit_data[protein][mag]:
            (genome,gene) = hit.split(":")
            if genome == "self":
                all_mag_deep_data[mag][protein]["self"] = True
            elif genome in mag_list:
                if genome not in nr_mags_found:
                    all_mag_deep_data[mag][protein]["total"][0] += 1
                    all_mag_deep_data[mag][protein]["nr_mags"][0] += 1
                    if hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_mag_deep_data[mag][protein]["total"][1] += 1
                        all_mag_deep_data[mag][protein]["nr_mags"][1] += 1
                    nr_mags_found[genome] = hit_data[protein][mag][hit]
                elif nr_mags_found[genome] != hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
            elif genome in mag_wsids:
                if genome not in rejected_mags_found:
                    all_mag_deep_data[mag][protein]["total"][0] += 1
                    all_mag_deep_data[mag][protein]["rejected_mags"][0] += 1
                    if hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_mag_deep_data[mag][protein]["total"][1] += 1
                        all_mag_deep_data[mag][protein]["rejected_mags"][1] += 1
                    rejected_mags_found[genome] = hit_data[protein][mag][hit]
                elif rejected_mags_found[genome] != hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
            else:
                if genome not in genomes_found:
                    all_mag_deep_data[mag][protein]["total"][0] += 1
                    all_mag_deep_data[mag][protein]["genomes"][0] += 1
                    if hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_mag_deep_data[mag][protein]["total"][1] += 1
                        all_mag_deep_data[mag][protein]["genomes"][1] += 1
                    genomes_found[genome] = hit_data[protein][mag][hit]
                elif genomes_found[genome] != hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
        #We only add supplemental proteins for a family that does not already have a protein in the MAG
        if not all_mag_deep_data[mag][protein]["self"]:
            if mag not in mag_protein_supplements:
                mag_protein_supplements[mag] = {}
            mag_protein_supplements[mag][protein] = [all_mag_deep_data[mag][protein]["total"][1],all_mag_deep_data[mag][protein]["nr_mags"][1],all_mag_deep_data[mag][protein]["rejected_mags"][1]]
for mag in all_mag_deep_data:
    util.save("deepdata/"+mag,all_mag_deep_data[mag])
util.save("mag_protein_supplements",mag_protein_supplements)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


In [10]:
mag_protein_supplements = util.load("mag_protein_supplements")
print(len(mag_protein_supplements.keys()))

271


In [11]:
mag_function_supplements = util.load("mag_function_supplements")
print(len(mag_function_supplements.keys()))

234


# Load functional data

In [8]:
%run cliffcommutil.py
function_hit_data = json.load(open('/home/fliu/cliff_mags/data/annotation_ani_prob_lo_85.json'))
mag_function_supplements = {}
mag_thresholds = util.load("mag_thresholds")
mag_list = util.load("mag_list")
mag_wsids = util.load("mag_wsids")
all_func_mag_deep_data = {}
for mag in mag_wsids:
    all_func_mag_deep_data[mag] = {}
for protein in function_hit_data:
    for mag in function_hit_data[protein]:
        nr_mags_found = {}
        rejected_mags_found = {}
        genomes_found = {}
        all_func_mag_deep_data[mag][protein] = {
            "self":False,
            "total":[0,0],
            "nr_mags":[0,0],
            "rejected_mags":[0,0],
            "genomes":[0,0]
        }
        for hit in function_hit_data[protein][mag]:
            (genome,gene) = hit.split(":")
            if genome == "self":
                all_func_mag_deep_data[mag][protein]["self"] = True
            elif genome in mag_list:
                if genome not in nr_mags_found:
                    all_func_mag_deep_data[mag][protein]["total"][0] += 1
                    all_func_mag_deep_data[mag][protein]["nr_mags"][0] += 1
                    if function_hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_func_mag_deep_data[mag][protein]["total"][1] += 1
                        all_func_mag_deep_data[mag][protein]["nr_mags"][1] += 1
                    nr_mags_found[genome] = function_hit_data[protein][mag][hit]
                elif nr_mags_found[genome] != function_hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
            elif genome in mag_wsids:
                if genome not in rejected_mags_found:
                    all_func_mag_deep_data[mag][protein]["total"][0] += 1
                    all_func_mag_deep_data[mag][protein]["rejected_mags"][0] += 1
                    if function_hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_func_mag_deep_data[mag][protein]["total"][1] += 1
                        all_func_mag_deep_data[mag][protein]["rejected_mags"][1] += 1
                    rejected_mags_found[genome] = function_hit_data[protein][mag][hit]
                elif rejected_mags_found[genome] != function_hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
            else:
                if genome not in genomes_found:
                    all_func_mag_deep_data[mag][protein]["total"][0] += 1
                    all_func_mag_deep_data[mag][protein]["genomes"][0] += 1
                    if function_hit_data[protein][mag][hit] >= mag_thresholds[mag]["threshold"]/100:
                        all_func_mag_deep_data[mag][protein]["total"][1] += 1
                        all_func_mag_deep_data[mag][protein]["genomes"][1] += 1
                    genomes_found[genome] = function_hit_data[protein][mag][hit]
                elif genomes_found[genome] != function_hit_data[protein][mag][hit]:
                    print("Warning: different similarity values for the same genome")
        #We only add supplemental proteins for a family that does not already have a protein in the MAG
        if not all_func_mag_deep_data[mag][protein]["self"]:
            if mag not in mag_function_supplements:
                mag_function_supplements[mag] = {}
            mag_function_supplements[mag][protein] = [all_func_mag_deep_data[mag][protein]["total"][1],all_func_mag_deep_data[mag][protein]["nr_mags"][1],all_func_mag_deep_data[mag][protein]["rejected_mags"][1]]
for mag in mag_function_supplements:
    util.save("funcdeepdata/"+mag,all_func_mag_deep_data[mag])
util.save("mag_function_supplements",mag_function_supplements)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


# Computing stats on additional proteins

In [27]:
#%run cliffcommutil.py
#mag_list = util.load("mag_list")
#mag_thresholds = util.load("mag_thresholds")
#mag_protein_supplements = util.load("mag_protein_supplements")
records = []
mag_probability_threshold = {}
for item in mag_list:
    if item[1] in mag_protein_supplements:
        count = mag_thresholds[item[1]]["threshold_count"]
        record = {"mag":item[1]}
        records.append(record)
        abundance_count = [0] * 400
        for protein in mag_protein_supplements[item[1]]:
            fraction = mag_protein_supplements[item[1]][protein][0]/count
            entry = int(fraction*100/5)
            abundance_count[entry] += 1
        total = 0
        final_total = None
        mag_probability_threshold[item[1]] = 0.1
        for (i,entry) in enumerate(abundance_count):
            index = len(abundance_count)-i-1
            lasttotal = total
            total += abundance_count[index]
            if total >= 20000 and final_total == None:
                threshold = 5*(index+1)/100
                if threshold >= 0.1:
                    mag_probability_threshold[item[1]] = threshold
                    final_total = lasttotal
            if index == 2 and final_total == None:
                final_total = total
        print(item[1],mag_probability_threshold[item[1]],final_total)
        for (i,entry) in enumerate(abundance_count):
            record[i] = entry
util.save("mag_probability_threshold",mag_probability_threshold)
df = pd.DataFrame.from_records(records)
df.to_csv(util.output_dir+"/protein_count_abundance.csv",index=False)

Salt_Pond_MetaGSF2_B_H2O_MG_DASTool_bins_metabat.8.contigs__.RAST 0.1 14737
Salt_Pond_MetaG_R1_B_D1_MG_DASTool_bins_metabat.20.contigs__.RAST 0.1 3860
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_concoct_out.4.contigs__.RAST 0.1 4004
Salt_Pond_MetaG_R1_A_D2_MG_DASTool_bins_concoct_out.21.contigs__.RAST 0.1 4192
Salt_Pond_MetaG_R1_C_D1_MG_DASTool_bins_metabat.31.contigs__.RAST 0.1 5106
Salt_Pond_MetaG_R2_A_H2O_MG_DASTool_bins_concoct_out.29.contigs__.RAST 0.1 3670
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_metabat.50.contigs__.RAST 0.1 4271
Salt_Pond_MetaG_R1_B_D2_MG_DASTool_bins_metabat.18.contigs__.RAST 0.1 4144
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.metabat.15.contigs__.RAST 0.1 1457
Salt_Pond_MetaG_R2_C_D1_MG_DASTool_bins_concoct_out.88.contigs__.RAST 0.1 2330
Salt_Pond_MetaG_R2_restored_C_black_MG_DASTool_bins_concoct_out.57.contigs__.RAST 0.1 1738
Salt_Pond_MetaG_R2_A_D1_MG_DASTool_bins_metabat.10.contigs__.RAST 0.1 2713
Salt_Pond_MetaGSF2_B_D1_MG_DASTool_bins_concoct_out.44.contigs__.RA

# Loading protein sequence data

In [14]:
%run cliffcommutil.py
from Bio import SeqIO
protein_hash = {}
for record in SeqIO.parse('/scratch/fliu/data/cliff/mmseqs_ani_prob_rep_genome_v2.faa', 'fasta'):
    protein_hash[record.id] = record.seq

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


# Building genomes and assemblies

In [28]:
%run cliffcommutil.py
mag_list = util.load("mag_list")
mag_probability_threshold = util.load("mag_probability_threshold")
#mag_protein_supplements = util.load("mag_protein_supplements")
for item in mag_list:
    full_data = util.get_object(item[1],item[7])
    genome_obj = full_data["data"]
    genome_obj["assembly_ref"] = str(item[6])+"/"+str(item[0])+"/"+str(item[4])+";"+genome_obj["assembly_ref"]
    firstgene = genome_obj["features"][0]
    if item[1] in mag_protein_supplements:
        count = 1
        total_genomes = mag_thresholds[item[1]]["threshold_count"]
        for protein in mag_protein_supplements[item[1]]:
            if mag_protein_supplements[item[1]][protein][0]/total_genomes >= mag_probability_threshold[item[1]]:
                ftrid = 'pangenome.'+str(count)
                count += 1
                protseq = str(protein_hash[protein])
                dnaseq = util.translate_protein_to_gene(protseq)
                result = hashlib.md5(protseq.encode())
                md5 = result.hexdigest()
                result = hashlib.md5(dnaseq.encode())
                dnamd5 = result.hexdigest()
                newftr = {
                    "aliases": [["MMseqMD5",protein]],
                    "cdss": [
                        ftrid+".CDS"
                    ],
                    "functions":["Hypothetical protein"],
                    "dna_sequence": dnaseq,
                    "dna_sequence_length": 3*len(protseq),
                    "id": ftrid,
                    "location": [
                        [
                            firstgene["location"][0][0],
                            1,
                            "+",
                            3*len(protseq)
                        ]
                    ],
                    "md5": dnamd5,
                    "ontology_terms": {},
                    "protein_md5": md5,
                    "protein_translation": protseq,
                    "protein_translation_length": len(protseq),
                    "warnings": []
                }
                cdsftr = newftr.copy()
                del cdsftr["cdss"]
                cdsftr["id"] = ftrid+".CDS"
                cdsftr["parent_gene"] = ftrid
                genome_obj["features"].append(newftr)
                genome_obj["cdss"].append(cdsftr)
    print(item[1],count)
    #Saving MAG
    util.save("genome/"+item[1],full_data)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/
Salt_Pond_MetaGSF2_B_H2O_MG_DASTool_bins_metabat.8.contigs__.RAST 14738
Salt_Pond_MetaG_R1_B_D1_MG_DASTool_bins_metabat.20.contigs__.RAST 3861
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_concoct_out.4.contigs__.RAST 4005
Salt_Pond_MetaG_R2_restored_DShore_MG_DASTool_bins_concoct_out.46.contigs__.RAST 4005
Salt_Pond_MetaG_R1_A_D2_MG_DASTool_bins_concoct_out.21.contigs__.RAST 4193
Salt_Pond_MetaG_R1_C_D1_MG_DASTool_bins_metabat.31.contigs__.RAST 5107
Salt_Pond_MetaG_R2_A_H2O_MG_DASTool_bins_concoct_out.29.contigs__.RAST 3671
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_metabat.50.contigs__.RAST 4272
Salt_Pond_MetaG_R1_B_D2_MG_DASTool_bins_metabat.18.contigs__.RAST 4145
Salt_Pond_MetaGSF2_C_D2_MG_DASTool_bins_concoct_out.32.contigs__.RAST 4145
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.metabat.15.contigs__.RAST 1458
Salt_Po

# Loading genomes to KBase

In [29]:
%run cliffcommutil.py
mag_list = util.load("mag_list")
#mag_protein_supplements = util.load("mag_protein_supplements")
from datetime import datetime
now = datetime.now()
timestamp = datetime.timestamp(now)
workspace = 186698
anno = util.anno_client()
anno.clients["GenomeFileUtil"] = util.gfu_client()
done = []#["Lactobacillus.3","Clostridium.11","Muribaculum.1"]#This genome actually failed...
finished = util.load("finished_genomes",[])
for item in mag_list:
    asvname = item[1]
    if asvname not in done and asvname not in finished:
        data = util.load("genome/"+asvname)
        genome = data["data"]
        count = 0
        for ftr in genome["features"]:
            if ftr["id"][0:9] == "pangenome":
                count += 1
        print(asvname,count)
        util.save_ws_object(asvname+".pangenome",186698,genome,"KBaseGenomes.Genome")
        finished.append(asvname)
        util.save("finished_genomes",finished)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/
Salt_Pond_MetaGSF2_B_H2O_MG_DASTool_bins_metabat.8.contigs__.RAST 14737
Salt_Pond_MetaG_R1_B_D1_MG_DASTool_bins_metabat.20.contigs__.RAST 3860
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_concoct_out.4.contigs__.RAST 4004
Salt_Pond_MetaG_R2_restored_DShore_MG_DASTool_bins_concoct_out.46.contigs__.RAST 0
Salt_Pond_MetaG_R1_A_D2_MG_DASTool_bins_concoct_out.21.contigs__.RAST 4192
Salt_Pond_MetaG_R1_C_D1_MG_DASTool_bins_metabat.31.contigs__.RAST 5106
Salt_Pond_MetaG_R2_A_H2O_MG_DASTool_bins_concoct_out.29.contigs__.RAST 3670
Salt_Pond_MetaG_R2_B_D2_MG_DASTool_bins_metabat.50.contigs__.RAST 4271
Salt_Pond_MetaG_R1_B_D2_MG_DASTool_bins_metabat.18.contigs__.RAST 4144
Salt_Pond_MetaGSF2_C_D2_MG_DASTool_bins_concoct_out.32.contigs__.RAST 0
Salt_Pond_MetaG_R1_A_D1_MG_DASTool_bins.metabat.15.contigs__.RAST 1457
Salt_Pond_Met

# Printing feature probabilities

In [None]:
%run cliffcommutil.py
mag_list = util.load("mag_list")
mag_thresholds = util.load("mag_thresholds")
mag_protein_supplements = util.load("mag_protein_supplements")
feature_probabilities = {}
for item in mag_list:
    data = util.load("genome/"+item[1])
    feature_probabilities[item[1]] = {}
    ftrs = data["data"]["features"]
    total_genomes = mag_thresholds[item[1]]["threshold_count"]
    for ftr in ftrs:
        if ftr["id"][0:9] == "pangenome":
            feature_probabilities[item[1]][ftr["id"]] = mag_protein_supplements[item[1]][ftr["aliases"][0][1]][0]/total_genomes
            if feature_probabilities[item[1]][ftr["id"]] > 1:
                feature_probabilities[item[1]][ftr["id"]] = 1
        else:
            feature_probabilities[item[1]][ftr["id"]] = 1
util.save("feature_probabilities",feature_probabilities)

# Processing metabolite data

In [1]:
%run cliffcommutil.py
metabolite_hash = {"2'-Deoxyuridine": 'cpd00412',
 '2-Oxoglutarate': 'cpd00024',
 '2-Oxoisocaproate': 'cpd00200',
 '3-Hydroxybutyrate': 'cpd29193',
 '3-Hydroxyisovalerate': 'cpd02569',
 '3-Methyl-2-oxovalerate': 'cpd03737',
 '4-Aminobutyrate': 'cpd00281',
 'Acetate': 'cpd00029',
 'Acetone': 'cpd00178',
 'Alanine': 'cpd00035',
 'Arginine': 'cpd00051',
 'Aspartate': 'cpd00041',
 'Benzoate': 'cpd00153',
 'Betaine': 'cpd00540',
 'Dimethylamine': 'cpd00425',
 'Ethanol': 'cpd00363',
 'Formate': 'cpd00047',
 'Fructose': 'cpd01184',
 'Fumarate': 'cpd00106',
 'Glucose': 'cpd00027',
 'Glutamate': 'cpd00023',
 'Glycerol': 'cpd00100',
 'Isobutyrate': 'cpd01711',
 'Isoleucine': 'cpd00322',
 'Isopropanol': 'cpd01269',
 'Isovalerate': 'cpd05178',
 'Lactate': 'cpd00159',
 'Leucine': 'cpd00107',
 'Maltose': 'cpd00179',
 'Methanol': 'cpd00116',
 'Methionine': 'cpd00060',
 'Methylamine': 'cpd00187',
 'Methylguanidine': 'cpd01544',
 'N,N-Dimethylglycine': 'cpd00756',
 'Phenylacetate': 'cpd19069',
 'Phenylalanine': 'cpd00066',
 'Proline': 'cpd00129',
 'Propionate': 'cpd00141',
 'Propylene glycol': 'cpd00453',
 'Pyroglutamate': 'cpd01293',
 'Succinate': 'cpd00036',
 'Sucrose': 'cpd00076',
 'Thymidine': 'cpd00184',
 'Trehalose': 'cpd00794',
 'Trimethylamine': 'cpd00441',
 'Tryptophan': 'cpd00065',
 'Tyrosine': 'cpd00069',
 'Uracil': 'cpd00092',
 'Uridine': 'cpd00249',
 'Valine': 'cpd00156'}
metabolite_names = {}
metabolites = []
for item in metabolite_hash:
    metabolite_names[metabolite_hash[item]] = item
    metabolites.append(metabolite_hash[item])
util.save("metabolite_names",metabolite_names)
util.save("metabolites",metabolites)

python version 3.9.19
KBBaseModules 0.0.1


1721792105.8129761 INFO: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
1721792105.813834 INFO: NumExpr defaulting to 8 threads.


modelseedpy 0.3.3
cobrakbase 0.3.1
Output files printed to:/Users/chenry/code/notebooks/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir
ModelSEED: /Users/chenry/code//kb_sdk/run_local/workdir/tmp/


# Computing SMIPPS

In [38]:
%run cliffcommutil.py
model_ws = 181152
model_suffix = ".mdl"
procid = 0
mag_list = util.load("mag_list")
metabolites = util.load("metabolites")
feature_probabilities = util.load("feature_probabilities")
gf_phenotype_results = util.load("new_gf_phenotype_results_"+str(procid),{})
probability_finished = util.load("probability_finished_"+str(procid),[])
problemlist = util.load("problemlist",[])
auxo_media = util.msrecon.get_media("181152/AuxoMedia")
gmm_base_media = util.msrecon.get_media("181152/BaseAerobicMM")
uptake_phenoset = util.create_phenotypeset_from_compounds(
    metabolites,
    base_media=auxo_media,
    base_uptake=0,
    base_excretion=1000,
    global_atom_limits={},
    type="uptake"
)
excretion_phenoset = util.create_phenotypeset_from_compounds(
    metabolites,
    base_media=auxo_media,
    base_uptake=0,
    base_excretion=1000,
    global_atom_limits={},
    type="excretion"
)
growth_phenoset = util.create_phenotypeset_from_compounds(
    metabolites,
    base_media=gmm_base_media,
    base_uptake=0,
    base_excretion=1000,
    global_atom_limits={},
    type="growth"
)
phenosets = {"uptake":uptake_phenoset,"excretion":excretion_phenoset,"growth":growth_phenoset}
for i,item in enumerate(mag_list):
    asvname = item[1]
    if i%8 == procid:
        if asvname not in probability_finished and asvname not in problemlist:
            mdlutl = util.msrecon.get_model(item[1]+model_suffix,model_ws)
            reaction_probabilities = {}
            for rxn in mdlutl.model.reactions:
                highest_prob = None
                for gene in rxn.genes:
                    if gene.id in feature_probabilities[item[1]]:
                        if highest_prob == None or feature_probabilities[item[1]][gene.id] > highest_prob:
                            highest_prob = feature_probabilities[item[1]][gene.id]
                if highest_prob != None:
                    rxn.probability = highest_prob
                    reaction_probabilities[rxn.id] = highest_prob
            
            reaction_hash = mdlutl.annoont.get_reaction_gene_hash(feature_type="gene")
            for rxn in reaction_hash:
                highest_prob = None
                for gene in reaction_hash[rxn]:
                    if gene in feature_probabilities[item[1]]:
                        if highest_prob == None or feature_probabilities[item[1]][gene] > highest_prob:
                            highest_prob = feature_probabilities[item[1]][gene]
                if highest_prob != None and highest_prob >= reaction_probabilities[rxn]:
                    mdlutl.model.reactions.get_by_id(rxn).probability = highest_prob
                    reaction_probabilities[rxn] = highest_prob
            
            filters = mdlutl.get_attributes("gf_filter")
            tests = mdlutl.get_atp_tests(core_template=util.msrecon.core_template,atp_media_filename=util.msrecon.module_dir+"/data/atp_medias.tsv",recompute=False)
            msgapfill = MSGapfill(
                mdlutl,
                [util.msrecon.get_template(mdlutl.model.template_ref)],
                [],
                tests,
                blacklist=[],
                default_target="bio1",
                minimum_obj=0.01,
                base_media=None,
                base_media_target_element=None
            )

            #Adding missing transporter for metabolites to gapfilling database
            for cpd in metabolites:
                if "EX_"+cpd+"_e0" not in msgapfill.gfmodelutl.model.reactions:
                    transport = msgapfill.gfmodelutl.add_transport_and_exchange_for_metabolite(cpd,direction="=",prefix="trans",override=False)

            coefficients = {}
            gf_penalties = msgapfill.gfpkgmgr.getpkg("GapfillingPkg").gapfilling_penalties
            gfrxn = 0
            probrxn = 0
            otherrxn = 0
            for reaction in msgapfill.gfmodelutl.model.reactions:
                if reaction.id in reaction_probabilities:
                    probrxn += 2
                    coefficients[">"+reaction.id] = 1-reaction_probabilities[reaction.id]
                    coefficients["<"+reaction.id] = 1-reaction_probabilities[reaction.id]
                elif reaction.id in gf_penalties:
                    if "forward" in gf_penalties[reaction.id]:
                        gfrxn += 1
                        coefficients[">"+reaction.id] = 1+gf_penalties[reaction.id]["forward"]
                    else:
                        otherrxn += 1
                        coefficients[">"+reaction.id] = 0.95
                    if "reverse" in gf_penalties[reaction.id]:
                        gfrxn += 1
                        coefficients["<"+reaction.id] = 1+gf_penalties[reaction.id]["reverse"]
                    else:
                        otherrxn += 1
                        coefficients["<"+reaction.id] = 0.95
                else:
                    otherrxn += 2
                    coefficients[">"+reaction.id] = 0.95
                    coefficients["<"+reaction.id] = 0.95
            print(asvname,"GF:",gfrxn,"Prob:",probrxn,"Other:",otherrxn)

            msgapfill.prefilter(test_conditions=tests,growth_conditions=[],use_prior_filtering=True,base_filter_only=True)
            
            gf_phenotype_results[asvname] = {}
            for phenoid in phenosets:
                gf_phenotype_results[asvname][phenoid] = {}
                output = phenosets[phenoid].simulate_phenotypes(
                    msgapfill.gfmodelutl,
                    multiplier=2,
                    add_missing_exchanges=True,
                    save_fluxes=False,
                    save_reaction_list=True,
                    gapfill_negatives=False,
                    msgapfill=None,
                    test_conditions=None,
                    ignore_experimental_data=True,
                    flux_coefficients=coefficients
                )
                for index, row in output["details"].iterrows():
                    if "reactions" in output["data"][row["Phenotype"]]:
                        output["data"][row["Phenotype"]]["average_probability"] = 0
                        for rxn in output["data"][row["Phenotype"]]["reactions"]:
                            direction = rxn[0:1]
                            rxnid = rxn[1:]
                            if direction == ">":
                                if rxnid not in gf_penalties or "forward" not in gf_penalties[rxnid]:
                                    if rxnid in reaction_probabilities:
                                        output["data"][row["Phenotype"]]["average_probability"] += reaction_probabilities[rxnid]
                                    else:
                                        output["data"][row["Phenotype"]]["average_probability"] += 0.05
                            elif direction == "<":
                                if rxnid not in gf_penalties or "reverse" not in gf_penalties[rxnid]:
                                    if rxnid in reaction_probabilities:
                                        output["data"][row["Phenotype"]]["average_probability"] += reaction_probabilities[rxnid]
                                    else:
                                        output["data"][row["Phenotype"]]["average_probability"] += 0.05
                        output["data"][row["Phenotype"]]["average_probability"] = output["data"][row["Phenotype"]]["average_probability"]/len(output["data"][row["Phenotype"]]["reactions"])
                    gf_phenotype_results[asvname][phenoid][row["Phenotype"]] = output["data"][row["Phenotype"]]
            probability_finished.append(asvname)
            util.save("new_gf_phenotype_results_"+str(procid),gf_phenotype_results)
            util.save("probability_finished_"+str(procid),probability_finished)

Output files printed to:/scratch/chenry/MicrobiomeNotebooks/Cliff/nboutput when using KBDevUtils.output_dir


1721789912.3593426 ERROR: Requested data metabolites doesn't exist at /scratch/chenry/MicrobiomeNotebooks/Cliff/datacache/metabolites.json


ModelSEED: /scratch/shared//sdkmount/kb_sdk_home/run_local/workdir/tmp/


ValueError: Requested data metabolites doesn't exist at /scratch/chenry/MicrobiomeNotebooks/Cliff/datacache/metabolites.json