In [9]:
import sys
import re
import cobra
import cplex
import json
from configparser import ConfigParser
# Read notebook settings from config.cfg and put the directories listed under
# [script] syspaths on the import path before `import cobrakbase` runs.
config = ConfigParser()
config.read("config.cfg")
# "syspaths" is a single ';'-separated string of directories.
paths = config.get("script", "syspaths").split(";")
sys.path.extend(paths)
import cobrakbase


In [10]:
# Literal table pairing observed and predicted modifications.  Each row is
#   [experimental mass shift, predicted mass shift,
#    ';'-separated experimental operator names,
#    ';'-separated predicted operator names]
# The first value keys into `modifications`/`modfract`, the second into
# `predicted_mods`, when the summary cell walks this table.
mapped_rules = [
    [
        -18.01, -18.02,
        "Carbamidomethylation",
        "GammaLactamFormation;EpsilonLactamFormation",
    ],
    [
        -17.03, -17.03,
        "Carbamidomethylation",
        "GammaLactamFormation;EpsilonLactamFormation",
    ],
    [
        0.98, 0.98,
        "Deamidation (R)",
        "ImineDeamination",
    ],
    [
        13.03, 14.03,
        "Michael addition with methylamine",
        "AmineMethylation",
    ],
    [
        14.02, 14.03,
        (
            "Methylation(others);Methylation(KR);"
            "Methylation(Protein N-term);Carbamidomethylation"
        ),
        "AmineMethylation",
    ],
    [
        26.02, 26.04,
        "Acetaldehyde +26",
        "SchiffBaseFormation_Acetaldehyde",
    ],
    [
        42.01, 42.04,
        (
            "Acetylation (N-term);Acetylation (K);"
            "Acetylation (Protein N-term)"
        ),
        "NucleophileAcylation_AcetylCoa",
    ],
    [
        43.01, 43.02,
        "Carbamylation",
        "Carbamylation_Urea;Carbamylation_CarbamoylPhosphate",
    ],
    [
        43.99, 43.02,
        (
            "Carboxylation (E);Carbamidomethylation;"
            "Carboxylation (DKW);Carbamidomethylation (DHKE  X@N-term)"
        ),
        "Carbamylation_Urea;Carbamylation_CarbamoylPhosphate",
    ],
    [
        54.01, 54.05,
        "MDA adduct +54",
        "EnaminoneFormation_Malondialdehyde",
    ],
    [
        54.01, 54.05,
        "Methylglyoxal-derived hydroimidazolone",
        (
            "SchiffBaseFormation_MGlyoxal;"
            "NucleophilicAdditionCondensation_Mglyoxal"
        ),
    ],
    [
        71.04, 72.06,
        "Propionamide (K  X@N-term)",
        "NucleophilicAddition_MGlyoxal",
    ],
    [
        156.12, 156.22,
        "4-hydroxynonenal (HNE)",
        "MichaelAdditionCyclization_HNE",
    ],
    [
        162.05, 162.14,
        "Hexose (NSY)",
        "MailardReaction_Glucose",
    ],
]

# One-letter amino-acid code -> residue name.  Insertion order matters: the
# report cells iterate abbreviation.keys() to lay out their per-residue
# columns, so the pairs stay in the original order.
abbreviation = dict([
    ("M", "Methionine"),
    ("C", "Cysteine"),
    ("A", "Alanine"),
    ("T", "Threonine"),
    ("V", "Valine"),
    ("F", "Phenylalanine"),
    ("W", "Tryptophan"),
    ("Y", "Tyrosine"),
    ("I", "Isoleucine"),
    ("P", "Proline"),
    ("L", "Leucine"),
    ("D", "Aspartate"),
    ("R", "Arginine"),
    ("H", "Histidine"),
    ("K", "Lysine"),
    ("G", "Glycine"),
    ("S", "Serine"),
    ("Q", "Glutamine"),
    ("E", "Glutamate"),
    ("N", "Asparagine"),
])

# Parse PTMDataRough.txt (tab-separated) into a nested lookup:
#   protein_data[col 0][col 1][col 3][residue name] ->
#       list of [upstream sequence, downstream sequence, modified fraction]
# Column 4 starts with the one-letter code of the modified residue; column 5
# is the peptide sequence, which is split on the lower-cased residue letter to
# get the flanking sequences.
# NOTE(review): columns 0, 1 and 3 are later compared with columns 0, 1 and
# the ';'-separated entries of column 4 of PTMData.txt; what they represent is
# not visible in this notebook - confirm against the data files.
protein_data = {}
data = ""
with open('PTMDataRough.txt', 'r') as file:
    data = file.read()
lines = data.split("\n")
# Only cells made up entirely of digits are counted; anything else counts as 0.
count_pattern = re.compile(r'^\d+$')
for line in lines:
    array = line.split("\t")
    # Skip blank lines (e.g. the empty string left by a trailing newline) and
    # rows too short to carry the residue and sequence columns.
    if len(array) < 6:
        continue
    residue = array[4][0:1]
    seqarray = array[5].split(residue.lower())
    # From column 6 onward the cells alternate: modified count, unmodified count.
    modified_total = sum(int(cell) for cell in array[6::2] if count_pattern.search(cell))
    unmodified_total = sum(int(cell) for cell in array[7::2] if count_pattern.search(cell))
    observed_total = modified_total + unmodified_total
    # A row with no counts has no defined modified fraction; skip it instead of
    # dividing by zero.
    if observed_total == 0:
        continue
    fraction = modified_total / observed_total
    site = (
        protein_data
        .setdefault(array[0], {})
        .setdefault(array[1], {})
        .setdefault(array[3], {})
    )
    site.setdefault(abbreviation[residue], []).append([seqarray[0], seqarray[1], fraction])

# Parse PTMData.txt (tab-separated) and tally observed modifications.
#   modifications[mass][op][residue] -> number of matching rows
#   modfract[mass][op][residue][fraction] -> number of rows whose flanking
#       sequences agree with a protein_data entry carrying that fraction
# Column 3 holds a peptide in which a residue is followed by "(+x.xx)" or
# "(-x.xx)"; column 4 holds ';'-separated operator names.
data = ""
with open('PTMData.txt', 'r') as file:
    data = file.read()
lines = data.split("\n")
modifications = {}
modfract = {}
# Groups: 1 = upstream sequence, 2 = modified residue, 3 = sign,
# 4 = mass magnitude, 5 = downstream sequence.
site_pattern = re.compile(r'(.+)(.)\(([\+-])(\d*\.*\d+)\)(.+)')
for line in lines:
    array = line.split("\t")
    # Blank or truncated lines have no peptide column to inspect.
    if len(array) < 4:
        continue
    m = site_pattern.search(array[3])
    if m is None:
        continue
    upstream = m[1]
    downstream = m[5]
    residue = m[2]
    # Map a known one-letter code to its residue name; unknown letters are kept as-is.
    if residue.upper() in abbreviation:
        residue = abbreviation[residue.upper()]
    mass = float(m[4])
    if m[3] == "-":
        mass = -1*mass
    if mass not in modifications:
        modifications[mass] = {}
        modfract[mass] = {}
    # The mass is registered even when the row has no operator names.
    if len(array) < 5 or len(array[4]) == 0:
        continue
    for op in array[4].split(";"):
        if op not in modifications[mass]:
            modifications[mass][op] = {}
            modfract[mass][op] = {}
        if residue not in modifications[mass][op]:
            modifications[mass][op][residue] = 0
            modfract[mass][op][residue] = {}
        modifications[mass][op][residue] += 1
        candidates = (
            protein_data
            .get(array[0], {})
            .get(array[1], {})
            .get(op, {})
            .get(residue, [])
        )
        for item in candidates:
            # Flanks match when the shorter one is a suffix (upstream) or
            # prefix (downstream) of the longer one.  endswith/startswith also
            # treat an empty flank as matching; the earlier negative-slice
            # comparison turned into s[-0:] (the whole string) for an empty
            # upstream flank and therefore never matched.
            front_ok = upstream.endswith(item[0]) or item[0].endswith(upstream)
            back_ok = downstream.startswith(item[1]) or item[1].startswith(downstream)
            if front_ok and back_ok:
                tally = modfract[mass][op][residue]
                tally[item[2]] = tally.get(item[2], 0) + 1
                # Only the first agreeing entry is counted for this row.
                break

In [11]:
# Load AA_residues.tsv into residue_data, keyed by "<column 0>_c0".
# The first line is treated as a header row and is not stored as data.
data = ""
with open('AA_residues.tsv', 'r') as file:
    data = file.read()
lines = data.split("\n")
residue_data = {}
headers = None
for line in lines:
    if headers is None:
        headers = line.split("\t")
        continue
    # A trailing newline leaves an empty final line; skip it rather than
    # failing on the missing columns.
    if not line.strip():
        continue
    array = line.split("\t")
    residue_data[array[0]+"_c0"] = {
        "name":array[1],
        "structure":array[2],
        "formula":array[3],
        "mass":array[4],
        "exact_mass":array[5]
    }

# Scan the KBase model for reactions tagged with spontaneous PickAxe operators
# and record, per rounded mass shift, which operators and residues produce it.
#   predicted_mods[mass][op][residue] -> {"count", "others", "reactions"}
#   mods_hash[mass][residue]          -> list of reaction ids
#   product_hash / reaction_hash      -> ids seen (used only for totals)
mods_hash = {}
product_hash = {}
reaction_hash = {}
predicted_mods = {}
kbase_api = cobrakbase.KBaseAPI()
# The same workspace object is fetched twice: `model` is used through its
# metabolites/reactions attributes, `kbmodel` is indexed like a dict.
model = kbase_api.get_from_ws("AAResidueDamageProducts",95771)
kbmodel = kbase_api.get_object("AAResidueDamageProducts",95771)
# Reaction id -> operator names with the "spontaneous." prefix removed.
rxnops = {}
for rxn in kbmodel["modelreactions"]:
    rxnops[rxn["id"]] = []
    if "dblinks" in rxn and "PickAxe" in rxn["dblinks"]:
        for op in rxn["dblinks"]["PickAxe"]:
            if op[0:5] == "spont":
                op = op.replace("spontaneous.","")
                rxnops[rxn["id"]].append(op)
# Overwrite residue formulas with the ones from AA_residues.tsv so the mass
# differences below are computed from those formulas.
for metabolite in model.metabolites:
    if metabolite.id in residue_data:
        metabolite.formula = residue_data[metabolite.id]["formula"]
for reaction in model.reactions:
    if len(rxnops[reaction.id]) == 0:
        continue
    # Split the consumed metabolites into the residue and everything else.
    reactant = None
    other_reactant = []
    for metabolite in reaction.metabolites:
        if reaction.metabolites[metabolite] < 0:
            if metabolite.id in residue_data:
                reactant = metabolite
            else:
                other_reactant.append(metabolite)
    if reactant is None:
        continue
    # Pick the product whose formula weight is nearest to the residue's.
    closest_metabolite = None
    closest_mass = None
    for metabolite in reaction.metabolites:
        if reaction.metabolites[metabolite] > 0 and metabolite != reactant:
            delta = abs(metabolite.formula_weight - reactant.formula_weight)
            if closest_mass is None or delta < closest_mass:
                closest_mass = delta
                closest_metabolite = metabolite
    # A reaction with no product has nothing to compare against; skip it
    # instead of dereferencing None.
    if closest_metabolite is None:
        continue
    # Signed mass shift, rounded to two decimals, is the grouping key.
    closest_mass = closest_metabolite.formula_weight - reactant.formula_weight
    closest_mass = round(closest_mass, 2)
    product_hash[closest_metabolite.id] = 1
    reaction_hash[reaction.id] = 1
    if closest_mass not in predicted_mods:
        predicted_mods[closest_mass] = {}
        mods_hash[closest_mass] = {}
    # Drop the last 10 characters of the metabolite id to get the residue key.
    # NOTE(review): presumably strips a fixed id suffix so the key matches the
    # names in `abbreviation` - confirm against the model's metabolite ids.
    residue = reactant.id[0:-10]
    mods_hash[closest_mass].setdefault(residue, []).append(reaction.id)
    for op in rxnops[reaction.id]:
        # "others" keeps the co-reactants of the first reaction seen for this
        # (mass, operator, residue) combination.
        entry = predicted_mods[closest_mass].setdefault(op, {}).setdefault(
            residue, {"count":0,"others":other_reactant,"reactions":[]}
        )
        entry["count"] += 1
        if reaction.id not in entry["reactions"]:
            entry["reactions"].append(reaction.id)

# Print a tab-separated comparison of observed and predicted modifications.
# For every observed mass the nearest predicted mass (within 2 mass units) is
# looked up; observed operators and predicted operators are then listed side
# by side, row i holding the i-th of each.  The per-residue columns show
# "average fraction/highest fraction/number of matched rows".
match_rxn = {}
header = "ExpMassdiff\tPredMassDiff\tDiff\tExpOp\tPredOp\tOthers\tReactions"
abbrev_keys = abbreviation.keys()
for abbrev in abbrev_keys:
    header += "\t"+abbrev
print(header)
# Initialised empty and never filled in this cell.
cpd_hash = {}
for mass in modifications:
    closest = None
    closest_mass = None
    for pmass in predicted_mods:
        if closest is None or abs(pmass-mass) < abs(closest):
            closest = pmass-mass
            closest_mass = pmass
    # `closest` stays None when there are no predictions at all; otherwise a
    # nearest prediction more than 2 mass units away is treated as no match.
    if closest is not None and abs(closest) > 2:
        closest = None
    ops = list(modifications[mass])
    pops = []
    full_len = len(ops)
    if closest is not None:
        pops = list(predicted_mods[closest_mass])
        if len(pops) > full_len:
            full_len = len(pops)
    for i in range(0,full_len):
        line = ""
        if closest is not None:
            line += str(mass)+"\t"+str(closest_mass)+"\t"+str(closest)+"\t"
        else:
            line += str(mass)+"\t\t\t"
        # `residues` ("observed count/predicted count" per residue) is only
        # consumed by the commented-out alternative output further down.
        residues = []
        fractions = []
        if i < len(ops):
            observed = modifications[mass][ops[i]]
            for abbrev in abbrev_keys:
                name = abbreviation[abbrev]
                if name in observed:
                    residues.append(str(observed[name])+"/")
                    # Remember every predicted reaction on this residue at the
                    # matched mass; the ids are listed after the table.
                    if closest is not None:
                        if name in mods_hash[closest_mass]:
                            for rxn in mods_hash[closest_mass][name]:
                                match_rxn[rxn] = 1
                    count = 0
                    avefract = 0
                    highfract = 0
                    if name in modfract[mass][ops[i]] and observed[name] > 0:
                        for fraction in modfract[mass][ops[i]][name]:
                            # Each fraction is weighted by how many rows carried it.
                            for j in range(0,modfract[mass][ops[i]][name][fraction]):
                                count += 1
                                avefract += fraction
                                if fraction > highfract:
                                    highfract = fraction
                    if count > 0:
                        avefract = avefract/count
                    fractions.append(str(avefract)+"/"+str(highfract)+"/"+str(count))
                else:
                    residues.append("0/")
                    fractions.append("0/0/0")
            line += ops[i]
        else:
            for abbrev in abbrev_keys:
                residues.append("0/")
                fractions.append("0/0/0")
        line += "\t"
        if i < len(pops):
            count = 0
            others = []
            reactions = []
            predicted = predicted_mods[closest_mass][pops[i]]
            for abbrev in abbrev_keys:
                name = abbreviation[abbrev]
                if name in predicted:
                    residues[count] += str(predicted[name]["count"])
                    for rxn in predicted[name]["reactions"]:
                        if rxn not in reactions:
                            reactions.append(rxn)
                    for other in predicted[name]["others"]:
                        # `others` holds names, so de-duplicate on the name;
                        # testing the metabolite object itself never matched
                        # and let the same name be appended repeatedly.
                        if other.name not in others:
                            others.append(other.name)
                else:
                    residues[count] += "0"
                count += 1
            line += pops[i]+"\t"+";".join(others)+"\t"+";".join(reactions)
        else:
            count = 0
            for abbrev in abbrev_keys:
                residues[count] += "0"
                count += 1
            line += "\t\t"
        #line += "\t"+"\t".join(residues)
        line += "\t"+"\t".join(fractions)
        print(line)

print("Number of mass differences:"+str(len(predicted_mods)))
print("Number of products:"+str(len(product_hash)))
print("Number of reactions:"+str(len(reaction_hash)))
for rxn in match_rxn:
    print(rxn)

ExpMassdiff	PredMassDiff	Diff	ExpOp	PredOp	Others	Reactions	M	C	A	T	V	F	W	Y	I	P	L	D	R	H	K	G	S	Q	E	N
0.98	0.98	0.0	Deamidation (NQ)	ImineDeamination	H2O;H+	spontr86_c0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0.04462255392357123/1.0/668	0/0/0	0.20451996482140034/1.0/1135
0.98	0.98	0.0	Pyro-glu from Q				0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0
0.98	0.98	0.0	Oxidation (M)				0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0
0.98	0.98	0.0	Carbamidomethylation (DHKE  X@N-term)				0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0
0.98	0.98	0.0	Dehydration				0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0	0/0/0
0.98	0.98	0.0	Carbamidomethylation				0/

In [12]:
# Summarise every curated rule in mapped_rules: one output row per rule, one
# column per residue, each cell formatted as
#   observed count/average fraction/highest fraction/number of matched rows
abbrev_keys = abbreviation.keys()
header = "\t".join(
    ["ExpMassdiff\tPredMassDiff\tExpOp\tPredOp"]
    + [abbreviation[abbrev] for abbrev in abbrev_keys]
)
print(header)
cpd_hash = {}
for rules in mapped_rules:
    exp_mass = rules[0]
    pred_mass = rules[1]
    op_array = rules[2].split(";")
    pop_array = rules[3].split(";")
    cells = [str(exp_mass), str(pred_mass), rules[2], rules[3]]
    for abbrev in abbrev_keys:
        name = abbreviation[abbrev]
        total_op_count = 0
        total_pop_count = 0
        count = 0
        avefract = 0
        highfract = 0
        # Observed side: add up the rows and fraction tallies of every listed operator.
        for op in op_array:
            if op not in modifications[exp_mass]:
                continue
            observed = modifications[exp_mass][op]
            if name not in observed:
                continue
            if observed[name] <= 0:
                continue
            total_op_count += observed[name]
            if name not in modfract[exp_mass][op]:
                print("MISSING:"+str(exp_mass)+"\t"+op+"\t"+name)
                continue
            tally = modfract[exp_mass][op][name]
            for fraction in tally:
                # Each fraction is weighted by how many rows carried it.
                for j in range(0, tally[fraction]):
                    count += 1
                    avefract += fraction
                    if fraction > highfract:
                        highfract = fraction
        if count > 0:
            avefract = avefract/count
        # Predicted side: the total feeds only the commented-out alternative cell format.
        for pop in pop_array:
            predicted = predicted_mods[pred_mass][pop]
            if name in predicted:
                total_pop_count += predicted[name]["count"]
        #cells.append(str(total_op_count)+"/"+str(total_pop_count))
        cells.append(str(total_op_count)+"/"+str(avefract)+"/"+str(highfract)+"/"+str(count))
    line = "\t".join(cells)
    print(line)

ExpMassdiff	PredMassDiff	ExpOp	PredOp	Methionine	Cysteine	Alanine	Threonine	Valine	Phenylalanine	Tryptophan	Tyrosine	Isoleucine	Proline	Leucine	Aspartate	Arginine	Histidine	Lysine	Glycine	Serine	Glutamine	Glutamate	Asparagine
-18.01	-18.02	Carbamidomethylation	GammaLactamFormation;EpsilonLactamFormation	0/0/0/0	0/0/0/0	0/0/0/0	6/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	2/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	17/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	7/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0
-17.03	-17.03	Carbamidomethylation	GammaLactamFormation;EpsilonLactamFormation	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	49/0/0/0
0.98	0.98	Deamidation (R)	ImineDeamination	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0	0/0/0/0
13.03	14.03	Michael addition with methylamine	AmineMethylation	0/0/0/0	0/0/0/0	0/0/0/0	7/0.

In [27]:
# Collect every record in Reactions.json that involves a compound present in
# cpd_hash (looked up as "<compound id>_c0"), then print id, definition and
# '|'-joined roles for the records that carry a "roles" entry.
# NOTE(review): the path below is an absolute, user-specific location; the
# notebook will not run elsewhere without editing it.
# NOTE(review): the visible cells only ever set cpd_hash to an empty dict, so
# this selection is empty unless another cell fills it first - confirm.
reaction_hash = {}
with open('/Users/chenry/code/fba_tools/data/Reactions.json') as json_file:
    input_data = json.load(json_file)
for rxn in input_data:
    if "compound_ids" not in rxn:
        continue
    for cpd in rxn["compound_ids"]:
        if cpd+"_c0" in cpd_hash:
            reaction_hash[rxn["id"]] = rxn
for rxnid, record in reaction_hash.items():
    roles = ""
    # NOTE(review): records without "roles" are not printed at all; the empty
    # default above suggests they may have been meant to print - confirm.
    if "roles" in record:
        roles = "|".join(record["roles"])
        print(rxnid+"\t"+record["definition"]+"\t"+roles)