In [1]:
import json, numpy as np, tqdm, requests
from collections import defaultdict

In [2]:
# Load all GNPS records and drop the ones whose "peaks_json" field is the
# literal string "[]" (i.e. no peaks).
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# '../ALL_GNPS.json' — confirm the path before re-running.
with open("../ALL_GNPS.json") as f:
    all_gnps = json.load(f)

# Single-pass filter: building a new list avoids the repeated `del list[i]`
# calls, each of which shifts every later element.
gnps_with_peaks = [rec for rec in tqdm.tqdm(all_gnps) if rec["peaks_json"] != "[]"]

print("Data Len: ", len(all_gnps))
all_gnps = gnps_with_peaks
print("Data Len after: ", len(all_gnps))

FileNotFoundError: [Errno 2] No such file or directory: '../ALL_GNPS.json'

In [7]:
# Read the cached SMILES -> classification mapping from disk.
with open("./data.json") as fh:
    obj = json.load(fh)


### Recovery of Errors

In [12]:
# Re-query the classifier for every SMILES whose stored value is the "Error"
# placeholder, then save the patched mapping to ./data_fixerr.json.
URL = "https://npclassifier.ucsd.edu/classify"
REQUEST_TIMEOUT_S = 60  # without a timeout a stalled request blocks forever
keys = ["class_results", "superclass_results", "pathway_results", "isglycoside"]

bad = [k for k,v in obj.items() if v == "Error"]
print(bad)

for s in tqdm.tqdm(bad):
    try:
        r = requests.get(url=URL, params={"smiles": s}, timeout=REQUEST_TIMEOUT_S)
        r.raise_for_status()  # surface HTTP errors instead of parsing an error page
        data = r.json()
        obj[s] = {key: data[key] for key in keys}
    except (requests.RequestException, ValueError, KeyError) as err:
        # Keep the "Error" placeholder so the entry shows up in the final
        # `bad` report and is retried on the next run; one failure must not
        # abort the loop and lose the results already recovered.
        tqdm.tqdm.write(f"Retry failed for {s!r}: {err!r}")

# Anything still listed here could not be recovered.
bad = [k for k,v in obj.items() if v == "Error"]

with open("./data_fixerr.json", "w") as f:
    json.dump(obj, f)
print(bad)

[]


0it [00:00, ?it/s]


[]


### Histogram

In [45]:
# Reload the repaired mapping: smiles -> classification results
# (pathway / superclass / class lists).
with open("./data_fixerr.json") as fh:
    obj = json.load(fh)

In [46]:
def _tally(result_key):
    """Count how often each label appears under `result_key` across `obj`.

    An entry whose label list is empty contributes one count to "EMPTY".
    """
    counts = defaultdict(int)
    for results in obj.values():
        labels = results[result_key]
        for label in (labels if len(labels) != 0 else ["EMPTY"]):
            counts[label] += 1
    return counts


pathways = _tally("pathway_results")
superclasses = _tally("superclass_results")
classes = _tally("class_results")

# Rank each histogram from most to least frequent (stable for ties).
p, s, c = (
    sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    for counts in (pathways, superclasses, classes)
)

for ranked in (p, s, c):
    print(len(ranked), ranked)
    print()

8 [('Alkaloids', 6249), ('Shikimates and Phenylpropanoids', 4864), ('Terpenoids', 3771), ('Fatty acids', 2519), ('Amino acids and Peptides', 2421), ('Polyketides', 1924), ('EMPTY', 1240), ('Carbohydrates', 845)]

73 [('EMPTY', 4179), ('Flavonoids', 1704), ('Tryptophan alkaloids', 1248), ('Small peptides', 1196), ('Glycerophospholipids', 1191), ('Steroids', 1024), ('Oligopeptides', 819), ('Coumarins', 798), ('Pseudoalkaloids', 697), ('Triterpenoids', 618), ('Tyrosine alkaloids', 610), ('Diterpenoids', 588), ('Sesquiterpenoids', 541), ('Anthranilic acid alkaloids', 529), ('Nicotinic acid alkaloids', 480), ('Fatty Acids and Conjugates', 457), ('Phenylpropanoids (C6-C3)', 416), ('Phenolic acids (C6-C1)', 387), ('Nucleosides', 367), ('Lysine alkaloids', 362), ('Monoterpenoids', 331), ('Saccharides', 318), ('Aromatic polyketides', 290), ('Lignans', 266), ('Ornithine alkaloids', 247), ('Isoflavonoids', 245), ('Polycyclic aromatic polyketides', 236), ('Peptide alkaloids', 216), ('Macrolides', 

### Creating splits

**./pathways_metadata/smiles_to_cmm.json**: Maps each valid SMILES string to the list of cmm's (spectrum IDs) recorded in positive ion mode.

**./pathways_metadata/path_to_cmm.json**: Maps each pathway to the positive ion-mode cmm's whose SMILES are classified into that pathway.

**./pathways_metadata/sc_to_cmm.json**: Maps each superclass to the positive ion-mode cmm's whose SMILES are classified into that superclass.

In [52]:
# Goal: pathway -> cmm_id and superclass -> cmm_id.
# `obj` already gives smiles -> pathway/superclass, so build smiles -> cmm_id
# first and then join the two mappings on the SMILES string.


def _group_smiles_by_label(classified, result_key):
    """Invert smiles -> results into label -> {smiles} for one result list.

    SMILES whose `result_key` list is empty are filed under "EMPTY".
    """
    label_to_smiles = defaultdict(set)
    for smiles, results in classified.items():
        if len(results[result_key]) == 0:
            label_to_smiles["EMPTY"].add(smiles)
        for label in results[result_key]:
            label_to_smiles[label].add(smiles)
    return label_to_smiles


def _expand_to_cmm(label_to_smiles, smiles_lookup):
    """Join label -> {smiles} with smiles -> [cmm] into label -> [cmm].

    Returns a pair: the label -> [cmm] mapping, and the set of SMILES that
    were found in `smiles_lookup`. SMILES missing from the lookup are skipped.
    """
    label_to_cmm = defaultdict(list)
    matched_smiles = set()
    for label, smiles_set in label_to_smiles.items():
        for smiles in smiles_set:
            if smiles in smiles_lookup:
                matched_smiles.add(smiles)
                for cmm in smiles_lookup[smiles]:
                    label_to_cmm[label].append(cmm)
    return label_to_cmm, matched_smiles


# smiles -> cmm
ion_modes = set()

with open("ALL_GNPS_trimmedkeys_filteredsmiles.json", "r") as f:
    all_gnps = json.load(f)

ct_all = len(all_gnps)
ct_positive = 0

# extract all positive Smiles->[CMM]
smiles_to_cmm = defaultdict(list)
for x in all_gnps:
    ion_modes.add(x["Ion_Mode"])
    # Ion_Mode is free text (e.g. ' Positive', 'positive'), so normalise first.
    if x["Ion_Mode"].strip().lower() == "positive":
        ct_positive += 1
        smiles_to_cmm[x["Smiles"]].append(x["spectrum_id"])

# The previous check, any(filter(lambda x: len(x) == 0, ...)), tested the
# truthiness of the empty lists themselves and so could never report True.
print("Any [] in smiles_to_cmm: ", any(len(cmms) == 0 for cmms in smiles_to_cmm.values()))
print(f"Count positive smiles: {len(smiles_to_cmm)}")
print(f"Count positive cmms: {sum(len(cmms) for cmms in smiles_to_cmm.values())}")
print(f"Percentage positive: {ct_positive / ct_all}")
print(f"Ion Modes: {list(ion_modes)}")

# pathway/sc -> [smiles]
pathway_to_smiles = _group_smiles_by_label(obj, "pathway_results")
sc_to_smiles = _group_smiles_by_label(obj, "superclass_results")

# pathway->smiles ===> pathway->smiles->cmms
pathway_to_cmm, unique_smiles_in_pathway_output = _expand_to_cmm(pathway_to_smiles, smiles_to_cmm)

# sc->smiles ===> sc->smiles->cmms
sc_to_cmm, unique_smiles_in_sc_output = _expand_to_cmm(sc_to_smiles, smiles_to_cmm)

print([(k, len(v))for k,v in sc_to_cmm.items()])
with open("./pathways_metadata/smiles_to_cmm.json", "w") as f:
    json.dump(smiles_to_cmm, f)
with open("./pathways_metadata/path_to_cmm.json", "w") as f:
    json.dump(pathway_to_cmm, f)
with open("./pathways_metadata/sc_to_cmm.json", "w") as f:
    json.dump(sc_to_cmm, f)


Any [] in smiles_to_cmm:  False
Count positive smiles: 18627
Count positive cmms: 69673
Percentage positive: 0.7064937435356629
Ion Modes: [' Positive', 'negative', 'N/A', 'Negative', 'positive', 'Positive', ' Negative']
[('EMPTY', 16157), ('Coumarins', 2046), ('Lignans', 756), ('Nicotinic acid alkaloids', 1742), ('Tryptophan alkaloids', 3835), ('Tyrosine alkaloids', 2378), ('Nucleosides', 1702), ('Meroterpenoids', 471), ('Saccharides', 890), ('Lysine alkaloids', 1442), ('Fatty acyls', 297), ('Macrolides', 615), ('Flavonoids', 5691), ('Small peptides', 4766), ('Anthranilic acid alkaloids', 2241), ('Fatty Acids and Conjugates', 826), ('Glycerophospholipids', 242), ('Xanthones', 172), ('Oligopeptides', 1017), ('Steroids', 3931), ('Fatty amides', 385), ('Triterpenoids', 1219), ('Chromanes', 414), ('Monoterpenoids', 776), ('Carotenoids (C40)', 225), ('Amino acid glycosides', 165), ('Linear polyketides', 446), ('Phenolic acids (C6-C1)', 923), ('Diterpenoids', 1133), ('Diarylheptanoids', 125

In [22]:
# Write a small random sample of cmm ids per pathway (samples for Chen).
import random

SAMPLES_PER_PATHWAY = 8
# Dedicated seeded generator: the sample file is reproducible across runs
# and the global `random` state is left untouched.
sample_rng = random.Random(42)

with open("./pathways_metadata/path_to_cmm.json", "r") as f:
    path_to_cmm_load = json.load(f)

out = {}
for k, v in path_to_cmm_load.items():
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of cmm ids available for this pathway.
    out[k] = sample_rng.sample(v, min(SAMPLES_PER_PATHWAY, len(v)))

with open("./pathways_metadata/path_to_cmm_samples.json", "w") as f:
    json.dump(out, f)

In [14]:
# Sanity check: compare the classified SMILES with the SMILES that have
# positive-mode cmm's.
# obj_keys subset of all keys, smiles_keys subset of all keys
obj_keys = set(obj)
shared_smiles = obj_keys.intersection(smiles_to_cmm.keys())
print(
    len(obj_keys),
    len(smiles_to_cmm),
    len(unique_smiles_in_pathway_output),
    len(shared_smiles),
)

22669 18627 18504 18504


In [41]:
# NOTE: this rebinds `obj` — from here on it holds smiles -> [cmm] as written
# to smiles_to_cmm.json, not the classification results loaded earlier.
with open("./pathways_metadata/smiles_to_cmm.json") as fh:
    obj = json.load(fh)

In [42]:
from sklearn.model_selection import train_test_split

# Flatten smiles -> [cmm] into one flat list of cmm ids.
all_units = []
for cmms in tqdm.tqdm(obj.values()):
    all_units.extend(cmms)
print("num of units:", len(all_units))

# 80 / 10 / 10 split: carve off 20% first, then halve it into val and test.
# The labels are placeholders; only the X_* lists are used afterwards.
# NOTE(review): the split is over individual cmm ids, so cmm's that share a
# SMILES key can land in different splits — confirm this is intended.
placeholder_labels = np.zeros_like(all_units)
X_train, X_tmp, y_train, y_tmp = train_test_split(
    all_units, placeholder_labels, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42
)

100%|██████████| 18627/18627 [00:00<00:00, 1862923.85it/s]

num of units: 69673





In [43]:
# Report the container type followed by the train / val / test sizes.
print(type(X_train), *map(len, (X_train, X_val, X_test)))

<class 'list'> 55738 6967 6968


In [44]:
# Persist each split as a JSON list of cmm ids.
for split_path, split_ids in (
    ("./pathways_metadata/train.json", X_train),
    ("./pathways_metadata/val.json", X_val),
    ("./pathways_metadata/test.json", X_test),
):
    with open(split_path, "w") as fh:
        json.dump(split_ids, fh)