In [1]:
import os
import pandas as pd
import json
from extractor.gnps import GnpsAnnotationsFile
from extractor.gnps import GnpsCacher
from extractor.gnps import GnpsParametersFile
from extractor.gnps import GnpsInchiScore
from extractor.mgfs import MgfFiles
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import rdDepictor
import matchms
from pathlib import Path

# Move the working directory one level up; the relative paths below are resolved
# against it. The starting directory is remembered on the first run so that
# re-running this cell does not climb one more level each time.
_NOTEBOOK_START_DIR = globals().get("_NOTEBOOK_START_DIR", Path.cwd())
os.chdir(_NOTEBOOK_START_DIR.parent)

# Load the compound table, keyed by chemical name.
compounds_file = "../Manufactured case/Compounds.tsv"
compounds = pd.read_csv(compounds_file, sep="\t").set_index("Chemical name")
names = set(compounds.index.to_list())
assert len(names) == 96, f"expected 96 distinct chemical names, found {len(names)}"

# Every compound must have a matching entry in the MGF collection, and vice versa.
p = Path("../Manufactured case/Mgf files/")
mgfs = MgfFiles(p)
mgf_names = set(mgfs.d.keys())
assert mgf_names == names, {
    "in compounds but not in MGF files": names - mgf_names,
    "in MGF files but not in compounds": mgf_names - names,
}


def _average_mol_weight(inchi):
    """Return RDKit's ``Descriptors.MolWt`` for the molecule described by ``inchi``.

    Raises:
        ValueError: if RDKit cannot build a molecule from ``inchi``.
    """
    mol = Chem.inchi.MolFromInchi(inchi)
    if mol is None:
        # Fail with the offending InChI instead of an opaque error inside MolWt.
        raise ValueError(f"RDKit could not parse InChI: {inchi!r}")
    return Descriptors.MolWt(mol)


# Only rows that actually carry an InChI get a molecular weight; the others stay NaN.
inchis = compounds.loc[compounds["InChI"].notna(), "InChI"]
# NOTE(review): MolWt is the *average* molecular weight. If the precursor m/z is a
# monoisotopic measurement, the difference below depends on elemental composition;
# Descriptors.ExactMolWt may be the intended reference — confirm before relying on it.
compounds["Relative molecular weight"] = inchis.apply(_average_mol_weight)
# presumably mgfs.precursors is indexed by chemical name so that it aligns with
# `compounds` — verify against MgfFiles.
compounds["Precursor m/z"] = mgfs.precursors
compounds["Precursor m/z − relative molecular weight"] = (
    compounds["Precursor m/z"] - compounds["Relative molecular weight"]
)

In [2]:
# Lift pandas' column and row truncation. These are global options, so they also
# apply to every frame/series displayed by later cells.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# To look only at outliers, filter instead of showing everything, e.g.
# compounds[compounds["Precursor m/z − relative molecular weight"] > 0.3]
compounds

Unnamed: 0_level_0,Id,Reported,Chemical class,InChI,Relative molecular weight,Precursor m/z,Precursor m/z − relative molecular weight
Chemical name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13-hydroxylupanine,1,1,Bisquinolizidine alkaloid,InChI=1S/C15H24N2O2/c18-12-4-5-16-8-10-6-11(14...,264.369,265.190861,0.821861
3-beta-amino-pregnane,2,1,Steroidal alkaloid,InChI=1S/C21H37N/c1-4-14-6-8-18-17-7-5-15-13-1...,303.534,304.299793,0.765793
3-beta-dimethylamino-20-beta-hydroxy-5-pregnene,3,1,Steroidal alkaloid,InChI=1S/C23H39NO/c1-15(25)19-8-9-20-18-7-6-16...,345.571,346.310303,0.739303
3-beta-methylamino-20-oxo-5-pregnene,4,1,Steroidal alkaloid,InChI=1S/C22H35NO/c1-14(24)18-7-8-19-17-6-5-15...,329.528,330.279036,0.751036
Alchorneine,5,1,Imidazopyrimidine alkaloid,"InChI=1S/C12H19N3O/c1-9(2)10-8-14-7-6-12(3,4)1...",221.304,222.15976,0.85576
Anantine,6,1,Imidazole alkaloid,InChI=1S/C15H15N3O/c1-18-9-14(17-10-18)13-8-16...,253.305,254.128831,0.823831
Ancistroealaine A,7,1,Naphthalene-Isoquinoline alkaloid,InChI=1S/C26H29NO4/c1-14-10-18-17(8-9-20(28-4)...,419.521,420.217224,0.696224
Annomontine,8,1,Pyrimidine/Beta-carboline alkaloid,InChI=1S/C15H11N5/c16-15-18-8-6-12(20-15)14-13...,261.288,262.10886,0.82086
Asaronaldehyde,9,1,Monoaromatic,InChI=1S/C10H12O4/c1-12-8-5-10(14-3)9(13-2)4-7...,196.202,197.080777,0.878777
Baleabuxidine,10,1,Cycloartane-type triterpene,InChI=1S/C30H50N2O4/c1-17(2)25(36)31-22-11-12-...,502.74,503.384207,0.644207


In [3]:
# Read the GNPS task ids recorded for this case.
task_ids_file = "../Manufactured case/Gnps task ids.json"
# JSON text is UTF-8; be explicit rather than relying on the platform default encoding.
with open(task_ids_file, encoding="utf-8") as task_ids_data:
    task_ids = json.load(task_ids_data)
if not task_ids:
    # Fail with a clear message instead of a bare IndexError on task_ids[0].
    raise ValueError(f"No GNPS task ids found in {task_ids_file!r}")

# Only the first task id is used in this cell.
task_id = task_ids[0]
# presumably these return the (cached) annotations and parameters of the GNPS
# task — see extractor.gnps.GnpsCacher to confirm.
all_annotations = GnpsCacher.cache_retrieve(task_id)
parameters = GnpsCacher.cache_retrieve_parameters(task_id)
isc = GnpsInchiScore(all_annotations, parameters)

# Column names embed the task's min_peaks / max_delta_mass so that results from
# different parameter sets do not collide.
gnps_suffix = f"gnps_{isc.min_peaks}_{isc.max_delta_mass}"
new_cols = {
    f"inchi_{gnps_suffix}": isc.inchis,
    f"score_{gnps_suffix}": isc.scores,
}


In [4]:
# Display the InChI series exposed by the GnpsInchiScore object built in the previous cell.
# NOTE(review): in the saved output every entry is blank or "N/A", and Ids 38, 69
# and 91 are absent — confirm that this is expected for this task.
isc.inchis

Id
1        
2     N/A
3        
4        
5        
6        
7        
8        
9        
10       
11       
12       
13       
14       
15       
16       
17       
18       
19       
20       
21       
22       
23       
24    N/A
25       
26       
27       
28    N/A
29       
30       
31       
32       
33       
34       
35       
36       
37       
39       
40       
41       
42       
43       
44       
45       
46       
47       
48       
49       
50       
51       
52       
53       
54       
55       
56       
57       
58       
59       
60       
61       
62       
63       
64       
65       
66       
67       
68       
70       
71    N/A
72       
73       
74       
75       
76       
77       
78       
79       
80       
81       
82       
83       
84       
85       
86       
87       
88       
89       
90       
92       
93       
94       
95       
96       
Name: INCHI, dtype: object