In [1]:
import pandas as pd
import json 
import pickle

# Input data

JESTR requires two input files. Example versions of both are provided in `data/sample`.

1. `data.tsv`: spectra and their metadata (only spectra whose `fold` column is set to `test` will be evaluated)
2. `identifier_to_candidates.json`: a mapping from each identifier to a list of candidate SMILES strings

In [5]:
# Read the tab-separated sample file that holds the spectra and their metadata.
data = pd.read_csv("data/sample/data.tsv", sep="\t")

# Preview the first rows whose 'fold' value is 'test'.
is_test_row = data['fold'].eq('test')
data.loc[is_test_row].head()

Unnamed: 0,identifier,mzs,intensities,fold,precursor_mz
90,MassSpecGymID0214213,"42.033794,44.979271,86.05986,91.054344,93.0697...","0.01295,0.02273,0.10204,0.00936,0.00475,0.0077...",test,369.16312
91,MassSpecGymID0028002,"65.0385,68.997,77.0385,78.0461,79.0542,89.0386...","0.00999864,0.01208502,0.03910268,0.01290928,0....",test,404.1241
92,MassSpecGymID0336974,"247.044998,385.151001,464.252991,465.253998,48...","0.005,0.103,0.034,0.048,0.874,1.0,0.008,0.162",test,767.49
93,MassSpecGymID0030233,"141.9831,142.9913,170.0148,176.044,177.0515,18...","0.00999201,0.00636254,0.011405559999999999,0.0...",test,374.0934
94,MassSpecGymID0221881,"68.049377,79.054039,80.04937,84.044296,91.0540...","0.03041,0.14939,0.08069000000000001,0.06174,0....",test,771.33616


In [6]:
# Load the candidate mapping: each identifier maps to a list of candidate SMILES.
# The encoding is passed explicitly because JSON is conventionally UTF-8, whereas
# open() would otherwise fall back to the platform's locale encoding.
with open("data/sample/identifier_to_candidates.json", "r", encoding="utf-8") as f:
    identifier_to_candidates = json.load(f)

# Show the first ten candidates for one example identifier.
identifier_to_candidates['MassSpecGymID0214213'][0:10]

['CC1=CC=CC=C1SCC(=O)N/N=C\\2/CCCC3=C2C(=CC(=C3)OC)C',
 'C[C@H]1CCN1c1nc(-c2noc(C3CCNCC3)n2)cc(C(F)(F)F)n1',
 'COC(=O)CC(C)(NC(=O)Cc1c[nH]c2ccccc12)c1ccc(F)cc1',
 'CN(C)CCNc1nc(NC(=O)c2cccc(N)c2)cc(C(F)(F)F)n1',
 'Cc1c(/N=C(\\N)Cc2cc(C(F)F)ccn2)nc(N)nc1-c1ccccc1',
 'COC(=O)c1occ(C(=O)OCC2CCCC(N)CNC2)c(=O)c1OC',
 'Cc1ccccc1CCNC(=O)CN(Cc1ccco1)Cc1cccs1',
 'CCC(=O)N(Cc1ccc(F)cc1)Cc1cc2cc(OC)ccc2[nH]c1=O',
 'CC(C)NC(CNC(=O)c1cc(CS)cc(CS)c1)C(=O)C(C)C',
 'Cc1oc2ccc(OCc3ccccc3)cc2c1C(=O)N[C@@H]1CNC[C@H]1F']

# JESTR output

The output file is a pickled pandas DataFrame that holds, for each identifier, the sorted candidates and their scores.

In [7]:
# Load the pickled JESTR result object from the experiment directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only open result files that come from a trusted source.
result_path = "experiments/20251001_JESTR_sample_run/result_identifier_to_candidates.pkl"
with open(result_path, "rb") as result_file:
    result = pickle.load(result_file)

# Preview the first rows of the loaded result.
result.head()

Unnamed: 0,identifier,sorted_candidates,sorted_scores
0,MassSpecGymID0214213,[CC1=CC=CC=C1SCC(=O)N/N=C\2/CCCC3=C2C(=CC(=C3)...,"[0.49698513746261597, 0.4939979314804077, 0.49..."
1,MassSpecGymID0028002,[O=c1[nH]c2cnc(-n3cnc4cnccc43)nc2n1C1CCOc2c(F)...,"[0.6375532150268555, 0.6322932243347168, 0.620..."
2,MassSpecGymID0336974,[CC(=CCC(O)C(C)C1CCC2(C)C3CCC4C(C)(C)C(OC5OC(C...,"[0.4895637035369873, 0.48725855350494385, 0.46..."
3,MassSpecGymID0030233,[CC(=O)OC1=C(c2cc(-c3ccc(F)cc3)ccc2Cl)C(=O)C(C...,"[0.5568597912788391, 0.5509239435195923, 0.525..."
4,MassSpecGymID0221881,[Cc1cncc(C(CCC2CC2)(N[S@](=O)C(C)(C)C)c2ccc(F)...,"[0.5085124969482422, 0.48943740129470825, 0.47..."
