# Step1: Correct the SDF file format

In [1]:
from utils.converter import correct_format


# Run the project's format-correction helper on both NIST17 library exports.
# First path is the input SDF, second is the output written with a `c_`
# prefix (presumably the repaired copy -- see utils.converter.correct_format).
# The replicate library is processed before the main library.
for raw_sdf, fixed_sdf in (
    ('./data/nist17/replib.sdf', './data/nist17/c_replib.sdf'),
    ('./data/nist17/mainlib.sdf', './data/nist17/c_mainlib.sdf'),
):
    correct_format(raw_sdf, fixed_sdf)

Num of lines before modification: 6612632
Num of lines after modification: 6651878
Num of lines before modification: 53023736
Num of lines after modification: 53291107


In [None]:
from rdkit import Chem


# Open the `c_`-prefixed library files written by the format-correction cell.
suppl_main = Chem.SDMolSupplier('./data/nist17/c_mainlib.sdf')
suppl_rep = Chem.SDMolSupplier('./data/nist17/c_replib.sdf')

# Materialise the suppliers, discarding falsy entries (RDKit yields None for
# records it cannot parse).
mols_main = list(filter(None, suppl_main))
mols_rep = list(filter(None, suppl_rep))

# Expected counts: 265929 (main library), then 39204 (replicate library).
for parsed_mols in (mols_main, mols_rep):
    print(len(parsed_mols))

# Show the Python type of the 'NISTNO' property; later cells compare it
# against string IDs.
first_nistno = mols_main[0].GetProp('NISTNO')
print(type(first_nistno))

# Step2: Add SMILES to each molecule

In [11]:
from rdkit.Chem import AllChem


# Annotate every molecule with a 'SMILES' property and write the annotated
# libraries to new `cs_`-prefixed SDF files (main library first, then the
# replicate library).
#
# NOTE: SetProp mutates the Mol objects held in mols_main / mols_rep in place.
# The validation-set cell further down reads this 'SMILES' property from
# mols_main, so this cell must have been run in the same kernel session.
for mols, out_path in (
    (mols_main, './data/nist17/cs_mainlib.sdf'),
    (mols_rep, './data/nist17/cs_replib.sdf'),
):
    writer = AllChem.SDWriter(out_path)
    try:
        for mol in mols:
            smi = Chem.MolToSmiles(mol)
            mol.SetProp('SMILES', smi)
            writer.write(mol)
    finally:
        # Close even if a molecule fails, so the output file handle is
        # released and already-written records are flushed.
        writer.close()

# Step3: Get test set

In [17]:
import pandas as pd


# Load the author-supplied table for the NEIMS test split.
df = pd.read_csv('./data/author/NEIMS_test_11499molecules.csv')

# Main-library NIST numbers: report the row count, then collapse the column
# into a set of strings so they can be matched against the string-valued
# 'NISTNO' molecule property.
NistNo_main = df['NISTmain No.']
print(len(NistNo_main))
NistNo_main = set(map(str, NistNo_main))
print(NistNo_main)

# Same treatment for the replicate-library NIST numbers.
NistNo_rep = df['NISTreplib No.']
print(len(NistNo_rep))
NistNo_rep = set(map(str, NistNo_rep))
print(NistNo_rep)

11499
{'374616', '118688', '408739', '238979', '429469', '120982', '257956', '2039', '286830', '121602', '228147', '160748', '62580', '231823', '290453', '216652', '343214', '228386', '307452', '352435', '47006', '236105', '343150', '9254', '373523', '238877', '291671', '228932', '408457', '24810', '68075', '238397', '279788', '108026', '348', '38904', '262244', '233226', '228203', '31041', '135511', '2046', '34418', '134877', '437009', '333726', '237947', '108092', '373282', '289532', '125075', '374747', '261724', '253224', '385700', '282521', '69796', '413589', '408031', '74609', '23294', '60348', '293437', '129350', '325592', '118625', '118863', '135014', '239683', '237006', '120343', '235535', '238966', '236263', '373158', '104038', '35061', '443036', '26158', '232090', '113975', '229316', '118538', '301509', '290664', '210354', '133595', '31789', '352595', '149615', '57734', '245274', '4287', '235445', '135385', '73881', '70528', '246802', '117983', '194347', '335282', '373901', '

In [18]:
# Keep the main-library molecules whose 'NISTNO' property is one of the
# test-set IDs (NistNo_main is a set of strings built from the CSV).
test_mols_main = [
    candidate for candidate in mols_main
    if candidate.GetProp('NISTNO') in NistNo_main
]
print(len(test_mols_main))

# Same selection on the replicate library, using the replicate ID set.
test_mols_rep = [
    candidate for candidate in mols_rep
    if candidate.GetProp('NISTNO') in NistNo_rep
]
print(len(test_mols_rep))

11499
11499


In [19]:
from rdkit.Chem import AllChem


# Write the selected test molecules to their own SDF files: main-library
# matches first, then replicate-library matches.
for mols, out_path in (
    (test_mols_main, './data/nist17/test_11499.sdf'),
    (test_mols_rep, './data/nist17/test_11499_rep.sdf'),
):
    writer = AllChem.SDWriter(out_path)
    try:
        for mol in mols:
            writer.write(mol)
    finally:
        # Close even if a write fails, so the output file handle is released
        # and already-written records are flushed.
        writer.close()

In [20]:
from utils.converter import NIST17sdf2mgf


# Convert both test-set SDF files to MGF with the project converter
# (input path first, output path second). The replicate file is converted
# before the main file.
for sdf_path, mgf_path in (
    ('./data/nist17/test_11499_rep.sdf', './data/nist17/test_11499_rep.mgf'),
    ('./data/nist17/test_11499.sdf', './data/nist17/test_11499.mgf'),
):
    NIST17sdf2mgf(sdf_path, mgf_path)

In [20]:
from matchms import set_matchms_logger_level
set_matchms_logger_level("ERROR")
from matchms.importing import load_from_mgf
import torch as pt


# Load every spectrum from each test MGF file and serialise the resulting
# list with torch.save (replicate file first, then the main file).
#
# The loader's return value is consumed directly instead of being bound to a
# variable called `iter`: that name would shadow the builtin iter() for every
# cell executed afterwards in this kernel.
for mgf_path, pt_path in (
    ('./data/nist17/test_11499_rep.mgf', './data/mine/test_11499_rep.pt'),
    ('./data/nist17/test_11499.mgf', './data/mine/test_11499.pt'),
):
    spectra = list(load_from_mgf(mgf_path))
    pt.save(spectra, pt_path)

# Step4: Get validation set

In [27]:
from utils.filter import filter_nist


# Read the validation-set SMILES, one per line, stripping surrounding
# whitespace. The `with` block guarantees the file handle is closed once the
# lines have been read.
with open('./data/neims/validation_set_smiles.txt') as nist_val:
    smiles_val = [smi.strip() for smi in nist_val]
print(len(smiles_val)) # 11599

# Only the third value returned by filter_nist is used (kept as val_smi);
# the first two are discarded.
_, _, val_smi = filter_nist(smiles_val)

11599
unqualified:102


In [29]:
from rdkit.Chem import AllChem


# Collect every main-library molecule whose 'SMILES' property appears in the
# filtered validation SMILES and write those molecules to val_set.sdf.
#
# NOTE: the 'SMILES' property is set on the mols_main objects by the Step2
# cell; GetProp will fail here if that cell has not been run in this kernel.

# Build the lookup once so each membership test is a hash lookup rather than
# a scan of val_smi for every molecule.
# Assumes val_smi is an iterable of SMILES strings -- TODO confirm against
# utils.filter.filter_nist.
val_smi_lookup = frozenset(val_smi)

mols_val = []
writer = AllChem.SDWriter('./data/nist17/val_set.sdf')
try:
    for mol in mols_main:
        if mol.GetProp('SMILES') in val_smi_lookup:
            mols_val.append(mol)
            writer.write(mol)
finally:
    # Close even if a lookup or write fails, so the output file handle is
    # released and already-written records are flushed.
    writer.close()
print(len(mols_val)) # 12008

12008


In [None]:
from utils.converter import NIST17sdf2mgf
from matchms import set_matchms_logger_level
set_matchms_logger_level("ERROR")
from matchms.importing import load_from_mgf
import torch as pt


# Convert the validation SDF to MGF, load every spectrum from it, and
# serialise the list with torch.save.
NIST17sdf2mgf('./data/nist17/val_set.sdf',
              './data/nist17/val_set.mgf')

# The loader's return value is consumed directly instead of being bound to a
# variable called `iter`: that name would shadow the builtin iter() for every
# cell executed afterwards in this kernel.
spectra = list(load_from_mgf('./data/nist17/val_set.mgf'))

# The output file name hard-codes the molecule count (12008) printed by the
# cell that wrote val_set.sdf.
pt.save(spectra, './data/mine/val_12008.pt')