In [1]:
import sys
import os

# Put the directory two levels above the current working directory on the
# import path (presumably the project root that contains the `src` package
# imported in the next cell — verify). The relative path is resolved against
# the process working directory, not against this notebook's location.
parent_dir = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(parent_dir) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(parent_dir)

In [2]:
from src.utils import process_mgf_file, process_smiles_dataframe, extract_casmi_spectra
from tqdm import tqdm
import pandas as pd

In [3]:
# Input files for this notebook, both under data/raw/casmi22 and both given
# relative to the working directory.
# Labels table (TSV) that is handed to extract_casmi_spectra together with the MGF file.
casmi_labels_file = '../../data/raw/casmi22/CASMI_labels.tsv'
# MGF file that process_mgf_file / extract_casmi_spectra read.
mgf_file = '../../data/raw/casmi22/CASMI_processed.mgf'

In [4]:
# Parse the MGF file on its own.
# NOTE(review): `result` is not referenced by any later cell visible in this
# notebook, and the next cell passes `mgf_file` to extract_casmi_spectra again
# (its output shows a second "Parsing blocks" pass) — confirm whether this
# call is still needed.
result = process_mgf_file(mgf_file)

Parsing blocks:   0%|          | 0/304 [00:00<?, ?it/s]

Parsing blocks: 100%|██████████| 304/304 [00:00<00:00, 19207.18it/s]


In [5]:
# Build the working frame from the labels TSV and the MGF file.
# NOTE(review): presumably this joins each label row with its parsed spectrum;
# the helper lives in src.utils and is not visible here — confirm there.
df = extract_casmi_spectra(casmi_labels_file,mgf_file)

Parsing blocks: 100%|██████████| 304/304 [00:00<00:00, 19055.05it/s]


In [6]:
# Columns removed from the frame before further processing.
columns_drop = ['dataset', 'name', 'inchikey', 'instrument', 'formula']

# errors='ignore' makes the drop skip any listed column that is absent instead
# of raising, so re-running this cell on the already-trimmed frame is a no-op.
df = df.drop(columns=columns_drop, errors='ignore')

In [7]:
# Display the frame after the column drop (last expression of the cell is rendered).
df

Unnamed: 0,spec,ionization,smiles,spectral_data
0,3,[M+H]+,CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...,"{'mz': [50.5921, 50.651, 51.1588, 51.7146, 51...."
1,4,[M+Na]+,CC(C)CC(C(=O)O)NC(=O)C(C(C)C)OC(=O)C(C(C)C)NC(...,"{'mz': [51.0578, 51.9172, 51.9195, 51.9476, 52..."
2,5,[M+Na]+,CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...,"{'mz': [51.2449, 53.0387, 55.0545, 57.0696, 67..."
3,6,[M+NH4]+,CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...,"{'mz': [79.0542, 81.0287, 81.0335, 81.0698, 81..."
4,9,[M+H]+,CN(C)CCC1=CNC2=C1C=C(C=C2)CS(=O)(=O)N3CCCC3,"{'mz': [50.2834, 56.0497, 57.0577, 58.058, 58...."
...,...,...,...,...
299,492,[M+NH4]+,CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...,"{'mz': [50.4192, 52.9216, 74.1355, 75.7308, 76..."
300,493,[M+NH4]+,CC=CC1=CC(=C(C(=C1)OC)OC(C)C(C2=CC(=C(C(=C2)OC...,"{'mz': [53.0389, 55.0181, 55.3369, 56.3095, 57..."
301,494,[M+NH4]+,COC(=O)C=CC1=CC=C(C=C1)OC2C(C(C(C(O2)CO)O)O)O,"{'mz': [50.4232, 51.3694, 52.1419, 52.6558, 53..."
302,495,[M+NH4]+,COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...,"{'mz': [51.2225, 51.5258, 53.0388, 54.5814, 56..."


In [8]:
# Run the project SMILES helper over the 'smiles' column. The `.columns`
# listing in the following cell shows `InChI` and `Formula` present afterwards.
# NOTE(review): whether the helper returns a copy or modifies `df` in place is
# not visible from this notebook — confirm in src.utils.
df_updated = process_smiles_dataframe(df,'smiles')

Extracting SMILES from dataframe
Processing 304 SMILES strings using 128 processes...


Processing SMILES: 100%|██████████| 304/304 [00:00<00:00, 10047.42it/s]

Successfully processed 304 of 304 entries





In [9]:
# List the column names available after SMILES processing.
df_updated.columns

Index(['spec', 'ionization', 'smiles', 'spectral_data', 'InChI', 'Formula'], dtype='object')

In [10]:
# Build one info dict per row, bundling the formula, the ionization and the
# spectral data under the keys 'cand_form', 'cand_ion' and 'output_tbl'.
#
# The three columns are zipped directly instead of going through iterrows():
# the row index was never used, and iterrows() materialises a Series per row
# just to read three fields. `total` is passed explicitly because tqdm cannot
# infer a length from a zip/generator and would otherwise only show a running
# count with no percentage or ETA.
extracted_spectral_info_list = [
    {
        'cand_form': cand_form,
        'cand_ion': cand_ion,
        'output_tbl': output_tbl,
    }
    for cand_form, cand_ion, output_tbl in tqdm(
        zip(
            df_updated['Formula'],
            df_updated['ionization'],
            df_updated['spectral_data'],
        ),
        total=len(df_updated),
        desc="Row processing",
    )
]

Row processing: 304it [00:00, 23978.27it/s]


In [12]:
# Attach the per-row info dicts as a new column. A plain list is matched to
# rows by position, so it must contain exactly one entry per row of the frame.
df_updated['extracted_spectral_info'] = extracted_spectral_info_list

In [13]:
# These two columns are removed now that their values have been copied into
# the per-row dicts stored in `extracted_spectral_info`.
columns_drop = ['ionization', 'spectral_data']

# errors='ignore' makes the drop skip any listed column that is absent instead
# of raising.
df_updated = df_updated.drop(columns=columns_drop, errors='ignore')

In [14]:
# Display the final frame before it is written to disk.
df_updated 

Unnamed: 0,spec,smiles,InChI,Formula,extracted_spectral_info
0,3,CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...,InChI=1S/C35H42O16/c1-14(2)6-11-19-20(38)12-21...,C35H42O16,"{'cand_form': 'C35H42O16', 'cand_ion': '[M+H]+..."
1,4,CC(C)CC(C(=O)O)NC(=O)C(C(C)C)OC(=O)C(C(C)C)NC(...,InChI=1S/C22H40N2O7/c1-11(2)9-15(21(28)29)23-2...,C22H40N2O7,"{'cand_form': 'C22H40N2O7', 'cand_ion': '[M+Na..."
2,5,CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...,InChI=1S/C26H36O8/c1-13-9-19-23(31)15(3)12-26(...,C26H36O8,"{'cand_form': 'C26H36O8', 'cand_ion': '[M+Na]+..."
3,6,CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...,InChI=1S/C54H84O22/c1-23-11-16-54(18-17-52(7)2...,C54H84O22,"{'cand_form': 'C54H84O22', 'cand_ion': '[M+NH4..."
4,9,CN(C)CCC1=CNC2=C1C=C(C=C2)CS(=O)(=O)N3CCCC3,InChI=1S/C17H25N3O2S/c1-19(2)10-7-15-12-18-17-...,C17H25N3O2S,"{'cand_form': 'C17H25N3O2S', 'cand_ion': '[M+H..."
...,...,...,...,...,...
299,492,CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...,InChI=1S/C21H32O8/c1-10-4-5-15(29-20-18(25)17(...,C21H32O8,"{'cand_form': 'C21H32O8', 'cand_ion': '[M+NH4]..."
300,493,CC=CC1=CC(=C(C(=C1)OC)OC(C)C(C2=CC(=C(C(=C2)OC...,InChI=1S/C25H32O8/c1-9-10-17-11-19(27-4)25(20(...,C25H32O8,"{'cand_form': 'C25H32O8', 'cand_ion': '[M+NH4]..."
301,494,COC(=O)C=CC1=CC=C(C=C1)OC2C(C(C(C(O2)CO)O)O)O,InChI=1S/C16H20O8/c1-22-12(18)7-4-9-2-5-10(6-3...,C16H20O8,"{'cand_form': 'C16H20O8', 'cand_ion': '[M+NH4]..."
302,495,COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...,InChI=1S/C24H32O15/c1-34-22-19-10(4-5-35-19)6-...,C24H32O15,"{'cand_form': 'C24H32O15', 'cand_ion': '[M+NH4..."


In [43]:
# Write the final frame as a comma-separated file without the index column.
# NOTE(review): the directory name "spectrs" may be a typo for "spectra" —
# confirm against the actual folder layout before changing it.
# NOTE(review): `extracted_spectral_info` holds dicts; CSV output stores them
# as their string representation, so readers will need to parse them back.
output_csv = '../../data/production_ready_data/spectrs/casmi22.csv'
df_updated.to_csv(output_csv, sep=',', index=False)