In [None]:
import glob

import pandas as pd
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, rdMolDescriptors

In [None]:
# Directory that receives the generated .ms spectrum files.
OUTPUT_DIRECTORY = '../../data/paired_spectra/casmi2022/spec_files'
# parents=True creates any missing intermediate folders (e.g. on a fresh checkout);
# exist_ok=True keeps this cell safe to re-run.
Path(OUTPUT_DIRECTORY).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the pickled spectra table.
# NOTE: unpickling can execute arbitrary code, so only point this at a trusted file.
df = pd.read_pickle('../../data/paired_spectra/casmi2022/processed_massformer/spec_df.pkl')
print(df.shape[0])  # number of rows loaded
df.head()

In [None]:
# Keep only the rows whose prec_type is '[M+H]+'.
# .copy() yields an independent frame, so adding columns to it later does not
# write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
hydrogen_adduct = df[df['prec_type'] == '[M+H]+'].copy()
len(hydrogen_adduct)

In [None]:
# Derive RDKit-based columns from the SMILES strings. DataFrame.assign returns a
# new frame, so nothing is written into a slice of another frame (no
# SettingWithCopyWarning) and the cell can be re-run safely.
hydrogen_adduct = hydrogen_adduct.assign(
    mol=lambda frame: frame['smiles'].apply(Chem.MolFromSmiles),
)

# Chem.MolFromSmiles returns None for SMILES it cannot parse; stop here and name
# the offending strings rather than failing obscurely in the calls below.
unparsed_smiles = hydrogen_adduct.loc[hydrogen_adduct['mol'].isna(), 'smiles']
if not unparsed_smiles.empty:
    raise ValueError(
        f'RDKit could not parse {len(unparsed_smiles)} SMILES, '
        f'e.g. {unparsed_smiles.tolist()[:5]}'
    )

# Morgan fingerprint generator: radius 2, 4096-bit vectors.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=4096)

# Columns are appended in the same order as before: inchikey, formula, fingerprint.
hydrogen_adduct = hydrogen_adduct.assign(
    inchikey=lambda frame: frame['mol'].apply(Chem.MolToInchiKey),
    formula=lambda frame: frame['mol'].apply(rdMolDescriptors.CalcMolFormula),
    fingerprint=lambda frame: frame['mol'].apply(mfpgen.GetFingerprint),
)
hydrogen_adduct.head()

In [None]:
def compound_to_ms_file(row, output_directory=None):
    """Write one spectrum record to a ``casmi2022_<casmi_id>.ms`` text file.

    The file holds a block of ``>``/``#`` header lines, a blank line, a ``>ms2``
    marker, and then one ``"<peak[0]> <peak[1]>"`` line per entry of
    ``row["peaks"]``. An existing file with the same name is overwritten.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys ``casmi_id``, ``formula``, ``prec_mz``, ``rt``,
        ``prec_type``, ``inchikey``, ``smiles``, ``spec_id`` and ``peaks``.
        ``peaks`` is iterated, and elements 0 and 1 of each item are written.
    output_directory : str or path-like, optional
        Folder to write into. Defaults to the module-level ``OUTPUT_DIRECTORY``,
        which keeps ``DataFrame.apply(compound_to_ms_file, axis=1)`` working.

    Returns
    -------
    None
    """
    if output_directory is None:
        # Resolved at call time so a changed OUTPUT_DIRECTORY is picked up.
        output_directory = OUTPUT_DIRECTORY

    compound_name = f'casmi2022_{row["casmi_id"]}'

    header_lines = [
        f'>compound {compound_name}',
        f'>formula {row["formula"]}',
        f'>parentmass {row["prec_mz"]}',
        f'>rt {row["rt"]}',
        f'>Ionization {row["prec_type"]}',
        # The InChI field is always written as the literal text "None".
        '>InChI None',
        f'>InChIKey {row["inchikey"]}',
        f'#smiles {row["smiles"]}',
        f'#scans {row["spec_id"]}',
        '',  # blank line between the header and the spectrum section
        '>ms2',
    ]

    # Explicit encoding so the output does not depend on the platform locale.
    with open(f'{output_directory}/{compound_name}.ms', 'w', encoding='utf-8') as f:
        f.write('\n'.join(header_lines) + '\n')
        for peak in row['peaks']:
            f.write(f'{peak[0]} {peak[1]}\n')

In [None]:
# Write one .ms file per row of hydrogen_adduct. The trailing semicolon suppresses
# the notebook echo of the returned Series, which contains only None values.
hydrogen_adduct.apply(compound_to_ms_file, axis=1);

In [None]:
# Create the labels file: one (dataset, spec) row per .ms file in OUTPUT_DIRECTORY.
# glob returns paths in arbitrary order, so sort them for a reproducible file.
ms_files = sorted(glob.glob(f'{OUTPUT_DIRECTORY}/*.ms'))

# Path(...).name takes the file name with either path separator; the spec name is
# the part of the file name before the first dot.
# All records are collected first and the frame is built once, rather than
# growing it row by row.
# NOTE(review): this rebinds `df`, replacing the spectra table loaded earlier;
# re-run the load cell before re-running any cell that filters it.
df = pd.DataFrame(
    [
        {'dataset': 'casmi2022', 'spec': Path(ms_file).name.split('.')[0]}
        for ms_file in ms_files
    ],
    columns=['dataset', 'spec'],
)

df.to_csv('../../data/paired_spectra/casmi2022/labels.tsv', sep="\t", index=False)

In [None]:
# Create the true-labels file: for every .ms file in OUTPUT_DIRECTORY, look up the
# matching row of hydrogen_adduct (by casmi_id) and record its annotations.
# glob returns paths in arbitrary order, so sort them for a reproducible file.
ms_files = sorted(glob.glob(f'{OUTPUT_DIRECTORY}/*.ms'))

# Index the annotations by casmi_id once instead of scanning the whole frame for
# every file. keep='first' matches the previous behaviour of taking the first
# matching row when a casmi_id occurs more than once.
metadata_by_id = (
    hydrogen_adduct
    .drop_duplicates(subset='casmi_id', keep='first')
    .set_index('casmi_id')[['prec_type', 'formula', 'smiles', 'inchikey']]
)

records = []
for ms_file in ms_files:

    # File name without directory (works with either path separator), cut at the
    # first dot, e.g. "casmi2022_<casmi_id>".
    spec = Path(ms_file).name.split('.')[0]

    # The numeric id is the text after the last underscore.
    spec_id = int(spec.split('_')[-1])
    if spec_id not in metadata_by_id.index:
        # Typically a stale .ms file left in the folder by an earlier run.
        raise LookupError(
            f'{ms_file}: no row in hydrogen_adduct has casmi_id == {spec_id}'
        )
    metadata = metadata_by_id.loc[spec_id]

    records.append({'dataset': 'casmi2022',
                    'spec': spec,
                    'name': '',
                    'ionization': metadata['prec_type'],
                    'formula': metadata['formula'],
                    'smiles': metadata['smiles'],
                    'inchikey': metadata['inchikey']})

# Build the frame once from all records; `columns` fixes the column order and
# still yields a header-only file when there are no .ms files.
# NOTE(review): this rebinds `df`, replacing whatever table it held before.
df = pd.DataFrame(
    records,
    columns=['dataset', 'spec', 'name', 'ionization', 'formula', 'smiles', 'inchikey'],
)

df.to_csv('../../data/paired_spectra/casmi2022/labels_true.tsv', sep="\t", index=False)