In [1]:
import pandas as pd
from tqdm import tqdm
import json
import os
from rdkit import Chem
from rdkit import RDLogger
# Raise the RDKit logger threshold to CRITICAL so that lower-severity
# messages emitted by RDKit are not shown in the notebook output.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [8]:
# Load the CANOPUS label table (tab-separated). Later cells read its
# 'spec' and 'smiles' columns.
canopus_labels = pd.read_csv('../../data/raw/canopus_diffms/labels.tsv', sep='\t')

In [None]:
# Names of the label-table columns that are removed in the next cell.
columns_drop = ['dataset', 'name', 'instrument', 'ionization', 'formula', 'inchikey']

In [None]:
# Remove the unneeded columns; names absent from the frame are skipped
# silently because of errors='ignore'.
canopus_labels = canopus_labels.drop(columns=columns_drop, errors='ignore')

In [10]:
def read_spec_canopus(file_name, data_path):
    """
    Load the parsed JSON content stored for one ``spec`` identifier.

    The file is first looked up as ``<data_path>/<file_name>``. If that path
    does not exist and does not already end in ``.json``, the ``.json``
    suffix is appended and the lookup is retried.

    Parameters:
    -----------
    file_name : str
        The filename or identifier from the spec field, with or without the
        ``.json`` extension. Non-path values (e.g. None or NaN coming from a
        missing cell) are treated as "file not found".
    data_path : str
        The directory path where the JSON files are stored

    Returns:
    --------
    dict or None
        The object decoded by ``json.load`` (a dict when the file holds a JSON
        object), or None if the file is not found or cannot be read/parsed.
        A message is printed in every None case.
    """

    # A missing spec value is not a valid path component: os.path.join would
    # raise TypeError and abort a whole DataFrame.apply, so report and skip it.
    if not isinstance(file_name, (str, os.PathLike)):
        print(f"Warning: JSON file not found for spec: {file_name}")
        return None

    # Construct the full path to the JSON file
    json_path = os.path.join(data_path, file_name)

    # Fall back to "<path>.json" when the path does not exist as given
    if not os.path.exists(json_path):
        if not json_path.endswith('.json'):
            json_path = f"{json_path}.json"

        if not os.path.exists(json_path):
            print(f"Warning: JSON file not found for spec: {file_name}")
            return None

    # Read and parse the JSON file. JSON text is UTF-8 by specification, so the
    # encoding is pinned instead of relying on the platform default.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # Best-effort loader: any read/decode failure is reported and mapped to
    # None so that one bad file does not stop processing of the other rows.
    except Exception as e:
        print(f"Error processing {json_path}: {e}")
        return None

In [11]:
# Directory that read_spec_canopus searches for the per-spec JSON files.
data_path = '../../data/raw/canopus_diffms/subformulae/subformulae_default'

# For every row, store the parsed JSON belonging to its 'spec' value
# (None when the file is missing or unreadable).
canopus_labels['extracted_spectral_info'] = canopus_labels['spec'].apply(
    read_spec_canopus, args=(data_path,)
)

In [None]:
def _smiles_to_flat_inchi(smiles):
    """
    Convert one SMILES string to an InChI with stereochemistry removed.

    The molecule is written back to a non-isomeric SMILES and re-parsed
    before the InChI is generated, which discards stereo information.

    Returns None when the value is not a string (e.g. a missing cell) or
    when RDKit cannot parse it, instead of raising.
    """
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # RDKit signals a parse failure by returning None
        return None
    flat_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)  # remove stereochemistry information
    flat_mol = Chem.MolFromSmiles(flat_smiles)
    if flat_mol is None:
        return None
    return Chem.MolToInchi(flat_mol)


# Iterate over the column values directly rather than label-indexing with
# range(len(...)): that only works while the frame has a default RangeIndex.
canopus_inchis = [
    _smiles_to_flat_inchi(smiles)
    for smiles in tqdm(
        canopus_labels["smiles"],
        desc="Converting CANOPUS SMILES to InChI",
        leave=False,
    )
]

# Make conversion failures visible instead of aborting on the first bad row.
n_failed_inchi = sum(inchi is None for inchi in canopus_inchis)
if n_failed_inchi:
    print(f"Warning: {n_failed_inchi} SMILES could not be converted to InChI")

canopus_labels['InChI'] = canopus_inchis

In [None]:
# Write the processed label table to CSV; index=False leaves the DataFrame
# index out of the file.
canopus_labels.to_csv('../../data/production_ready_data/NPLIB1_canopus.csv',index=False)