In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# put it on the module search path.
parent_dir = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Append only when absent so that re-running the cell adds no duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
import pandas as pd

In [None]:
# Load the raw COCONUT CSV export into a DataFrame (path is relative to the
# notebook's working directory).
coconut_df = pd.read_csv(
    filepath_or_buffer='../../data/raw/diffms_spectrum_db/raw/coconut_csv-03-2025.csv',
)

In [None]:
# Names of the columns to keep from the raw table.
columns_leave = ['canonical_smiles', 'standard_inchi']

In [None]:
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so assigning new columns to it later does not raise
# pandas' SettingWithCopyWarning and can never touch `coconut_df`.
coconut_df_updated = coconut_df[columns_leave].copy()

In [None]:
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import Descriptors, PandasTools

def get_formula(smiles):
    """
    Calculate the molecular formula for a SMILES string.

    Parameters:
    -----------
    smiles : str
        SMILES string representation of a molecule. Missing values
        (None, NaN, pd.NA), non-string values and empty or whitespace-only
        strings are treated as invalid input.

    Returns:
    --------
    str or None
        Molecular formula computed by RDKit's CalcMolFormula, or None when
        the input is missing/blank, RDKit cannot parse it, or an exception
        is raised while processing it.
    """
    # Only a non-blank string can be a usable SMILES. This single check covers
    # None/NaN/pd.NA (none of them is a str) and also the empty string, which
    # RDKit would otherwise parse into an atom-less molecule and turn into an
    # empty formula string instead of a missing value.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None.
            return None
        return Chem.rdMolDescriptors.CalcMolFormula(mol)
    except Exception as e:
        # Best-effort: report the problem and flag the value as invalid
        # rather than aborting a whole column-wise apply.
        print(e)
        return None

In [None]:
from tqdm import tqdm

In [None]:
# Register tqdm's progress_apply on pandas objects, labelled for this step.
tqdm.pandas(desc="Calculating formulas")

# Run get_formula on every SMILES value and store the results in a new
# 'formula' column.
coconut_df_updated['formula'] = (
    coconut_df_updated['canonical_smiles'].progress_apply(get_formula)
)

In [None]:
# Number of rows whose 'formula' value is missing.
null_vals_count = coconut_df_updated['formula'].isnull().sum()

In [None]:
# Discard every row that has a missing value in any column, then renumber
# the remaining rows from 0.
coconut_df_updated = coconut_df_updated.dropna().reset_index(drop=True)

In [None]:
# Final column labels, in output order.
columns_names = ['SMILES', 'InChI', 'Formula']

# Rename through an explicit old -> new mapping instead of assigning the
# labels positionally: each label stays attached to the right data even if
# the column order changes. rename() returns a new frame and ignores keys
# that are absent, so re-running this cell is harmless.
coconut_df_updated = coconut_df_updated.rename(
    columns=dict(zip(['canonical_smiles', 'standard_inchi', 'formula'], columns_names))
)

In [None]:
# Show the resulting frame as this cell's output.
coconut_df_updated

In [None]:
# Destination of the cleaned dataset (relative to the working directory).
output_path = '../../data/production_ready_data/mols/coconut_dataset.csv'

# to_csv does not create missing parent directories; make sure the target
# folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write without the index: it is only a row counter.
coconut_df_updated.to_csv(output_path, index=False)