#### Import dependencies

In [1]:
import pandas as pd
import requests
from typing import Dict
import pickle
import json
from featurizations import featurizations
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

#### Helper functions

In [2]:
def name2smiles(SEED_df: pd.DataFrame, chemical_name) -> str:
    
    cleaned_chemical_name = chemical_name.strip(';').lower()
    
    for i in range(SEED_df.shape[0]):
    
        cpd_smiles = SEED_df.iloc[i,:]['smiles']
        cpd_aliases = SEED_df.iloc[i,:]['aliases']

        cleaned_aliases = []

        if type(cpd_aliases) == str:
            for alias in cpd_aliases.split():
                cleaned_alias = alias.strip(';+-|:').split('|')[0]
                cleaned_aliases.append(cleaned_alias)

        if cleaned_chemical_name in cleaned_aliases:
            return cpd_smiles
        
        if cleaned_chemical_name in cpd_aliases:
            return cpd_smiles

In [3]:
def get_smiles_from_aliases(SEED_dict: Dict[str,str], chemical_name: str) -> str:
    
    for i in range(0,len(SEED_dict["SMILES"])):
        
        if chemical_name in SEED_dict["aliases"][i]:
            
            return SEED_dict["SMILES"][i]

In [4]:
def get_kegg_smiles(compound_ids):
    """
    Query KEGG API for given compound IDs and return their SMILES strings.

    :param compound_ids: List of KEGG compound IDs (e.g., ['C00002', 'C00003']).
    :return: Dictionary of compound IDs and their SMILES strings.
    """
    successful_names_dict = {}
    failed_names_dict = {}
    base_url = "http://rest.kegg.jp/get/cpd:"

    for cid in compound_ids:
        url = f"{base_url}{cid}"
        response = requests.get(url)
        if response.status_code == 200:
            # Split the response into lines and iterate through them
            for line in response.text.split('\n'):
                # Check if the line contains the SMILES string
                if line.startswith("            "):  # SMILES lines start with spaces
                    # Extract the name string, which follows ' '
                    name = line.strip()
                    successful_names_dict[cid] = name
                    break  # Stop searching once the SMILES is found
        else:
            print(f"Failed to fetch data for compound ID {cid}")
            failed_names_dict[cid] = name
            
    return successful_names_dict, failed_names_dict

In [5]:
def contains_aromatic_ring_rdkit(smiles):
    # Convert the SMILES string into an RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    
    # Check if the molecule contains an aromatic ring
    aromatic_rings = rdMolDescriptors.CalcNumAromaticRings(mol)
    
    return aromatic_rings > 0

#### Read in relevant datasets

In [6]:
SEED_raw_filepath = '../data/SEED/SEED_IDs.csv'
SEED_neutralized = '../data/SEED/SEED_neutralized.tsv'
validation_molecules_filepath = '../data/bio-based-chemicals/bit26779-sup-0005-supmat.xlsx'
MCF2_db_filepath = '../data/MCF2/mcf_chem.jsonl'

In [7]:
data = []
with open(MCF2_db_filepath, 'r') as file:
    for line in file:
        for i, split_line in enumerate(line.split()):
            if "canonical_smiles" in split_line:
                smiles_index = i + 1        
        # data.append(json.loads(line))

In [8]:
# read in the raw SEED database while dropping the column comprising of smiles with charge info on them
SEED_raw_df = pd.read_csv(SEED_raw_filepath).drop(labels=['Unnamed: 0','smiles'], axis = 1)

# then, read in the SEED database which contains neutralized SMILES rather than charged SMILES strings
SEED_neutralized_df = pd.read_csv(SEED_neutralized,delimiter='\t')

# merge both dataframes using rows with matching SEED IDs
SEED_df = pd.merge(SEED_raw_df, SEED_neutralized_df, left_on='SEED ID', right_on='cpd')

validation_molecules_ecoli = list(pd.read_excel(validation_molecules_filepath, sheet_name = 'E.coli',skiprows=2)['KEGG ID'])
validation_molecules_yeast = list(pd.read_excel(validation_molecules_filepath, sheet_name = 'S.cerevisiae',skiprows=2)['KEGG ID'])
validation_molecules_unique = set(validation_molecules_ecoli + validation_molecules_yeast)

# consider only molecules in the reported dataset with valid KEGG IDs
validation_molecules_w_KEGG_IDs = list({s for s in validation_molecules_unique if s and str(s).startswith('C') and not str(s).startswith('CPD')})

leftover_set = {s for s in validation_molecules_unique if not (s and str(s).startswith('C') and not str(s).startswith('CPD'))}

In [9]:
successful_names_dict, failed_names_dict = get_kegg_smiles( validation_molecules_w_KEGG_IDs )

In [11]:
all_KEGG_IDs = []
all_molecule_names = []
all_molecule_smiles = []

for KEGG_ID in successful_names_dict.keys():
    
    molecule_name = successful_names_dict[KEGG_ID].strip(';')
    molecule_smiles = name2smiles(SEED_df, molecule_name)
    
    all_KEGG_IDs.append(KEGG_ID)
    all_molecule_names.append(molecule_name)
    all_molecule_smiles.append(featurizations.compound(molecule_smiles).remove_stereo())

In [12]:
validation_molecules_df_processed = pd.DataFrame({'KEGG IDs': all_KEGG_IDs,
                                                  'Name': all_molecule_names,
                                                  'SMILES': all_molecule_smiles})

In [13]:
validation_molecules_df_processed

Unnamed: 0,KEGG IDs,Name,SMILES
0,C00479,Propionaldehyde,CC(O)C=O
1,C07112,Methylphenyl carbinol,CC(O)c1ccccc1
2,C00488,Methanamide,NC=O
3,C12537,"P ANTIPARASITIC PRODUCTS, INSECTICIDES AND REP...",
4,C06677,Tosylate,Cc1ccc(S(=O)(=O)O)cc1
...,...,...,...
187,C06424,Tetradecanoate,CCCCCCCCCCCCCC(=O)O
188,C00116,Glycerin,OCC(O)CO
189,C18248,Ethyl chloride,CCCl
190,C10700,(4-Hydroxyphenyl)ethan-1-one,CC(=O)c1ccc(O)cc1


In [14]:
allowed_atoms = {'C', 'O', 'N', 'S', 'P', 'H'}

rows_to_keep = []
rows_to_discard = []

for i in range(0,validation_molecules_df_processed.shape[0]):
        smi = validation_molecules_df_processed.iloc[i,:]['SMILES']
        
        # see if a SMILES string can be converted to an RDkit mol object
        try:
            mol = Chem.MolFromSmiles(smi)
            
            # first, ensure compound has no aromatic rings
            if not contains_aromatic_ring_rdkit(smi):
                
                # then, check if atoms in this compound fall within our allowed list 
                atoms = set(atom.GetSymbol() for atom in mol.GetAtoms())
                if atoms.issubset(allowed_atoms):
                    rows_to_keep.append(i)
                
                
            # discard compound if aromatic rings are present
            else:
                rows_to_discard.append(i)
        
        # discard compound if it cannot be converted to an RDkit mol object
        except TypeError:
            rows_to_discard.append(i)

In [15]:
final_validation_set = validation_molecules_df_processed.iloc[rows_to_keep]

In [16]:
final_validation_set.head(40)

Unnamed: 0,KEGG IDs,Name,SMILES
0,C00479,Propionaldehyde,CC(O)C=O
2,C00488,Methanamide,NC=O
5,C00854,Hexalin,OC1CCCCC1
6,C01548,Ethyne,C#C
11,C11145,Methanesulfonate,CS(=O)(=O)O
13,C00072,Ascorbic acid,O=C1OC(C(O)CO)C(O)=C1O
14,C00158,Citric acid,O=C(O)CC(O)(CC(=O)O)C(=O)O
15,C14710,2-Methyl-1-propanol,CC(C)CO
17,C00067,Methanal,C=O
22,C00489,Glutaric acid,NC(CCC(=O)O)C(=O)O


In [17]:
final_validation_set.to_excel('validation_molecules_processed.xlsx')