<a href="https://colab.research.google.com/github/Kienknu/Kienknu/blob/main/SMILES_from_chemical_space_Option_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdkit
!pip install molmass

Collecting rdkit
  Downloading rdkit-2025.9.5-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading rdkit-2025.9.5-cp312-cp312-manylinux_2_28_x86_64.whl (36.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.7/36.7 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.5
Collecting molmass
  Downloading molmass-2026.1.8-py3-none-any.whl.metadata (5.8 kB)
Downloading molmass-2026.1.8-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.8/73.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: molmass
Successfully installed molmass-2026.1.8


In [2]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import rdkit
from IPython.display import display
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, rdChemReactions
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.ML.Cluster import Butina

**Comparison**

**Step 1: Loading Files**


In [13]:
import glob
import json

#------- Loading chemical_space files-----#

def load_chemical_space(pattern):
    """Collect the unique SMILES strings stored in the files matching ``pattern``.

    Every non-blank line of every matching file is treated as one SMILES
    string; surrounding whitespace is stripped and duplicates are merged.

    Args:
        pattern: Glob pattern selecting the chunk files to read.

    Returns:
        A ``(smiles, files)`` tuple: the set of unique non-empty lines found
        across all files, and the sorted list of file paths that were read.
    """
    # glob returns paths in arbitrary order; sorting keeps the log reproducible.
    files = sorted(glob.glob(pattern))
    print(f"Found {len(files)} chunk files matching '{pattern}'.")

    smiles = set()
    for file_path in files:
        print(f"Loading SMILES from: {file_path}")
        # Pin the encoding so the read does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            smiles.update(smi for smi in stripped_lines if smi)
    return smiles, files


file_pattern = 'simulated_chemical_space_*.txt'
chemical_space, chunk_files = load_chemical_space(file_pattern)

print(f"Total unique SMILES in combined chemical_space: {len(chemical_space)}")


Found 7 chunk files matching 'simulated_chemical_space_*.txt'.
Loading SMILES from: simulated_chemical_space_chunk_3_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_5_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_0_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_4_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_2_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_6_cresol.txt
Loading SMILES from: simulated_chemical_space_chunk_1_cresol.txt
Total unique SMILES in combined chemical_space: 2331663


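**Step 2: Search the chemical space** — type `formula` to list every SMILES that has a given molecular formula, or `smiles` to select a single product by its SMILES string.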

In [None]:
# Keep prompting until the user names one of the two supported search modes.
search_mode = None
while True:
    user_choice = input("type 'formula' or 'smiles': ").lower()
    if user_choice == 'formula' or user_choice == 'smiles':
        search_mode = user_choice
        break
    print("Invalid input. Please type 'formula' or 'smiles'.")

# Stays None unless the 'smiles' branch later parses a valid query.
product_smiles_to_trace = None

def calculate_formula_from_smiles(smiles):
    """Return the molecular formula for a SMILES string.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        The formula string produced by RDKit's ``CalcMolFormula``, or ``None``
        when RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Use the explicitly imported submodule rather than reaching it through
    # ``Chem.rdMolDescriptors``, which only resolves if another import has
    # already loaded that submodule as a side effect.
    return rdMolDescriptors.CalcMolFormula(mol)

if search_mode == 'formula':
    # Trim stray whitespace so a padded entry can still equal a formula string.
    formula_to_search = input('Enter the formula to search:').strip()

    # Parse every SMILES exactly once: parsing is the expensive step, so the
    # set of all formulas and the list of matches are built in the same pass.
    chemical_space_formulas = set()
    matching_smiles = []
    for smi in chemical_space:
        formula = calculate_formula_from_smiles(smi)
        if formula is None:
            continue  # unparsable SMILES: no formula to record
        chemical_space_formulas.add(formula)
        if formula == formula_to_search:
            matching_smiles.append(smi)

    if formula_to_search in chemical_space_formulas:
        print(f"At least one molecule with the formula '{formula_to_search}' is found in the chemical space.")
    else:
        print(f"No molecule with the formula '{formula_to_search}' is found in the chemical space.")

    # List every SMILES that has the requested formula.
    if matching_smiles:
        print(f"\nFound the following SMILES with the formula '{formula_to_search}':")
        for smi in matching_smiles:
            print(smi)
    else:
        print("No SMILES found for the given formula.")

elif search_mode == 'smiles':
    smiles_input = input('Enter the SMILES string for the product to trace pathways for: ').strip()
    mol_from_input = Chem.MolFromSmiles(smiles_input)
    if mol_from_input is not None:
        # Canonicalise the query so it is written the way RDKit would write it.
        product_smiles_to_trace = Chem.MolToSmiles(mol_from_input, canonical=True)
        print(f"Using canonical SMILES for tracing: {product_smiles_to_trace}")
        matching_smiles = [product_smiles_to_trace]
    else:
        print(f"Invalid SMILES input: '{smiles_input}'. Could not parse. No pathways will be traced.")
        matching_smiles = []

type 'formula' or 'smiles': formula
Enter the formula to search:C26H41NO18


In [15]:
import pandas as pd

# Choose the label used in the output filename. A formula is only entered in
# 'formula' mode; in 'smiles' mode the name is undefined (or stale from an
# earlier run), so fall back to a generic label instead of raising NameError.
if globals().get('search_mode') != 'smiles' and 'formula_to_search' in globals():
    output_label = formula_to_search
else:
    output_label = 'smiles_query'
output_excel_filename = f'{output_label}_VOCs_name.xlsx'

if 'matching_smiles' not in globals():
    # The search cell has not been run in this kernel.
    print("No 'matching_smiles' list found.")
elif not matching_smiles:
    # The search ran but produced no hits, so there is nothing to export.
    print("'matching_smiles' is empty; nothing to save.")
else:
    try:
        output_df = pd.DataFrame(matching_smiles, columns=['Matching SMILES'])

        output_df.to_excel(output_excel_filename, index=False)
        print(f"Successfully saved {len(matching_smiles)} SMILES to '{output_excel_filename}'.")

        print("\nFirst 5 rows of the saved data:")
        display(output_df.head())
    except Exception as e:
        # Best effort: report the failure without aborting the notebook run.
        print(f"Error saving SMILES to Excel file: {e}")

Successfully saved 140 SMILES to 'C26H41NO18_VOCs_name.xlsx'.

First 5 rows of the saved data:


Unnamed: 0,Matching SMILES
0,CC(C=O)C(O)C(OC1C(O)C(C)C=CC1(O)OO)C(O)OC(O)C(...
1,CC1C=CC(O)(OC(O)C=CC(C)(CO)OO)C(OOC2C(O)C(C)C=...
2,CC(C=O)C(O)C(OC1C(O)C(C)C=CC1(O)OO)C(O)OC(O)C(...
3,CC(C=O)(OC1(C)C=CC(O)(OC2(O)CC(O)C(C)(OC(C)(C=...
4,CC1C=CC(O)(OC(O)C=CC(C)(CO)OO)C(OOC2C(O)C(C[N+...
