In [4]:
!pip install rdkit-pypi numpy Pillow pandas seaborn matplotlib tqdm
import pandas as pd
import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import Descriptors, QED
from rdkit import RDLogger
from tqdm import tqdm






In [9]:
# Location of the CSV holding the generated molecules (expects a 'SMILES' column).
input_directory = "/content/"
input_filename = "generated_molecules.csv"
input_path = os.path.join(input_directory, input_filename)

# `read_smiles` ends up as a list of SMILES strings on success, or None when the
# file could not be read or has no 'SMILES' column. Later steps test for None.
read_smiles = None
try:
    df_input = pd.read_csv(input_path)
    if 'SMILES' not in df_input.columns:
        print(f"Error: Could not find 'SMILES' column in {input_filename}")
    else:
        # Missing cells are dropped so only real entries reach the analysis.
        read_smiles = df_input['SMILES'].dropna().tolist()
        print(f"Read {len(read_smiles)} SMILES strings from {input_path}")
except Exception as e:
    # Any read/parse failure is reported and treated as "no input available".
    print(f"Error reading file {input_path}: {e}")
    read_smiles = None

# --- Perform Analysis ---
# For every SMILES string: parse it, sanitize it, compute substructure flags,
# drug-likeness descriptors and a strict Lipinski check. Exactly one row is
# appended to `analysis_results` per input string, whether it succeeded or not,
# so the row count always matches the number of SMILES read.
analysis_results = []
if read_smiles is not None:
    print("Analyzing molecules...")

    # The substructure queries do not depend on the molecule: compile them once
    # instead of on every loop iteration.
    ring_query = Chem.MolFromSmarts('[r]')          # any ring atom
    aromatic_query = Chem.MolFromSmarts('[a]')      # any aromatic atom
    double_bond_query = Chem.MolFromSmarts('*=*')   # any double bond
    nitrogen_query = Chem.MolFromSmarts('[#7]')     # any nitrogen
    oxygen_query = Chem.MolFromSmarts('[#8]')       # any oxygen

    for smi in tqdm(read_smiles, desc="Processing SMILES"):
        mol = None
        record = None      # stays None until the full analysis has succeeded
        error_msg = None

        try:
            # Parse without sanitization first so that syntax errors (mol is None)
            # can be told apart from chemistry errors (SanitizeMol raises).
            mol = Chem.MolFromSmiles(smi, sanitize=False)
            if mol is None:
                error_msg = "Invalid SMILES Syntax"
            else:
                Chem.SanitizeMol(mol)

                # --- Calculate Boolean Flags ---
                has_ring = mol.HasSubstructMatch(ring_query)
                has_aromatic = mol.HasSubstructMatch(aromatic_query)
                has_double = mol.HasSubstructMatch(double_bond_query)
                has_n = mol.HasSubstructMatch(nitrogen_query)
                has_o = mol.HasSubstructMatch(oxygen_query)

                # --- Calculate Descriptors ---
                mw = Descriptors.MolWt(mol)
                logp = Descriptors.MolLogP(mol)
                hbd = Descriptors.NumHDonors(mol)
                hba = Descriptors.NumHAcceptors(mol)
                rotb = Descriptors.NumRotatableBonds(mol)
                tpsa = Descriptors.TPSA(mol)
                qed_val = QED.qed(mol)

                # --- Lipinski Rule of Five Check ---
                # Strict variant: a molecule passes only with zero violations.
                violations = sum((mw > 500, logp > 5, hbd > 5, hba > 10))
                passes_lipinski = (violations == 0)

                record = {
                    'SMILES': smi,
                    'Valid_Mol': True,
                    'Sanitized': True,
                    'Has_Ring': has_ring,
                    'Has_Aromatic_Ring': has_aromatic,
                    'Has_Double_Bond': has_double,
                    'Has_N': has_n,
                    'Has_O': has_o,
                    'QED': qed_val,
                    'LogP': logp,
                    'Molecular_Weight': mw,
                    'H_Donors': hbd,
                    'H_Acceptors': hba,
                    'Rotatable_Bonds': rotb,
                    'TPSA': tpsa,
                    'Passes_Lipinski': passes_lipinski,
                    'Error': None
                }

        except Exception as e:
            error_msg = f"Processing Error: {e}"

        if record is None:
            # Failure row. This also covers an exception raised after
            # sanitization succeeded (e.g. in a descriptor call); previously
            # such molecules were silently dropped. They are reported with
            # Sanitized=False so that every Sanitized=True row is guaranteed to
            # carry the full set of descriptor columns.
            record = {
                'SMILES': smi,
                'Valid_Mol': mol is not None,
                'Sanitized': False,
                'Error': error_msg
            }

        analysis_results.append(record)

# --- Calculate and Print Statistics ---
# Turns the per-molecule records into a DataFrame (`df_analysis`), keeps the
# successfully sanitized rows in `valid_df`, and prints validity counts,
# substructure frequencies, descriptor ranges, Lipinski pass rate and uniqueness.
print("\n--- Analysis Summary ---")
df_analysis = pd.DataFrame(analysis_results)

if df_analysis.empty:
    # No records at all (input could not be read, or it held no SMILES): the
    # frame has no columns, so indexing 'Sanitized'/'Valid_Mol' would raise
    # KeyError. Fall through to the "nothing to report" branch instead.
    valid_df = df_analysis.copy()
    n_valid_syntax = 0
else:
    # Filter for rows where molecule was valid AND sanitized successfully
    valid_df = df_analysis[df_analysis['Sanitized'] == True].copy()
    n_valid_syntax = df_analysis['Valid_Mol'].sum()

n_total_read = len(read_smiles) if read_smiles else 0
n_sanitized = len(valid_df)

print(f"Total SMILES read: {n_total_read}")
if n_total_read > 0:
    # Percentages are only meaningful (and division only safe) with input.
    print(f"Valid SMILES syntax: {n_valid_syntax} ({n_valid_syntax/n_total_read*100:.1f}% of read)")
    print(f"Chemically valid (Sanitized): {n_sanitized} ({n_sanitized/n_total_read*100:.1f}% of read)")

if n_sanitized > 0:
    # --- Calculate counts and percentages ---
    # All percentages below are relative to the sanitized molecules only.
    n_with_rings = valid_df['Has_Ring'].sum()
    n_with_aromatic = valid_df['Has_Aromatic_Ring'].sum()
    n_with_double_bonds = valid_df['Has_Double_Bond'].sum()
    n_with_n = valid_df['Has_N'].sum()
    n_with_o = valid_df['Has_O'].sum()
    # The flag columns can be object dtype when failure rows exist in
    # df_analysis; coerce to bool before combining them.
    n_with_heteroatoms = (valid_df['Has_N'].astype(bool) | valid_df['Has_O'].astype(bool)).sum()
    n_passing_lipinski = valid_df['Passes_Lipinski'].sum()

    print(f"\nMolecules with rings: {n_with_rings} ({n_with_rings/n_sanitized*100:.1f}%)")
    print(f"Molecules with aromatic rings: {n_with_aromatic} ({n_with_aromatic/n_sanitized*100:.1f}%)")
    print(f"Molecules with double bonds: {n_with_double_bonds} ({n_with_double_bonds/n_sanitized*100:.1f}%)")
    print(f"Molecules with nitrogen: {n_with_n} ({n_with_n/n_sanitized*100:.1f}%)")
    print(f"Molecules with oxygen: {n_with_o} ({n_with_o/n_sanitized*100:.1f}%)")
    print(f"Molecules with N or O heteroatoms: {n_with_heteroatoms} ({n_with_heteroatoms/n_sanitized*100:.1f}%)")

    # --- Compute summary statistics for numerical properties ---
    print("\nDrug-likeness metrics (Avg, Min, Max):")
    print(f"  QED: avg={valid_df['QED'].mean():.3f}, min={valid_df['QED'].min():.3f}, max={valid_df['QED'].max():.3f}")
    print(f"  LogP: avg={valid_df['LogP'].mean():.3f}, min={valid_df['LogP'].min():.3f}, max={valid_df['LogP'].max():.3f}")
    print(f"  Molecular weight: avg={valid_df['Molecular_Weight'].mean():.1f}, min={valid_df['Molecular_Weight'].min():.1f}, max={valid_df['Molecular_Weight'].max():.1f}")
    print(f"  H-bond donors: avg={valid_df['H_Donors'].mean():.1f}, min={valid_df['H_Donors'].min()}, max={valid_df['H_Donors'].max()}")
    print(f"  H-bond acceptors: avg={valid_df['H_Acceptors'].mean():.1f}, min={valid_df['H_Acceptors'].min()}, max={valid_df['H_Acceptors'].max()}")
    print(f"  Rotatable bonds: avg={valid_df['Rotatable_Bonds'].mean():.1f}, min={valid_df['Rotatable_Bonds'].min()}, max={valid_df['Rotatable_Bonds'].max()}")
    print(f"  TPSA: avg={valid_df['TPSA'].mean():.1f}, min={valid_df['TPSA'].min():.1f}, max={valid_df['TPSA'].max():.1f}")
    print(f"\n  Pass Lipinski's Rule of Five: {n_passing_lipinski} ({n_passing_lipinski/n_sanitized*100:.1f}%)")

    # --- Uniqueness ---
    # NOTE(review): compares the raw input strings; two different SMILES
    # spellings of the same molecule count as distinct. Confirm the input is
    # already canonical if structural uniqueness is what is wanted.
    unique_smiles = set(valid_df['SMILES'])
    print(f"\nUniqueness (among sanitized): {len(unique_smiles)}/{n_sanitized} ({len(unique_smiles)/n_sanitized*100:.1f}%)")

else:
    print("\nNo chemically valid (sanitized) molecules found to calculate statistics.")



Read 101 SMILES strings from /content/generated_molecules.csv
Analyzing molecules...


Processing SMILES: 100%|██████████| 101/101 [00:00<00:00, 517.50it/s]


--- Analysis Summary ---
Total SMILES read: 101
Valid SMILES syntax: 101 (100.0% of read)
Chemically valid (Sanitized): 101 (100.0% of read)

Molecules with rings: 99 (98.0%)
Molecules with aromatic rings: 11 (10.9%)
Molecules with double bonds: 83 (82.2%)
Molecules with nitrogen: 100 (99.0%)
Molecules with oxygen: 85 (84.2%)
Molecules with N or O heteroatoms: 101 (100.0%)

Drug-likeness metrics (Avg, Min, Max):
  QED: avg=0.689, min=0.438, max=0.810
  LogP: avg=1.614, min=-0.077, max=4.181
  Molecular weight: avg=224.8, min=169.3, max=288.3
  H-bond donors: avg=1.2, min=0, max=3
  H-bond acceptors: avg=3.9, min=1, max=8
  Rotatable bonds: avg=1.4, min=0, max=7
  TPSA: avg=49.9, min=3.2, max=121.1

  Pass Lipinski's Rule of Five: 101 (100.0%)

Uniqueness (among sanitized): 74/101 (73.3%)





In [10]:
# Destination of the per-molecule analysis table.
analysis_output_directory = "/content/"
analysis_output_filename = "druglikeness.csv"
analysis_output_path = os.path.join(analysis_output_directory, analysis_output_filename)

# Write only when the analysis cell has produced a non-empty DataFrame; the
# name lookup keeps this cell from failing if the analysis cell was never run.
has_results = (
    'df_analysis' in locals()
    and isinstance(df_analysis, pd.DataFrame)
    and not df_analysis.empty
)

if not has_results:
    print("\nAnalysis DataFrame ('df_analysis') not found or is empty. Cannot save CSV.")
else:
    # index=False: the positional row index carries no information.
    df_analysis.to_csv(analysis_output_path, index=False)
    print(f"\nAnalysis results saved successfully to: {analysis_output_path}")




Analysis results saved successfully to: /content/druglikeness.csv
