PHASE B

In [None]:
# Install RDKit (necessary in Google Colab environment)
!pip install rdkit

import pandas as pd
from rdkit import Chem

def _parse_smiles(value):
    """Parse one raw SMILES cell into an RDKit Mol, or return None if unusable.

    A cell is treated as unusable when it is missing (None / NaN / pd.NA),
    blank after stripping whitespace, rejected by the RDKit parser, or parsed
    into a molecule with zero atoms.
    """
    # Reject missing cells up front instead of stringifying them (e.g. "nan")
    # and relying on the SMILES parser to fail on the resulting text.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None

    text = str(value)
    if not text.strip():
        return None

    mol = Chem.MolFromSmiles(text)
    # RDKit yields an empty, zero-atom Mol (not None) for an empty SMILES
    # string; such a "molecule" carries no structure and must not count as valid.
    if mol is None or mol.GetNumAtoms() == 0:
        return None
    return mol


def run_phase_b():
    """
    Executes Phase B: Molecular Cleaning, Canonicalization, and Deduplication.
    Assumes df_raw and SMILES_COL exist globally; df_raw itself is never modified.

    Rows are removed when their SMILES is missing, blank, unparsable by RDKit,
    or parses to a zero-atom molecule. The remaining rows get a
    'canonical_smiles' column and are deduplicated on it (first occurrence kept).

    Returns:
        df_clean (pd.DataFrame): The cleaned dataset, with a fresh 0..n-1 index.
        summary (dict): Cleaning statistics with keys 'initial_rows',
            'invalid_smiles_removed', 'duplicates_removed', 'final_rows'.

    Raises:
        AssertionError: If df_raw or SMILES_COL is not defined globally.
        KeyError: If SMILES_COL is not a column of df_raw.
    """
    print("--- Starting PHASE B: Molecular Cleaning, Canonicalization, and Deduplication ---")

    # 16. Defensive Programming: Verify Pre-conditions
    # (Checking strictly against global scope as this is running in a notebook context)
    assert 'df_raw' in globals(), "CRITICAL ERROR: df_raw not found in global scope. Run Phase A first."
    assert 'SMILES_COL' in globals(), "CRITICAL ERROR: SMILES_COL not found in global scope."
    # Fail early with a readable message; the column lookup below would raise
    # the same exception type, only later and with less context.
    if SMILES_COL not in df_raw.columns:
        raise KeyError(f"SMILES column {SMILES_COL!r} not found in df_raw.")

    # 6. Ensure df_raw remains unchanged by working on a copy
    df_working = df_raw.copy()
    initial_count = len(df_working)
    print(f"[Status] Initial rows from df_raw: {initial_count}")

    # 8. Convert SMILES to RDKit Mol objects
    print("[Processing] Converting SMILES to RDKit Mol objects...")
    # Unusable entries (missing, blank, unparsable, zero-atom) come back as None
    df_working['mol_temp'] = df_working[SMILES_COL].apply(_parse_smiles)

    # 9. Identify invalid SMILES
    # Rows where 'mol_temp' is None are invalid
    invalid_mask = df_working['mol_temp'].isnull()
    num_invalid = int(invalid_mask.sum())

    # 10. Remove rows with invalid SMILES
    if num_invalid > 0:
        print(f"[Cleaning] Found {num_invalid} invalid SMILES. Removing...")
        df_working = df_working[~invalid_mask].copy()
    else:
        print("[Cleaning] No invalid SMILES found.")

    # 11. Canonicalize valid molecules
    print("[Processing] Generating canonical SMILES...")
    # canonical=True is passed explicitly so the dedup key does not depend on
    # the input atom ordering of equivalent SMILES strings
    df_working['canonical_smiles'] = df_working['mol_temp'].apply(
        lambda mol: Chem.MolToSmiles(mol, canonical=True)
    )

    # 12. Remove duplicate molecules based on 'canonical_smiles'
    rows_before_dedup = len(df_working)
    df_working = df_working.drop_duplicates(subset=['canonical_smiles'])
    num_duplicates = rows_before_dedup - len(df_working)

    if num_duplicates > 0:
        print(f"[Cleaning] Found {num_duplicates} duplicate molecules. Removing...")
    else:
        print("[Cleaning] No duplicates found.")

    # 13. Drop temporary RDKit-only columns
    df_working = df_working.drop(columns=['mol_temp'])

    # Finalize df_clean
    df_final = df_working.reset_index(drop=True)
    final_count = len(df_final)

    # 15. Store summary statistics (plain ints so the dict prints/serializes cleanly)
    summary = {
        'initial_rows': initial_count,
        'invalid_smiles_removed': int(num_invalid),
        'duplicates_removed': int(num_duplicates),
        'final_rows': final_count
    }

    # 14. Print Cleaning Summary
    print("\n[Summary] CLEANING REPORT:")
    print(f" -> Initial Rows:      {summary['initial_rows']}")
    print(f" -> Invalid Removed:   {summary['invalid_smiles_removed']}")
    print(f" -> Duplicates Removed:{summary['duplicates_removed']}")
    print(f" -> Final Rows:        {summary['final_rows']}")

    return df_final, summary

# --- EXECUTION ---

# Run the cleaning step once and bind both results at notebook (global) scope
df_clean, cleaning_summary = run_phase_b()

# 16. Defensive Programming: Verify Post-conditions
print("\n--- Running Final Sanity Checks ---")
# Both outputs must now be visible to later cells
assert 'df_clean' in globals(), "Global variable df_clean was not created."
assert 'cleaning_summary' in globals(), "Global variable cleaning_summary was not created."
# Cleaning adds the canonical column and can only ever remove rows
assert 'canonical_smiles' in df_clean.columns, "'canonical_smiles' column missing in df_clean."
assert df_clean.shape[0] <= df_raw.shape[0], "df_clean has more rows than df_raw (impossible)."
# The input row count must still match what was recorded when the run started
assert cleaning_summary['initial_rows'] == df_raw.shape[0], "df_raw was modified unexpectedly."

print("Success: df_clean created with shape {}.".format(df_clean.shape))
print("--- PHASE B COMPLETED SUCCESSFULLY ---")

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.3
--- Starting PHASE B: Molecular Cleaning, Canonicalization, and Deduplication ---
[Status] Initial rows from df_raw: 1484
[Processing] Converting SMILES to RDKit Mol objects...


[12:02:43] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:02:44] Can't kekulize mol.  Unkekulized atoms: 9
[12:02:44] Can't kekulize mol.  Unkekulized atoms: 4
[12:02:44] Can't kekulize mol.  Unkekulized atoms: 4


[Cleaning] Found 4 invalid SMILES. Removing...
[Processing] Generating canonical SMILES...
[Cleaning] Found 19 duplicate molecules. Removing...

[Summary] CLEANING REPORT:
 -> Initial Rows:      1484
 -> Invalid Removed:   4
 -> Duplicates Removed:19
 -> Final Rows:        1461

--- Running Final Sanity Checks ---
Success: df_clean created with shape (1461, 5).
--- PHASE B COMPLETED SUCCESSFULLY ---


In [None]:
# One value per line: raw input shape (unchanged), cleaned shape (smaller or
# equal row count), then the cleaning statistics dictionary.
print(df_raw.shape, df_clean.shape, cleaning_summary, sep="\n")


(1484, 4)
(1461, 5)
{'initial_rows': 1484, 'invalid_smiles_removed': 4, 'duplicates_removed': 19, 'final_rows': 1461}
