In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    Used in place of bare ``assert`` statements so the checks still run when
    Python is started with ``-O`` (which strips ``assert``), while keeping the
    same exception type as before.
    """
    if not condition:
        raise AssertionError(message)


def run_phase_e(radius=2, n_bits=2048, expected_rows=1461):
    """
    Executes Phase E: Molecular Featurization.

    Reads the global DataFrame ``df_clean``, parses every entry of its
    'canonical_smiles' column into an RDKit Mol, computes a Morgan bit-vector
    fingerprint per molecule and stacks them into a dense matrix. Progress and
    a summary report are printed to stdout.

    Args:
        radius: Morgan fingerprint radius (default 2).
        n_bits: Length of each fingerprint bit vector (default 2048).
        expected_rows: Exact number of rows the dataset must have
            (default 1461). Pass None to skip the exact-count check.

    Returns:
        Tuple ``(X, y, feature_metadata)``:
        X is an int8 array of shape (len(df_clean), n_bits),
        y is an int8 array of shape (len(df_clean),) taken from
        'toxicity_label', and feature_metadata is a dict with the keys
        'fingerprint_type', 'radius', 'n_bits' and 'n_samples'.

    Raises:
        AssertionError: if ``df_clean`` or a required column is missing, a
            SMILES string cannot be parsed, a label is not 0 or 1, or a
            shape check fails.
    """
    print("--- Starting PHASE E: Molecular Featurization ---")

    # 1. Defensive Programming: Verify Pre-conditions
    _require('df_clean' in globals(), "CRITICAL ERROR: df_clean not found. Run Phase A-D first.")
    _require('canonical_smiles' in df_clean.columns, "CRITICAL ERROR: 'canonical_smiles' missing.")
    _require('toxicity_label' in df_clean.columns, "CRITICAL ERROR: 'toxicity_label' missing.")

    # 2. Convert to Mol objects
    print("[Processing] Converting SMILES to Mol objects...")
    mols = [Chem.MolFromSmiles(s) for s in df_clean['canonical_smiles']]

    # MolFromSmiles returns None for unparsable input; report where it failed
    # so the offending rows can be found without re-running the conversion.
    bad_rows = [i for i, m in enumerate(mols) if m is None]
    _require(
        not bad_rows,
        "CRITICAL ERROR: Found invalid SMILES during featurization."
        f" ({len(bad_rows)} failed; first row positions: {bad_rows[:5]})",
    )

    # 3. Generate Morgan Fingerprints
    print(f"[Processing] Generating Morgan Fingerprints (Radius={radius}, nBits={n_bits})...")
    # NOTE(review): newer RDKit releases steer users towards
    # rdFingerprintGenerator.GetMorganGenerator; confirm against the installed
    # RDKit version before switching.
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits) for m in mols]

    # 4. Convert to NumPy array X (one row per molecule, one column per bit)
    num_samples = len(df_clean)
    X = np.zeros((num_samples, n_bits), dtype=np.int8)
    for i, fp in enumerate(fps):
        X[i] = np.array(fp)

    # 5. Extract labels y
    # Validate BEFORE casting: astype(np.int8) would silently wrap or truncate
    # bad values (e.g. 256 -> 0, 0.7 -> 0) into something that looks binary.
    label_column = df_clean['toxicity_label']
    invalid_mask = ~label_column.isin([0, 1])
    if invalid_mask.any():
        raise AssertionError(
            f"y contains invalid labels: {label_column[invalid_mask].unique()}"
        )
    y = label_column.values.astype(np.int8)

    # 6. SANITY CHECKS (MANDATORY)
    print("[Validation] Running Sanity Checks...")

    # Check 1: Shape Dimensions (exact row count only when one is expected)
    if expected_rows is not None:
        _require(
            X.shape == (expected_rows, n_bits),
            f"Shape Mismatch: Expected ({expected_rows}, {n_bits}), got {X.shape}",
        )
        _require(
            y.shape == (expected_rows,),
            f"Shape Mismatch: Expected ({expected_rows},), got {y.shape}",
        )

    # Check 2: Row Order Integrity
    _require(X.shape[0] == len(df_clean), "X row count does not match df_clean length")
    _require(y.shape[0] == len(df_clean), "y row count does not match df_clean length")

    # Check 3: Data Integrity
    # X is an integer array so this cannot trigger today; it is kept as a
    # cheap guard in case the dtype is ever changed to a float type.
    _require(not np.isnan(X).any(), "X contains NaN values")

    print(" -> All sanity checks passed.")

    # 7. Create Metadata
    feature_metadata = {
        'fingerprint_type': 'Morgan (ECFP)',
        'radius': radius,
        'n_bits': n_bits,
        'n_samples': num_samples
    }

    # 8. REPORTING
    print("\n" + "="*45)
    print("       FEATURE GENERATION REPORT")
    print("="*45)
    print(f"Molecules Featurized:      {feature_metadata['n_samples']}")
    print(f"Fingerprint Type:          {feature_metadata['fingerprint_type']}")
    print(f"Radius:                    {feature_metadata['radius']}")
    print(f"Number of Bits:            {feature_metadata['n_bits']}")
    print(f"Feature Matrix X Shape:    {X.shape}")
    print(f"Label Vector y Shape:      {y.shape}")
    print("="*45)

    return X, y, feature_metadata

# --- EXECUTION ---

# Run the featurization and bind its three results at notebook (global) scope
X, y, feature_metadata = run_phase_e()

# Final success banner, printed only if run_phase_e() returned without raising
print("\n--- PHASE E COMPLETED SUCCESSFULLY ---")

--- Starting PHASE E: Molecular Featurization ---
[Processing] Converting SMILES to Mol objects...
[Processing] Generating Morgan Fingerprints (Radius=2, nBits=2048)...




[Validation] Running Sanity Checks...
 -> All sanity checks passed.

       FEATURE GENERATION REPORT
Molecules Featurized:      1461
Fingerprint Type:          Morgan (ECFP)
Radius:                    2
Number of Bits:            2048
Feature Matrix X Shape:    (1461, 2048)
Label Vector y Shape:      (1461,)

--- PHASE E COMPLETED SUCCESSFULLY ---
