In [9]:
"""
OPTIMIZED BINDINGDB TRAINING NOTEBOOK (v5 - IPYNB / MODULAR PATHS)
Project: druglikeliness12
File: notebooks/train.ipynb

Features:
- 🚀 GPU Accelerated (Native Windows): Uses XGBoost (tree_method='gpu_hist')
- 🧠 Memory Optimized: Samples 1,500,000+ entries for 32GB+ RAM
- 🎯 Efficient TSV/CSV Loader: Fixes DtypeWarning by pre-selecting columns
- 📊 REAL PROGRESS BARS: Uses TQDM for data loading and feature extraction
- 🖥️ Tuned for high-end CPU/GPU (Ultra 9 + 5070 Ti)
- ⏭️ SKIP AHEAD: Saves/loads intermediate files (CSV, NPY) to resume training
- 📂 MODULAR PATHS: Reads/saves to project root structure (data/, models/)
"""

import pandas as pd
import numpy as np
import pickle
import os
import gc
import warnings
from collections import Counter
from tqdm import tqdm

# === ML IMPORTS ===
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import xgboost as xgb

# Initialize tqdm for pandas
tqdm.pandas() 
warnings.filterwarnings('ignore')

# ==================== PATH DEFINITIONS ====================
# This notebook is in /notebooks, so root is one level up ('../')
ROOT_DIR = '../'

# --- Data Dirs ---
DATA_DIR = os.path.join(ROOT_DIR, 'data')
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')

# --- Model Dir ---
MODELS_DIR = os.path.join(ROOT_DIR, 'models')

# --- Input File ---
BINDINGDB_FILE = os.path.join(RAW_DATA_DIR, 'BindingDB_All.tsv')

# --- Processed Data Files ---
CLEANED_FILE = os.path.join(PROCESSED_DATA_DIR, 'cleaned_bindingdb_data.csv')
FEATURES_X_FILE = os.path.join(PROCESSED_DATA_DIR, 'X_features.npy')
FEATURES_Y_CLASS_FILE = os.path.join(PROCESSED_DATA_DIR, 'y_class.npy')
FEATURES_Y_REG_FILE = os.path.join(PROCESSED_DATA_DIR, 'y_reg.npy')

# --- Model Files ---
MODEL_CLASSIFIER = os.path.join(MODELS_DIR, 'classifier.pkl')
MODEL_REGRESSOR = os.path.join(MODELS_DIR, 'regressor.pkl')
MODEL_SCALER = os.path.join(MODELS_DIR, 'scaler.pkl')
MODEL_EXTRACTOR = os.path.join(MODELS_DIR, 'feature_extractor.pkl')

# --- Create Dirs if they don't exist ---
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

print("="*80)
print(" 🚀 DRUG-TARGET INTERACTION ML TRAINING (XGBoost GPU - v5 IPYNB) 🚀")
print(f" Mode: Native Windows (XGBoost) with Skip-Logic")
print(f" Reading from: {RAW_DATA_DIR}")
print(f" Saving to: {PROCESSED_DATA_DIR} and {MODELS_DIR}")
print("="*80)

 🚀 DRUG-TARGET INTERACTION ML TRAINING (XGBoost GPU - v5 IPYNB) 🚀
 Mode: Native Windows (XGBoost) with Skip-Logic
 Reading from: ../data\raw
 Saving to: ../data\processed and ../models


In [10]:
# ==================== UTILITY FUNCTION ====================

def find_column(columns, possible_names):
    """Find column by trying multiple possible names"""
    for name in possible_names:
        for col in columns:
            if name.lower() in col.lower():
                return col
    return None

# ==================== STEP 1: LOAD BINDINGDB DATA ====================
def load_bindingdb_data(filepath=BINDINGDB_FILE, sample_size=1500000):
    """Load and intelligently sample BindingDB TSV/CSV data"""
    print("\n" + "="*80)
    print("STEP 1: LOADING BINDINGDB DATA")
    print("="*80)
    
    if not os.path.exists(filepath):
        print(f"\n❌ ERROR: File not found: {filepath}")
        print(f"    Please ensure 'BindingDB_All.tsv' is in {RAW_DATA_DIR}")
        return None
        
    file_size_gb = os.path.getsize(filepath) / (1024**3)
    print(f"\n📖 Reading file: {os.path.basename(filepath)}")
    print(f"    File size: {file_size_gb:.2f} GB")
    print(f"    Target sample size: {sample_size:,} entries")
    
    print("\n🧐 Analyzing file header to find required columns...")
    
    sep = '\t' if filepath.endswith('.tsv') else ','
    delimiter_name = 'TAB (\\t)' if sep == '\t' else 'COMMA (,)'
    print(f"    Detected delimiter: {delimiter_name}")
    
    try:
        header = pd.read_csv(filepath, sep=sep, nrows=0).columns
    except Exception as e:
        print(f"\n❌ CRITICAL ERROR: Could not read file header: {e}")
        return None
        
    print(f"    File has {len(header)} total columns.")

    smiles_col = find_column(header, ['ligand smiles', 'smiles', 'ligand_smiles'])
    sequence_col = find_column(header, ['target sequence', 'bindingdb target chain sequence', 'sequence', 'protein sequence'])
    ic50_col = find_column(header, ['ic50 (nm)', 'ic50', 'ic50_nm'])
    ki_col = find_column(header, ['ki (nm)', 'ki', 'ki_nm'])
    kd_col = find_column(header, ['kd (nm)', 'kd', 'kd_nm'])
    
    cols_to_load = [smiles_col, sequence_col, ic50_col, ki_col, kd_col]
    cols_to_load = [col for col in cols_to_load if col is not None]
    
    if not smiles_col or not sequence_col:
        print("\n❌ ERROR: Could not find required SMILES or Sequence column!")
        return None
        
    if not any([ic50_col, ki_col, kd_col]):
        print("\n❌ ERROR: Could not find any binding affinity column (IC50, Ki, Kd)!")
        return None

    print("\n✓ Found required columns:")
    print(f"    SMILES: {smiles_col}")
    print(f"    Sequence: {sequence_col}")
    if ic50_col: print(f"    IC50: {ic50_col}")
    if ki_col: print(f"    Ki: {ki_col}")
    if kd_col: print(f"    Kd: {kd_col}")
    
    print(f"\n📦 Reading {file_size_gb:.2f}GB file in chunks...")
    print(f"    This may take 5-10 minutes. Please wait...")
    
    chunk_size = 1_000_000
    chunks = []
    total_rows = 0
    
    try:
        try:
            print("    (Performing quick row count for progress bar...)")
            total_rows_estimate = sum(1 for row in open(filepath, 'r', encoding='utf-8'))
            n_chunks = int(np.ceil(total_rows_estimate / chunk_size))
            print(f"    (Estimated {total_rows_estimate:,} rows in {n_chunks} chunks)")
        except:
            n_chunks = None
            print("    (Could not get row count, progress bar will be un-timed)")

        chunk_iter = pd.read_csv(
            filepath, 
            sep=sep, 
            chunksize=chunk_size, 
            usecols=cols_to_load,
            low_memory=False
        )

        for i, chunk in enumerate(tqdm(chunk_iter, total=n_chunks, desc="Reading 6GB File")):
            total_rows += len(chunk)
            
            if ic50_col:
                binding_col = ic50_col
            elif ki_col:
                binding_col = ki_col
            else:
                binding_col = kd_col
            
            chunk = chunk.rename(columns={
                smiles_col: 'drug_smiles',
                sequence_col: 'protein_sequence',
                binding_col: 'binding_value'
            })
            
            chunk = chunk.dropna(subset=['drug_smiles', 'protein_sequence', 'binding_value'])
            chunk = chunk[chunk['drug_smiles'].astype(str).str.len() > 5]
            
            seq_lens = chunk['protein_sequence'].astype(str).str.len()
            chunk = chunk[(seq_lens >= 50) & (seq_lens <= 1500)]
            
            chunk['binding_value'] = pd.to_numeric(chunk['binding_value'], errors='coerce')
            chunk = chunk.dropna(subset=['binding_value'])
            
            chunk = chunk[['drug_smiles', 'protein_sequence', 'binding_value']]
            
            if len(chunk) > 0:
                chunks.append(chunk)

        print(f"\n\n    ✓ Finished reading. Total rows processed: {total_rows:,}")
        
        if not chunks:
            print("❌ ERROR: No valid data was found after filtering all chunks.")
            return None
            
        print("    Concatenating filtered chunks...")
        df = pd.concat(chunks, ignore_index=True)
        del chunks
        gc.collect()
        
        print(f"    ✓ Found {len(df):,} valid entries total.")
        
        if len(df) > sample_size:
            print(f"\n📉 Sampling {sample_size:,} from {len(df):,} valid entries...")
            df = df.sample(sample_size, random_state=42)
        else:
            print(f"\n✓ Using all {len(df):,} valid entries (less than target)")
            
        print(f"\n✅ DATA LOADED SUCCESSFULLY:")
        print(f"    Total entries: {len(df):,}")
        print(f"    Memory usage: ~{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
        
        return df

    except Exception as e:
        print(f"\n❌ ERROR loading file: {e}")
        import traceback
        traceback.print_exc()
        return None

# ==================== STEP 2: CLEAN DATA ====================
def clean_bindingdb_data(df):
    """Clean and process the already-filtered BindingDB data"""
    print("\n" + "="*80)
    print("STEP 2: CLEANING AND PROCESSING DATA")
    print("="*80)
    
    print(f"\n🧹 Processing {len(df):,} pre-filtered entries")
    
    print(f"\n📊 Processing binding affinities...")
    
    df['binding_affinity'] = df['binding_value'] / 1000.0
    
    initial = len(df)
    df = df[(df['binding_affinity'] > 0.001) & (df['binding_affinity'] < 100000)]
    print(f"    ✓ Valid binding values: {len(df):,} entries ({len(df)/initial*100:.1f}% kept)")
    
    df['binds'] = (df['binding_affinity'] < 10).astype(int)
    
    print(f"\n🧬 Validating protein sequences...")
    valid_aa = set('ACDEFGHIKLMNPQRSTVWYXU-')
    
    def is_valid_sequence(seq):
        if not isinstance(seq, str) or len(seq) == 0:
            return False
        seq = seq.strip().upper()
        valid_count = sum(1 for aa in seq if aa in valid_aa)
        return valid_count / len(seq) >= 0.95
    
    print("    (This may take a minute...)")
    valid_mask = df['protein_sequence'].progress_apply(is_valid_sequence)
    df = df[valid_mask]
    print(f"    ✓ Valid sequences: {len(df):,} entries")
    
    df['protein_sequence'] = df['protein_sequence'].str.strip().str.upper()
    df['drug_smiles'] = df['drug_smiles'].str.strip()
    
    initial = len(df)
    df = df.drop_duplicates(subset=['protein_sequence', 'drug_smiles'])
    if len(df) < initial:
        print(f"    ✓ Removed {initial - len(df):,} duplicates")
    
    df = df[['protein_sequence', 'drug_smiles', 'binding_affinity', 'binds']].reset_index(drop=True)
    
    print(f"\n✅ FINAL CLEANED DATASET:")
    print(f"    Total samples: {len(df):,}")
    print(f"    Binders (IC50 < 10 μM): {df['binds'].sum():,} ({df['binds'].sum()/len(df)*100:.1f}%)")
    print(f"    Non-binders: {(1-df['binds']).sum():,} ({(1-df['binds']).sum()/len(df)*100:.1f}%)")
    
    # ### --- PATH CHANGED --- ###
    df.to_csv(CLEANED_FILE, index=False)
    print(f"\n💾 Saved cleaned data to: {CLEANED_FILE}")
    # ### --- END CHANGED --- ###
    
    gc.collect()
    return df

In [11]:
# ==================== STEP 3: FEATURE EXTRACTION ====================

class FeatureExtractor:
    """Extract numerical features from proteins and drugs"""
    # This class is self-contained in the notebook
    
    def __init__(self):
        self.aa_weights = {
            'A': 89, 'C': 121, 'D': 133, 'E': 147, 'F': 165,
            'G': 75, 'H': 155, 'I': 131, 'K': 146, 'L': 131,
            'M': 149, 'N': 132, 'P': 115, 'Q': 146, 'R': 174,
            'S': 105, 'T': 119, 'V': 117, 'W': 204, 'Y': 181
        }
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    def protein_to_features(self, sequence):
        """Convert protein sequence to 33 numerical features"""
        sequence = ''.join([aa for aa in sequence if aa in self.amino_acids])
        
        if len(sequence) == 0:
            return None
            
        total = len(sequence)
        features = []
        
        aa_counts = Counter(sequence)
        for aa in self.amino_acids:
            features.append(aa_counts[aa] / total)
            
        features.append(total) # Length
        features.append(sum([self.aa_weights.get(aa, 110) for aa in sequence])) # MW
        features.append((aa_counts['F'] + aa_counts['W'] + aa_counts['Y']) / total) # Aromaticity
        
        if len(sequence) > 1:
            dipeptides = [sequence[i:i+2] for i in range(len(sequence)-1)]
            dipeptide_counts = Counter(dipeptides)
            most_common = dipeptide_counts.most_common(10)
            for i in range(10):
                if i < len(most_common):
                    features.append(most_common[i][1] / (total - 1))
                else:
                    features.append(0)
        else:
            features.extend([0] * 10)
            
        return features
    
    def smiles_to_features(self, smiles):
        """Convert SMILES to 17 numerical features"""
        features = []
        features.append(len(smiles)) # Length
        features.append(smiles.count('C')) # Carbons
        features.append(smiles.count('O')) # Oxygens
        features.append(smiles.count('N')) # Nitrogens
        features.append(smiles.count('S')) # Sulfurs
        features.append(smiles.count('P')) # Phosphorus
        features.append(smiles.count('=')) # Double bonds
        features.append(smiles.count('#')) # Triple bonds
        features.append(smiles.count('(')) # Branches
        features.append(smiles.count('[')) # Atoms in brackets
        features.append(smiles.count('@')) # Chirality
        
        for i in range(1, 7):
            features.append(smiles.count(str(i)))
            
        return features
    
    def combine_features(self, protein_features, drug_features):
        """Combine protein and drug features"""
        return protein_features + drug_features

def prepare_ml_features(df):
    """Convert dataframe to ML-ready feature matrix"""
    print("\n" + "="*80)
    print("STEP 3: EXTRACTING ML FEATURES")
    print("="*80)
    
    extractor = FeatureExtractor()
    
    total = len(df)
    print(f"\n🔬 Processing {total:,} samples...")
    
    print("    Extracting protein features (with progress bar)...")
    prot_features = df['protein_sequence'].progress_apply(extractor.protein_to_features)
    
    print("    Extracting drug features (with progress bar)...")
    drug_features = df['drug_smiles'].progress_apply(extractor.smiles_to_features)
    
    print("    Combining features...")
    df_features = pd.DataFrame({
        'prot': prot_features,
        'drug': drug_features,
        'y_class': df['binds'],
        'y_reg': df['binding_affinity']
    })
    
    df_features = df_features.dropna().reset_index(drop=True)
    
    failed = total - len(df_features)
    
    X_list = df_features.progress_apply(lambda row: extractor.combine_features(row['prot'], row['drug']), axis=1)
    X = np.array(X_list.tolist())
    y_class = df_features['y_class'].values
    y_reg = df_features['y_reg'].values
    
    print(f"\n    ✓ Successfully processed: {len(X):,} samples")
    if failed > 0:
        print(f"    ⚠ Failed to process: {failed} samples ({failed/total*100:.1f}%)")
    
    print(f"\n✅ FEATURE EXTRACTION COMPLETE:")
    print(f"    Feature matrix shape: {X.shape}")
    print(f"    Memory: ~{X.nbytes / 1024**2:.1f} MB")
    
    gc.collect()
    
    # ### --- PATH CHANGED --- ###
    # Save the outputs of this step for faster re-runs
    print("\n💾 Saving feature-extracted data for faster re-runs...")
    try:
        np.save(FEATURES_X_FILE, X)
        print(f"    ✓ Saved {os.path.basename(FEATURES_X_FILE)}")
        np.save(FEATURES_Y_CLASS_FILE, y_class)
        print(f"    ✓ Saved {os.path.basename(FEATURES_Y_CLASS_FILE)}")
        np.save(FEATURES_Y_REG_FILE, y_reg)
        print(f"    ✓ Saved {os.path.basename(FEATURES_Y_REG_FILE)}")
        
        with open(MODEL_EXTRACTOR, 'wb') as f:
            pickle.dump(extractor, f)
        print(f"    ✓ Saved {os.path.basename(MODEL_EXTRACTOR)}")
    except Exception as e:
        print(f"    ⚠ Warning: Could not save feature files: {e}")
    # ### --- END CHANGED --- ###
    
    return X, y_class, y_reg, extractor

In [12]:
# ==================== STEP 4: TRAIN ML MODELS (XGBOOST GPU) ====================

def train_models(X, y_class, y_reg, extractor):
    """Train XGBoost models on the GPU"""
    print("\n" + "="*80)
    print("STEP 4: TRAINING MACHINE LEARNING MODELS")
    print("        🚀 STATUS: XGBoost (Native Windows GPU) 🚀")
    print("="*80)
    
    print("\n📊 Scaling features...")
    scaler = StandardScaler() 
    X_scaled = scaler.fit_transform(X)
    print("    ✓ Features scaled (mean=0, std=1)")
    
    # ===== TRAIN CLASSIFIER =====
    print("\n" + "-"*80)
    print("TRAINING XGBoost CLASSIFIER (Binds: Yes/No)")
    print("-"*80)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_class, test_size=0.2, random_state=42, stratify=y_class
    )
    
    print(f"\n    Training set: {len(X_train):,} samples")
    print(f"    Test set: {len(X_test):,} samples")
    
    classifier = xgb.XGBClassifier(
        tree_method='hist', device='cuda',
        n_estimators=1000,
        max_depth=12,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50
    )
    
    print("\n    🌲 Training XGBoost Classifier...")
    print("      (This will take a few minutes)")
    
    classifier.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=50
    )
    print("    ✓ Training complete!")
    
    train_acc = accuracy_score(y_train, classifier.predict(X_train))
    test_acc = accuracy_score(y_test, classifier.predict(X_test))
    y_pred = classifier.predict(X_test)
    
    print(f"\n    📈 CLASSIFIER RESULTS:")
    print(f"       Training Accuracy: {train_acc:.3f} ({train_acc*100:.1f}%)")
    print(f"       Testing Accuracy: {test_acc:.3f} ({test_acc*100:.1f}%)")
    print(f"\n    Classification Report:")
    print(classification_report(y_test, y_pred,
                                target_names=['No Binding', 'Binds'],
                                digits=3))
    
    # ===== TRAIN REGRESSOR =====
    print("\n" + "-"*80)
    print("TRAINING XGBoost REGRESSOR (Binding Affinity Prediction)")
    print("-"*80)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_reg, test_size=0.2, random_state=42
    )
    
    print(f"\n    Training set: {len(X_train):,} samples")
    print(f"    Test set: {len(X_test):,} samples")
    
    regressor = xgb.XGBRegressor(
        tree_method='hist', device='cuda',
        n_estimators=1000,
        max_depth=12,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50
    )
    
    print("\n    🌲 Training XGBoost Regressor...")
    print("      (This will take a few minutes)")
    
    regressor.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=50
    )
    print("    ✓ Training complete!")
    
    train_r2 = r2_score(y_train, regressor.predict(X_train))
    test_r2 = r2_score(y_test, regressor.predict(X_test))
    y_pred = regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = np.mean(np.abs(y_test - y_pred))
    
    print(f"\n    📈 REGRESSOR RESULTS:")
    print(f"       Training R²: {train_r2:.3f}")
    print(f"       Testing R²: {test_r2:.3f}")
    print(f"       RMSE: {rmse:.3f} μM")
    print(f"       MAE: {mae:.3f} μM")
    
    # ### --- PATH CHANGED --- ###
    # Save models to the /models directory
    print("\n" + "-"*80)
    print(f"💾 SAVING MODELS to {MODELS_DIR}")
    print("-"*80)
    
    with open(MODEL_CLASSIFIER, 'wb') as f:
        pickle.dump(classifier, f)
    print(f"    ✓ {os.path.basename(MODEL_CLASSIFIER)}")
    
    with open(MODEL_REGRESSOR, 'wb') as f:
        pickle.dump(regressor, f)
    print(f"    ✓ {os.path.basename(MODEL_REGRESSOR)}")
    
    with open(MODEL_SCALER, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"    ✓ {os.path.basename(MODEL_SCALER)}")
    # ### --- END CHANGED --- ###
        
    return classifier, regressor, scaler

In [None]:
# ==================== MAIN EXECUTION ====================

def main():
    """Main training pipeline"""
    
    # All path variables are defined in Cell 1
    
    SAMPLE_SIZE = 1_500_000 
    
    X, y_class, y_reg, extractor = None, None, None, None
    df = None

    # === CHECK 1: SKIP TO STEP 4 (Training) ===
    feature_files = [FEATURES_X_FILE, FEATURES_Y_CLASS_FILE, FEATURES_Y_REG_FILE, MODEL_EXTRACTOR]
    
    if all(os.path.exists(f) for f in feature_files):
        print("\n" + "="*80)
        print("♻️ SKIPPING TO STEP 4: Found pre-computed feature files!")
        print(f"    Loading from {PROCESSED_DATA_DIR} and {MODELS_DIR}...")
        try:
            X = np.load(FEATURES_X_FILE)
            y_class = np.load(FEATURES_Y_CLASS_FILE)
            y_reg = np.load(FEATURES_Y_REG_FILE)
            with open(MODEL_EXTRACTOR, 'rb') as f:
                extractor = pickle.load(f)
            print(f"    ✓ Loaded features with shape: {X.shape}")
            
            if extractor is None:
                print(f"    ...Missing {os.path.basename(MODEL_EXTRACTOR)}. Re-computing...")
                X = None # Force re-computation
            
        except Exception as e:
            print(f"    ❌ Error loading feature files: {e}. Re-computing...")
            X = None # Force re-computation

    # === CHECK 2: SKIP TO STEP 3 (Feature Extraction) ===
    if X is None and os.path.exists(CLEANED_FILE):
        print("\n" + "="*80)
        print(f"♻️ SKIPPING TO STEP 3: Found {os.path.basename(CLEANED_FILE)}!")
        print(f"    Loading {CLEANED_FILE}...")
        try:
            df = pd.read_csv(CLEANED_FILE)
            if len(df) < 1000:
                print("    ...File is too small. Re-running from start.")
                df = None # Force re-run
            else:
                print(f"    ✓ Loaded {len(df):,} cleaned samples.")
        except Exception as e:
            print(f"    ❌ Error loading {os.path.basename(CLEANED_FILE)}: {e}. Re-running from start...")
            df = None # Force re-run

    # === CHECK 3: Run from STEP 1 (Full Load) ===
    if df is None and X is None:
        print("\n" + "="*80)
        print("▶️ STARTING FROM STEP 1: No valid cached data found.")
        print("="*80)
        
        df = load_bindingdb_data(filepath=BINDINGDB_FILE, sample_size=SAMPLE_SIZE)
        
        if df is None or len(df) == 0:
            print("\n❌ Failed to load valid data. Exiting...")
            return
            
        df = clean_bindingdb_data(df)
        if df is None or len(df) < 1000:
            print(f"\n❌ Not enough valid data ({len(df) if df is not None else 0} samples).")
            return
    
    # --- RUN STEP 3 (if needed) ---
    if X is None:
        X, y_class, y_reg, extractor = prepare_ml_features(df)
    
    # --- RUN STEP 4 (Training) ---
    if X is None:
        print("\n❌ Critical error: Feature matrix (X) is still None. Exiting.")
        return
        
    print("\nStarting Model Training...")
    classifier, regressor, scaler = train_models(X, y_class, y_reg, extractor)
    
    print("\n" + "="*80)
    print(" ✅ TRAINING COMPLETE! (XGBoost GPU - v5 IPYNB)")
    print("="*80)
    print(f"\n📦 Created files in {MODELS_DIR}:")
    print(f"    ✓ {os.path.basename(MODEL_CLASSIFIER)}")
    print(f"    ✓ {os.path.basename(MODEL_REGRESSOR)}")
    print(f"    ✓ {os.path.basename(MODEL_SCALER)}")
    print(f"    ✓ {os.path.basename(MODEL_EXTRACTOR)}")
    print(f"\n📦 Created files in {PROCESSED_DATA_DIR}:")
    print(f"    ✓ {os.path.basename(CLEANED_FILE)}")
    print(f"    ✓ {os.path.basename(FEATURES_X_FILE)}, {os.path.basename(FEATURES_Y_CLASS_FILE)}, {os.path.basename(FEATURES_Y_REG_FILE)}")
    
    print("\n🚀 Next steps:")
    print(f"    1. Your models are ready to use!")
    print(f"    2. Your streamlit app (src/app.py) can now load the models from {MODELS_DIR}.")
    
    print("\n" + "="*80 + "\n")

# --- Run the pipeline ---
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n⚠ Training interrupted by user")
    except Exception as e:
        print(f"\n\n❌ UNEXPECTED ERROR:")
        print(f"    {e}")
        print("\nFull traceback:")
        import traceback
        traceback.print_exc()

# This final `main()` call is what triggers the whole pipeline
main()


♻️ SKIPPING TO STEP 3: Found cleaned_bindingdb_data.csv!
    Loading ../data\processed\cleaned_bindingdb_data.csv...
    ✓ Loaded 1,122,930 cleaned samples.

STEP 3: EXTRACTING ML FEATURES

🔬 Processing 1,122,930 samples...
    Extracting protein features (with progress bar)...


100%|██████████| 1122930/1122930 [05:01<00:00, 3727.89it/s]


    Extracting drug features (with progress bar)...


100%|██████████| 1122930/1122930 [00:07<00:00, 141517.92it/s]


    Combining features...


100%|██████████| 1122930/1122930 [00:11<00:00, 96975.39it/s] 



    ✓ Successfully processed: 1,122,930 samples

✅ FEATURE EXTRACTION COMPLETE:
    Feature matrix shape: (1122930, 50)
    Memory: ~428.4 MB

💾 Saving feature-extracted data for faster re-runs...
    ✓ Saved X_features.npy
    ✓ Saved y_class.npy
    ✓ Saved y_reg.npy
    ✓ Saved feature_extractor.pkl

Starting Model Training...

STEP 4: TRAINING MACHINE LEARNING MODELS
        🚀 STATUS: XGBoost (Native Windows GPU) 🚀

📊 Scaling features...
    ✓ Features scaled (mean=0, std=1)

--------------------------------------------------------------------------------
TRAINING XGBoost CLASSIFIER (Binds: Yes/No)
--------------------------------------------------------------------------------

    Training set: 898,344 samples
    Test set: 224,586 samples

    🌲 Training XGBoost Classifier...
      (This will take a few minutes)
[0]	validation_0-logloss:0.41185
[50]	validation_0-logloss:0.27284
[100]	validation_0-logloss:0.24958
[150]	validation_0-logloss:0.24142
[200]	validation_0-logloss:0.23

: 