In [1]:
import xgboost as xgb

print("XGBoost version:", xgb.__version__)

# Check GPU support via build info
build_info = xgb.build_info()
print("Build Info:")
print(build_info)

# --- CORRECTED LOGIC ---
# We check the *value* of the 'USE_CUDA' key, not for a separate key named "ON".
if build_info.get('USE_CUDA', False):  # .get() is a safe way to check
    print("‚úÖ GPU (CUDA) support is available.")
else:
    print("‚ùå GPU (CUDA) support not detected.")

XGBoost version: 2.1.4
Build Info:
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'c:\\Users\\Krishna\\miniconda3\\envs\\dti_gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}
‚úÖ GPU (CUDA) support is available.


In [3]:
"""
OPTIMIZED BINDINGDB TRAINING SCRIPT (v4 - WORKSTATION TUNED + SKIP LOGIC)
For file: BindingDB_All.tsv (6.23 GB)

Features:
- üöÄ GPU Accelerated (Native Windows): Uses XGBoost (tree_method='gpu_hist')
- üß† Memory Optimized: Samples 1,500,000+ entries for 32GB+ RAM
- üéØ Efficient TSV/CSV Loader: Fixes DtypeWarning by pre-selecting columns
- üìä REAL PROGRESS BARS: Uses TQDM for data loading and feature extraction
- üñ•Ô∏è Tuned for high-end CPU/GPU (Ultra 9 + 5070 Ti)
- ‚è≠Ô∏è SKIP AHEAD: Saves/loads intermediate files (CSV, NPY) to resume training

Run: python train_bindingdb_gpu_v4.py
"""

import pandas as pd
import numpy as np
import pickle
import os
import gc
import warnings

# === NATIVE WINDOWS IMPORTS ===
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import xgboost as xgb # Import XGBoost

from collections import Counter
from tqdm import tqdm # ### --- CHANGED --- ### Import tqdm for progress bars

# ### --- CHANGED --- ### Initialize tqdm for pandas
tqdm.pandas() 

warnings.filterwarnings('ignore')

print("="*80)
print(" üöÄ DRUG-TARGET INTERACTION ML TRAINING (XGBoost GPU - v4 WORKSTATION) üöÄ")
print(f" Using BindingDB_All.tsv Dataset")
print(f" Hardware: 32GB RAM, Intel Ultra 9, NVIDIA 5070 Ti")
print(f" Mode: Native Windows (XGBoost) with Skip-Logic")
print("="*80)

# ==================== UTILITY FUNCTION ====================

def find_column(columns, possible_names):
    """Find column by trying multiple possible names"""
    for name in possible_names:
        for col in columns:
            if name.lower() in col.lower():
                return col
    return None

# ==================== STEP 1: LOAD BINDINGDB DATA ====================
def load_bindingdb_data(filepath='BindingDB_All.tsv', sample_size=1500000):
    """Load and intelligently sample BindingDB TSV/CSV data"""
    print("\n" + "="*80)
    print("STEP 1: LOADING BINDINGDB DATA")
    print("="*80)
    
    if not os.path.exists(filepath):
        print(f"\n‚ùå ERROR: File not found: {filepath}")
        return None
        
    file_size_gb = os.path.getsize(filepath) / (1024**3)
    print(f"\nüìñ Reading file: {filepath}")
    print(f"    File size: {file_size_gb:.2f} GB")
    print(f"    Target sample size: {sample_size:,} entries")
    
    print("\nüßê Analyzing file header to find required columns...")
    
    sep = '\t' if filepath.endswith('.tsv') else ','
    delimiter_name = 'TAB (\\t)' if sep == '\t' else 'COMMA (,)'
    print(f"    Detected delimiter: {delimiter_name}")
    
    try:
        header = pd.read_csv(filepath, sep=sep, nrows=0).columns
    except Exception as e:
        print(f"\n‚ùå CRITICAL ERROR: Could not read file header: {e}")
        return None
        
    print(f"    File has {len(header)} total columns.")

    smiles_col = find_column(header, ['ligand smiles', 'smiles', 'ligand_smiles'])
    sequence_col = find_column(header, ['target sequence', 'bindingdb target chain sequence', 'sequence', 'protein sequence'])
    ic50_col = find_column(header, ['ic50 (nm)', 'ic50', 'ic50_nm'])
    ki_col = find_column(header, ['ki (nm)', 'ki', 'ki_nm'])
    kd_col = find_column(header, ['kd (nm)', 'kd', 'kd_nm'])
    
    cols_to_load = [smiles_col, sequence_col, ic50_col, ki_col, kd_col]
    cols_to_load = [col for col in cols_to_load if col is not None]
    
    if not smiles_col or not sequence_col:
        print("\n‚ùå ERROR: Could not find required SMILES or Sequence column!")
        return None
        
    if not any([ic50_col, ki_col, kd_col]):
        print("\n‚ùå ERROR: Could not find any binding affinity column (IC50, Ki, Kd)!")
        return None

    print("\n‚úì Found required columns:")
    print(f"    SMILES: {smiles_col}")
    print(f"    Sequence: {sequence_col}")
    if ic50_col: print(f"    IC50: {ic50_col}")
    if ki_col: print(f"    Ki: {ki_col}")
    if kd_col: print(f"    Kd: {kd_col}")
    
    print(f"\nüì¶ Reading {file_size_gb:.2f}GB file in chunks...")
    print(f"    This may take 5-10 minutes. Please wait...")
    
    chunk_size = 1_000_000
    chunks = []
    total_rows = 0
    
    try:
        # ### --- CHANGED --- ### Added progress bar for file reading
        # We need to estimate total chunks. Approx total_rows / chunk_size
        # This is a rough estimate, but good enough for a progress bar
        try:
            # Quick row count (can be slow, but useful)
            print("    (Performing quick row count for progress bar...)")
            total_rows_estimate = sum(1 for row in open(filepath, 'r', encoding='utf-8'))
            n_chunks = int(np.ceil(total_rows_estimate / chunk_size))
            print(f"    (Estimated {total_rows_estimate:,} rows in {n_chunks} chunks)")
        except:
            n_chunks = None # Fallback if count fails
            print("    (Could not get row count, progress bar will be un-timed)")

        chunk_iter = pd.read_csv(
            filepath, 
            sep=sep, 
            chunksize=chunk_size, 
            usecols=cols_to_load,
            low_memory=False
        )

        for i, chunk in enumerate(tqdm(chunk_iter, total=n_chunks, desc="Reading 6GB File")):
            total_rows += len(chunk)
            
            if ic50_col:
                binding_col = ic50_col
            elif ki_col:
                binding_col = ki_col
            else:
                binding_col = kd_col
            
            chunk = chunk.rename(columns={
                smiles_col: 'drug_smiles',
                sequence_col: 'protein_sequence',
                binding_col: 'binding_value'
            })
            
            chunk = chunk.dropna(subset=['drug_smiles', 'protein_sequence', 'binding_value'])
            chunk = chunk[chunk['drug_smiles'].astype(str).str.len() > 5]
            
            seq_lens = chunk['protein_sequence'].astype(str).str.len()
            chunk = chunk[(seq_lens >= 50) & (seq_lens <= 1500)]
            
            chunk['binding_value'] = pd.to_numeric(chunk['binding_value'], errors='coerce')
            chunk = chunk.dropna(subset=['binding_value'])
            
            chunk = chunk[['drug_smiles', 'protein_sequence', 'binding_value']]
            
            if len(chunk) > 0:
                chunks.append(chunk)

        print(f"\n\n    ‚úì Finished reading. Total rows processed: {total_rows:,}")
        
        if not chunks:
            print("‚ùå ERROR: No valid data was found after filtering all chunks.")
            return None
            
        print("    Concatenating filtered chunks...")
        df = pd.concat(chunks, ignore_index=True)
        del chunks
        gc.collect()
        
        print(f"    ‚úì Found {len(df):,} valid entries total.")
        
        if len(df) > sample_size:
            print(f"\nüìâ Sampling {sample_size:,} from {len(df):,} valid entries...")
            df = df.sample(sample_size, random_state=42)
        else:
            print(f"\n‚úì Using all {len(df):,} valid entries (less than target)")
            
        print(f"\n‚úÖ DATA LOADED SUCCESSFULLY:")
        print(f"    Total entries: {len(df):,}")
        print(f"    Memory usage: ~{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
        
        return df

    except Exception as e:
        print(f"\n‚ùå ERROR loading file: {e}")
        import traceback
        traceback.print_exc()
        return None

# ==================== STEP 2: CLEAN DATA ====================
# This function is unchanged
def clean_bindingdb_data(df):
    """Clean and process the already-filtered BindingDB data"""
    print("\n" + "="*80)
    print("STEP 2: CLEANING AND PROCESSING DATA")
    print("="*80)
    
    print(f"\nüßπ Processing {len(df):,} pre-filtered entries")
    
    print(f"\nüìä Processing binding affinities...")
    
    df['binding_affinity'] = df['binding_value'] / 1000.0
    
    initial = len(df)
    df = df[(df['binding_affinity'] > 0.001) & (df['binding_affinity'] < 100000)]
    print(f"    ‚úì Valid binding values: {len(df):,} entries ({len(df)/initial*100:.1f}% kept)")
    
    df['binds'] = (df['binding_affinity'] < 10).astype(int)
    
    print(f"\nüß¨ Validating protein sequences...")
    valid_aa = set('ACDEFGHIKLMNPQRSTVWYXU-')
    
    def is_valid_sequence(seq):
        if not isinstance(seq, str) or len(seq) == 0:
            return False
        seq = seq.strip().upper()
        valid_count = sum(1 for aa in seq if aa in valid_aa)
        return valid_count / len(seq) >= 0.95
    
    # ### --- CHANGED --- ### Added progress_apply for sequence validation
    print("    (This may take a minute...)")
    valid_mask = df['protein_sequence'].progress_apply(is_valid_sequence)
    df = df[valid_mask]
    print(f"    ‚úì Valid sequences: {len(df):,} entries")
    
    df['protein_sequence'] = df['protein_sequence'].str.strip().str.upper()
    df['drug_smiles'] = df['drug_smiles'].str.strip()
    
    initial = len(df)
    df = df.drop_duplicates(subset=['protein_sequence', 'drug_smiles'])
    if len(df) < initial:
        print(f"    ‚úì Removed {initial - len(df):,} duplicates")
    
    df = df[['protein_sequence', 'drug_smiles', 'binding_affinity', 'binds']].reset_index(drop=True)
    
    print(f"\n‚úÖ FINAL CLEANED DATASET:")
    print(f"    Total samples: {len(df):,}")
    print(f"    Binders (IC50 < 10 ŒºM): {df['binds'].sum():,} ({df['binds'].sum()/len(df)*100:.1f}%)")
    print(f"    Non-binders: {(1-df['binds']).sum():,} ({(1-df['binds']).sum()/len(df)*100:.1f}%)")
    
    output_file = 'cleaned_bindingdb_data.csv'
    df.to_csv(output_file, index=False)
    print(f"\nüíæ Saved cleaned data to: {output_file}")
    
    gc.collect()
    return df

# ==================== STEP 3: FEATURE EXTRACTION ====================
# This class is unchanged
class FeatureExtractor:
    """Extract numerical features from proteins and drugs"""
    
    def __init__(self):
        self.aa_weights = {
            'A': 89, 'C': 121, 'D': 133, 'E': 147, 'F': 165,
            'G': 75, 'H': 155, 'I': 131, 'K': 146, 'L': 131,
            'M': 149, 'N': 132, 'P': 115, 'Q': 146, 'R': 174,
            'S': 105, 'T': 119, 'V': 117, 'W': 204, 'Y': 181
        }
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    def protein_to_features(self, sequence):
        """Convert protein sequence to 33 numerical features"""
        sequence = ''.join([aa for aa in sequence if aa in self.amino_acids])
        
        if len(sequence) == 0:
            return None
            
        total = len(sequence)
        features = []
        
        aa_counts = Counter(sequence)
        for aa in self.amino_acids:
            features.append(aa_counts[aa] / total)
            
        features.append(total) # Length
        features.append(sum([self.aa_weights.get(aa, 110) for aa in sequence])) # MW
        features.append((aa_counts['F'] + aa_counts['W'] + aa_counts['Y']) / total) # Aromaticity
        
        if len(sequence) > 1:
            dipeptides = [sequence[i:i+2] for i in range(len(sequence)-1)]
            dipeptide_counts = Counter(dipeptides)
            most_common = dipeptide_counts.most_common(10)
            for i in range(10):
                if i < len(most_common):
                    features.append(most_common[i][1] / (total - 1))
                else:
                    features.append(0)
        else:
            features.extend([0] * 10)
            
        return features
    
    def smiles_to_features(self, smiles):
        """Convert SMILES to 17 numerical features"""
        features = []
        features.append(len(smiles)) # Length
        features.append(smiles.count('C')) # Carbons
        features.append(smiles.count('O')) # Oxygens
        features.append(smiles.count('N')) # Nitrogens
        features.append(smiles.count('S')) # Sulfurs
        features.append(smiles.count('P')) # Phosphorus
        features.append(smiles.count('=')) # Double bonds
        features.append(smiles.count('#')) # Triple bonds
        features.append(smiles.count('(')) # Branches
        features.append(smiles.count('[')) # Atoms in brackets
        features.append(smiles.count('@')) # Chirality
        
        for i in range(1, 7):
            features.append(smiles.count(str(i)))
            
        return features
    
    def combine_features(self, protein_features, drug_features):
        """Combine protein and drug features"""
        return protein_features + drug_features

def prepare_ml_features(df):
    """Convert dataframe to ML-ready feature matrix"""
    print("\n" + "="*80)
    print("STEP 3: EXTRACTING ML FEATURES")
    print("="*80)
    
    extractor = FeatureExtractor()
    
    total = len(df)
    print(f"\nüî¨ Processing {total:,} samples...")
    
    # ### --- CHANGED --- ### Replaced .apply() with .progress_apply()
    print("    Extracting protein features (with progress bar)...")
    prot_features = df['protein_sequence'].progress_apply(extractor.protein_to_features)
    
    print("    Extracting drug features (with progress bar)...")
    drug_features = df['drug_smiles'].progress_apply(extractor.smiles_to_features)
    
    print("    Combining features...")
    df_features = pd.DataFrame({
        'prot': prot_features,
        'drug': drug_features,
        'y_class': df['binds'],
        'y_reg': df['binding_affinity']
    })
    
    df_features = df_features.dropna().reset_index(drop=True)
    
    failed = total - len(df_features)
    
    # ### --- CHANGED --- ### Added progress bar for final combination
    X_list = df_features.progress_apply(lambda row: extractor.combine_features(row['prot'], row['drug']), axis=1)
    X = np.array(X_list.tolist())
    y_class = df_features['y_class'].values
    y_reg = df_features['y_reg'].values
    
    print(f"\n    ‚úì Successfully processed: {len(X):,} samples")
    if failed > 0:
        print(f"    ‚ö† Failed to process: {failed} samples ({failed/total*100:.1f}%)")
    
    print(f"\n‚úÖ FEATURE EXTRACTION COMPLETE:")
    print(f"    Feature matrix shape: {X.shape}")
    print(f"    Memory: ~{X.nbytes / 1024**2:.1f} MB")
    
    gc.collect()
    
    # ### --- CHANGED --- ###
    # Save the outputs of this step for faster re-runs
    print("\nüíæ Saving feature-extracted data for faster re-runs...")
    try:
        np.save('X_features.npy', X)
        print("    ‚úì Saved X_features.npy")
        np.save('y_class.npy', y_class)
        print("    ‚úì Saved y_class.npy")
        np.save('y_reg.npy', y_reg)
        print("    ‚úì Saved y_reg.npy")
        
        with open('feature_extractor.pkl', 'wb') as f:
            pickle.dump(extractor, f)
        print("    ‚úì Saved feature_extractor.pkl")
    except Exception as e:
        print(f"    ‚ö† Warning: Could not save feature files: {e}")
    # ### --- END CHANGED --- ###
    
    return X, y_class, y_reg, extractor

# ==================== STEP 4: TRAIN ML MODELS (XGBOOST GPU) ====================

def train_models(X, y_class, y_reg, extractor):
    """Train XGBoost models on the GPU"""
    print("\n" + "="*80)
    print("STEP 4: TRAINING MACHINE LEARNING MODELS")
    print("        üöÄ STATUS: XGBoost (Native Windows GPU) üöÄ")
    print("="*80)
    
    print("\nüìä Scaling features...")
    scaler = StandardScaler() 
    X_scaled = scaler.fit_transform(X)
    print("    ‚úì Features scaled (mean=0, std=1)")
    
    # ===== TRAIN CLASSIFIER =====
    print("\n" + "-"*80)
    print("TRAINING XGBoost CLASSIFIER (Binds: Yes/No)")
    print("-"*80)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_class, test_size=0.2, random_state=42, stratify=y_class
    )
    
    print(f"\n    Training set: {len(X_train):,} samples")
    print(f"    Test set: {len(X_test):,} samples")
    
    
    # ### --- CHANGED --- ### Tuned parameters for higher GPU utilization
    classifier = xgb.XGBClassifier(
        tree_method='gpu_hist',
        n_estimators=1000,          # From 200 -> 1000 (5x more work)
        max_depth=12,               # From 10 -> 12 (deeper)
        learning_rate=0.05,         # From 0.1 -> 0.05 (more careful steps)
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50    # From 10 -> 50 (more room to improve)
    )
    
    print("\n    üå≤ Training XGBoost Classifier...")
    print("      (Using 5070 Ti... This will take a few minutes)")
    
    # ### --- CHANGED --- ### Set verbose=50 to show progress
    classifier.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=50 # Print progress every 50 trees
    )
    print("    ‚úì Training complete!")
    
    train_acc = accuracy_score(y_train, classifier.predict(X_train))
    test_acc = accuracy_score(y_test, classifier.predict(X_test))
    y_pred = classifier.predict(X_test)
    
    print(f"\n    üìà CLASSIFIER RESULTS:")
    print(f"       Training Accuracy: {train_acc:.3f} ({train_acc*100:.1f}%)")
    print(f"       Testing Accuracy: {test_acc:.3f} ({test_acc*100:.1f}%)")
    print(f"\n    Classification Report:")
    print(classification_report(y_test, y_pred,
                                target_names=['No Binding', 'Binds'],
                                digits=3))
    
    # ===== TRAIN REGRESSOR =====
    print("\n" + "-"*80)
    print("TRAINING XGBoost REGRESSOR (Binding Affinity Prediction)")
    print("-"*80)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_reg, test_size=0.2, random_state=42
    )
    
    print(f"\n    Training set: {len(X_train):,} samples")
    print(f"    Test set: {len(X_test):,} samples")
    
    # ### --- CHANGED --- ### Tuned parameters for higher GPU utilization
    regressor = xgb.XGBRegressor(
        tree_method='gpu_hist',
        n_estimators=1000,          # From 200 -> 1000
        max_depth=12,               # From 10 -> 12
        learning_rate=0.05,         # From 0.1 -> 0.05
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50    # From 10 -> 50
    )
    
    print("\n    üå≤ Training XGBoost Regressor...")
    print("      (Using 5070 Ti... This will take a few minutes)")
    
    # ### --- CHANGED --- ### Set verbose=50 to show progress
    regressor.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=50 # Print progress every 50 trees
    )
    print("    ‚úì Training complete!")
    
    train_r2 = r2_score(y_train, regressor.predict(X_train))
    test_r2 = r2_score(y_test, regressor.predict(X_test))
    y_pred = regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = np.mean(np.abs(y_test - y_pred))
    
    print(f"\n    üìà REGRESSOR RESULTS:")
    print(f"       Training R¬≤: {train_r2:.3f}")
    print(f"       Testing R¬≤: {test_r2:.3f}")
    print(f"       RMSE: {rmse:.3f} ŒºM")
    print(f"       MAE: {mae:.3f} ŒºM")
    
    # Save models
    print("\n" + "-"*80)
    print("üíæ SAVING MODELS")
    print("-"*80)
    
    with open('classifier.pkl', 'wb') as f:
        pickle.dump(classifier, f)
    print("    ‚úì classifier.pkl")
    
    with open('regressor.pkl', 'wb') as f:
        pickle.dump(regressor, f)
    print("    ‚úì regressor.pkl")
    
    with open('scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    print("    ‚úì scaler.pkl")
    
    # ### --- CHANGED --- ###
    # Removed saving 'feature_extractor.pkl', as it's now saved in Step 3
    #
    # with open('feature_extractor.pkl', 'wb') as f:
    #     pickle.dump(extractor, f)
    # print(" 	‚úì feature_extractor.pkl")
    # ### --- END CHANGED --- ###
        
    return classifier, regressor, scaler

# ==================== MAIN EXECUTION ====================

def main():
    """Main training pipeline"""
    bindingdb_file = 'BindingDB_All.tsv'
    
    # ### --- CHANGED --- ###
    # Define all our intermediate and final file names
    cleaned_file = 'cleaned_bindingdb_data.csv'
    features_X_file = 'X_features.npy'
    features_y_class_file = 'y_class.npy'
    features_y_reg_file = 'y_reg.npy'
    extractor_file = 'feature_extractor.pkl'
    
    SAMPLE_SIZE = 1_500_000 
    
    X, y_class, y_reg, extractor = None, None, None, None
    df = None

    # === CHECK 1: SKIP TO STEP 4 (Training) ===
    # Check if the final feature files from Step 3 already exist
    if all(os.path.exists(f) for f in [features_X_file, features_y_class_file, features_y_reg_file, extractor_file]):
        print("\n" + "="*80)
        print("‚ôªÔ∏è SKIPPING TO STEP 4: Found pre-computed feature files!")
        print(f"    Loading {features_X_file}, {features_y_class_file}, etc...")
        try:
            X = np.load(features_X_file)
            y_class = np.load(features_y_class_file)
            y_reg = np.load(features_y_reg_file)
            with open(extractor_file, 'rb') as f:
                extractor = pickle.load(f)
            print(f"    ‚úì Loaded features with shape: {X.shape}")
            
            # We also need the extractor for the train_models function
            if extractor is None:
                print("    ...Missing feature_extractor.pkl. Re-computing...")
                X = None # Force re-computation
            
        except Exception as e:
            print(f"    ‚ùå Error loading feature files: {e}. Re-computing...")
            X = None # Force re-computation

    # === CHECK 2: SKIP TO STEP 3 (Feature Extraction) ===
    # If we couldn't load features, check for the cleaned CSV from Step 2
    if X is None and os.path.exists(cleaned_file):
        print("\n" + "="*80)
        print(f"‚ôªÔ∏è SKIPPING TO STEP 3: Found {cleaned_file}!")
        print(f"    Loading {cleaned_file}...")
        try:
            df = pd.read_csv(cleaned_file)
            if len(df) < 1000:
                print("    ...File is too small. Re-running from start.")
                df = None # Force re-run
            else:
                print(f"    ‚úì Loaded {len(df):,} cleaned samples.")
        except Exception as e:
            print(f"    ‚ùå Error loading {cleaned_file}: {e}. Re-running from start...")
            df = None # Force re-run

    # === CHECK 3: Run from STEP 1 (Full Load) ===
    # If both previous checks failed, run the full pipeline
    if df is None and X is None:
        print("\n" + "="*80)
        print("‚ñ∂Ô∏è STARTING FROM STEP 1: No valid cached data found.")
        print("="*80)
        print(f"\nüîç Looking for: {bindingdb_file}")
        
        if not os.path.exists(bindingdb_file):
            print(f"\n‚ùå File not found! Please ensure {bindingdb_file} is in this folder.")
            return
            
        print(f"    ‚úì Found: {bindingdb_file}")
        
        df = load_bindingdb_data(bindingdb_file, sample_size=SAMPLE_SIZE)
        if df is None or len(df) == 0:
            print("\n‚ùå Failed to load valid data. Exiting...")
            return
            
        df = clean_bindingdb_data(df)
        if df is None or len(df) < 1000:
            print(f"\n‚ùå Not enough valid data ({len(df) if df is not None else 0} samples).")
            return
    
    # --- RUN STEP 3 (if needed) ---
    if X is None:
        # This will run if EITHER Step 1+2 ran, OR Step 2 was skipped
        X, y_class, y_reg, extractor = prepare_ml_features(df)
    
    # --- RUN STEP 4 (Training) ---
    # This will always run, using loaded features or newly computed ones
    if X is None:
        print("\n‚ùå Critical error: Feature matrix (X) is still None. Exiting.")
        return
        
    classifier, regressor, scaler = train_models(X, y_class, y_reg, extractor)
    
    print("\n" + "="*80)
    print(" ‚úÖ TRAINING COMPLETE! (XGBoost GPU - v4)")
    print("="*80)
    print("\nüì¶ Created files:")
    print("    ‚úì classifier.pkl")
    print("    ‚úì regressor.pkl")
    print("    ‚úì scaler.pkl")
    print("    ‚úì feature_extractor.pkl")
    print("    ‚úì X_features.npy, y_class.npy, y_reg.npy (for re-runs)")
    print("    ‚úì cleaned_bindingdb_data.csv")
    
    print("\nüöÄ Next steps:")
    print("    1. Your models are ready to use!")
    print("    2. Your app can now load these .pkl files.")
    
    print("\n" + "="*80 + "\n")
    # ### --- END CHANGED --- ###

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n‚ö† Training interrupted by user")
    except Exception as e:
        print(f"\n\n‚ùå UNEXPECTED ERROR:")
        print(f"    {e}")
        print("\nFull traceback:")
        import traceback
        traceback.print_exc()

 üöÄ DRUG-TARGET INTERACTION ML TRAINING (XGBoost GPU - v4 WORKSTATION) üöÄ
 Using BindingDB_All.tsv Dataset
 Hardware: 32GB RAM, Intel Ultra 9, NVIDIA 5070 Ti
 Mode: Native Windows (XGBoost) with Skip-Logic

‚ôªÔ∏è SKIPPING TO STEP 4: Found pre-computed feature files!
    Loading X_features.npy, y_class.npy, etc...
    ‚úì Loaded features with shape: (1122930, 50)

STEP 4: TRAINING MACHINE LEARNING MODELS
        üöÄ STATUS: XGBoost (Native Windows GPU) üöÄ

üìä Scaling features...
    ‚úì Features scaled (mean=0, std=1)

--------------------------------------------------------------------------------
TRAINING XGBoost CLASSIFIER (Binds: Yes/No)
--------------------------------------------------------------------------------

    Training set: 898,344 samples
    Test set: 224,586 samples

    üå≤ Training XGBoost Classifier...
      (Using 5070 Ti... This will take a few minutes)
[0]	validation_0-logloss:0.41185
[50]	validation_0-logloss:0.27284
[100]	validation_0-logloss:0.2495