# 🚀 XGBoost Regression: DAT Binding Prediction

**Goal**: Predict pKi values (binding strength) using XGBoost with train/test split

**Dataset**: 541 compounds with RDKit descriptors  
**Target**: pKi (continuous variable)  
**Method**: XGBoost Regression + 80/20 Train/Test Split + Early Stopping

**Key Differences from Random Forest:**
- Uses gradient boosting (sequential trees) instead of bagging
- 80/20 split for proper test set evaluation
- Early stopping to prevent overfitting
- More hyperparameters to tune

---


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import randint, uniform

print("‚úÖ Libraries imported successfully!")


## 📂 Step 1: Load Processed Data from Analysis

**Source:** `processed_DAT_rdkit_features.csv` (from dataanalyse.ipynb)

This ensures we use the **same RDKit features** across all models!


In [None]:
# Read the RDKit descriptor table exported by the data-analysis notebook.
df_rdkit = pd.read_csv('processed_DAT_rdkit_features.csv')

rule_60 = "=" * 60

# Dataset overview. Two columns (identifier and target) are not descriptors,
# hence the "- 2" in the feature count.
print(rule_60)
print("üìÇ LOADED PROCESSED DATA FROM ANALYSIS")
print(rule_60)
print(f"‚úÖ Dataset: {df_rdkit.shape[0]} compounds")
print(f"‚úÖ Features: {df_rdkit.shape[1]-2} RDKit descriptors")
print("‚úÖ Source: dataanalyse.ipynb (same features as other models!)")

# Summary statistics of the regression target.
print("\nüìä pKi distribution:")
pki_column = df_rdkit['pKi']
print(f"   Min: {pki_column.min():.2f}")
print(f"   Max: {pki_column.max():.2f}")
print(f"   Mean: {pki_column.mean():.2f}")
print(f"   Median: {pki_column.median():.2f}")

# Descriptor columns are everything except the identifier and the target.
print("\nüî¨ Available features:")
descriptor_names = [
    name for name in df_rdkit.columns if name not in ('ChEMBL_ID', 'pKi')
]
print(descriptor_names)
print(rule_60)

# Preview of the first rows.
print(df_rdkit.head())


## 🔧 Step 2: Prepare Features & Split Data (80/20)

**Key Difference from RF:** We use an 80/20 train/test split instead of cross-validation only.

This allows:
- Proper held-out test set evaluation
- Early stopping monitoring
- Fair comparison with Neural Networks


In [None]:
# Build the design matrix X (all descriptor columns) and the target vector y.
# The compound identifier and the target itself are removed from X.
y = df_rdkit['pKi']
X = df_rdkit.drop(columns=['ChEMBL_ID', 'pKi'])

rule_60 = "=" * 60
print(rule_60)
print("üîß FEATURE PREPARATION")
print(rule_60)
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nFeature names: {X.columns.tolist()}")
print(rule_60)


In [None]:
# Shuffle and split the data 80/20; the fixed seed keeps the split reproducible.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition and the target range it covers.
n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
rule_60 = "=" * 60

print(rule_60)
print("üìä TRAIN-TEST SPLIT (80/20)")
print(rule_60)
print(f"Training set: {n_train} compounds ({n_train/n_total*100:.1f}%)")
print(f"Test set: {n_test} compounds ({n_test/n_total*100:.1f}%)")
print(f"\nTraining pKi range: {y_train.min():.2f} - {y_train.max():.2f}")
print(f"Test pKi range: {y_test.min():.2f} - {y_test.max():.2f}")
print(rule_60)


In [None]:
# Standardise the descriptors. The scaler learns its mean/variance from the
# training split only and the same transform is then applied to the test
# split, so no test-set statistics enter the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Features scaled using StandardScaler")
for line in (
    f"   Training set scaled: {X_train_scaled.shape}",
    f"   Test set scaled: {X_test_scaled.shape}",
):
    print(line)


## 🎯 Step 3: Train Baseline XGBoost Model

**Baseline Configuration:**
- `n_estimators=1000` (will use early stopping)
- `learning_rate=0.1` (lower than XGBoost's default of 0.3)
- `max_depth=6` (default)
- Early stopping with 50 rounds patience


In [None]:
# Baseline XGBoost model.
#
# n_estimators is only an upper bound: early stopping halts training once the
# validation metric has not improved for 50 consecutive rounds.
# early_stopping_rounds is set on the estimator because passing it to fit()
# was deprecated in XGBoost 1.6 and removed in 2.0 (requires xgboost >= 1.6).
xgb_baseline = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)

print("üöÄ Training Baseline XGBoost Model with Early Stopping...")
print("   This may take a minute...\n")

# Early stopping must not be monitored on the test set: choosing the number
# of trees from test performance leaks test information into the model and
# makes the reported test metrics optimistic. A validation subset is carved
# out of the training data instead, so the test set stays untouched until
# evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

xgb_baseline.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print(f"‚úÖ Training completed!")
print(f"   Best iteration: {xgb_baseline.best_iteration}")
# Report the boosting rounds actually run, not the configured cap
# (n_estimators), which is 1000 regardless of when training stopped.
print(f"   Total trees trained: {xgb_baseline.get_booster().num_boosted_rounds()}")


## 📊 Step 4: Evaluate Baseline Model


In [None]:
# Score the baseline model on the training split and on the test split.
y_train_pred = xgb_baseline.predict(X_train_scaled)
y_test_pred = xgb_baseline.predict(X_test_scaled)

# R-squared, RMSE (square root of the MSE) and MAE for each split.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Side-by-side metric table.
heavy_rule = "=" * 70
light_rule = "-" * 70
print(heavy_rule)
print("üìä BASELINE XGBOOST MODEL PERFORMANCE")
print(heavy_rule)
print(f"\n{'Metric':<20} {'Training Set':<20} {'Test Set':<20}")
print(light_rule)
print(f"{'R¬≤ Score':<20} {train_r2:<20.4f} {test_r2:<20.4f}")
print(f"{'RMSE':<20} {train_rmse:<20.4f} {test_rmse:<20.4f}")
print(f"{'MAE':<20} {train_mae:<20.4f} {test_mae:<20.4f}")
print(light_rule)

# The train-minus-test R-squared gap is used as a simple overfitting signal:
# above 0.1 is flagged, above 0.05 is "mild", anything else is accepted.
overfit_r2 = train_r2 - test_r2
print("\nüîç Overfitting Analysis:")
print(f"   R¬≤ difference (train - test): {overfit_r2:.4f}")
if overfit_r2 > 0.1:
    verdict = "   ‚ö†Ô∏è  Potential overfitting detected!"
elif overfit_r2 > 0.05:
    verdict = "   ‚ö° Mild overfitting"
else:
    verdict = "   ‚úÖ Good generalization!"
print(verdict)
print(heavy_rule)


## 📈 Step 5: Visualize Predictions


In [None]:
# Predicted-vs-actual scatter plots for the baseline model, one panel per split.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
train_ax, test_ax = axes

# Styling shared by both panels.
point_style = dict(alpha=0.6, edgecolors='black', s=50)
axis_label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=13, fontweight='bold')

# Left panel: training split. The dashed diagonal marks predicted == actual.
train_lo, train_hi = y_train.min(), y_train.max()
train_ax.scatter(y_train, y_train_pred, **point_style)
train_ax.plot([train_lo, train_hi], [train_lo, train_hi],
              'r--', lw=2, label='Perfect Prediction')
train_ax.set_xlabel('Actual pKi', **axis_label_font)
train_ax.set_ylabel('Predicted pKi', **axis_label_font)
train_ax.set_title(
    f'Training Set (n={len(y_train)})\nR¬≤ = {train_r2:.4f}, RMSE = {train_rmse:.4f}',
    **title_font,
)
train_ax.legend()
train_ax.grid(alpha=0.3)

# Right panel: test split, drawn in orange to tell it apart.
test_lo, test_hi = y_test.min(), y_test.max()
test_ax.scatter(y_test, y_test_pred, color='orange', **point_style)
test_ax.plot([test_lo, test_hi], [test_lo, test_hi],
             'r--', lw=2, label='Perfect Prediction')
test_ax.set_xlabel('Actual pKi', **axis_label_font)
test_ax.set_ylabel('Predicted pKi', **axis_label_font)
test_ax.set_title(
    f'Test Set (n={len(y_test)})\nR¬≤ = {test_r2:.4f}, RMSE = {test_rmse:.4f}',
    **title_font,
)
test_ax.legend()
test_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Residuals (actual minus predicted) plotted against the prediction; a model
# without systematic bias scatters evenly around the dashed zero line.
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
train_ax, test_ax = axes

# Styling shared by both panels.
point_style = dict(alpha=0.6, edgecolors='black', s=50)
zero_line_style = dict(y=0, color='r', linestyle='--', lw=2)
axis_label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=13, fontweight='bold')

# Left panel: training split.
train_ax.scatter(y_train_pred, train_residuals, **point_style)
train_ax.axhline(**zero_line_style)
train_ax.set_xlabel('Predicted pKi', **axis_label_font)
train_ax.set_ylabel('Residuals (Actual - Predicted)', **axis_label_font)
train_ax.set_title('Training Set Residuals', **title_font)
train_ax.grid(alpha=0.3)

# Right panel: test split, drawn in orange.
test_ax.scatter(y_test_pred, test_residuals, color='orange', **point_style)
test_ax.axhline(**zero_line_style)
test_ax.set_xlabel('Predicted pKi', **axis_label_font)
test_ax.set_ylabel('Residuals (Actual - Predicted)', **axis_label_font)
test_ax.set_title('Test Set Residuals', **title_font)
test_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


## 🎯 Step 6: Feature Importance Analysis


In [None]:
# Pair each descriptor name with the baseline model's importance score and
# rank the descriptors from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_baseline.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

rule_60 = "=" * 60
print(rule_60)
print("üéØ FEATURE IMPORTANCE (Baseline XGBoost)")
print(rule_60)
print(feature_importance.to_string(index=False))
print(rule_60)

# Horizontal bar chart of the ranking.
plt.figure(figsize=(10, 6))
importance_ax = plt.gca()
importance_ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='steelblue',
    edgecolor='black',
    alpha=0.7,
)
importance_ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
importance_ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
importance_ax.set_title('Feature Importance - Baseline XGBoost Model',
                        fontsize=14, fontweight='bold')
# barh draws the first row at the bottom; flip so the top-ranked one is on top.
importance_ax.invert_yaxis()
importance_ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


## 🔧 Step 7: Hyperparameter Tuning with RandomizedSearchCV

**Strategy:** Use RandomizedSearch to find optimal hyperparameters

**Key Parameters to Tune:**
- `n_estimators`: Number of boosting rounds
- `learning_rate`: Step size for updates
- `max_depth`: Tree complexity
- `min_child_weight`: Minimum sum of instance weights required in a child node
- `subsample`: Row sampling
- `colsample_bytree`: Column sampling
- `gamma`: Minimum loss reduction for split
- `reg_alpha`: L1 regularization
- `reg_lambda`: L2 regularization


In [None]:
# Search space for the randomized search.
# scipy's randint(low, high) draws integers from [low, high); uniform(loc,
# scale) draws floats from [loc, loc + scale], so the second argument is a
# width, not an upper bound.
param_distributions = {
    'n_estimators': randint(100, 1000),      # 100 .. 999
    'learning_rate': uniform(0.01, 0.3),     # 0.01 .. 0.31
    'max_depth': randint(3, 10),             # 3 .. 9
    'min_child_weight': randint(1, 7),       # 1 .. 6
    'subsample': uniform(0.6, 0.4),          # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),   # 0.6 .. 1.0
    'gamma': uniform(0, 0.5),                # 0 .. 0.5
    'reg_alpha': uniform(0, 1),              # 0 .. 1
    'reg_lambda': uniform(0, 2)              # 0 .. 2
}

# Estimator to tune. It is kept single-threaded because the search below
# already parallelises across candidates and folds (n_jobs=-1); letting every
# worker also spawn one thread per core oversubscribes the CPU and slows the
# whole search down.
xgb_tuning = XGBRegressor(random_state=42, n_jobs=1)

# Randomized search: 100 sampled configurations, each scored by 5-fold CV R^2
# on the training data only.
random_search = RandomizedSearchCV(
    estimator=xgb_tuning,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='r2',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

print("="*70)
print("üîç HYPERPARAMETER TUNING - RANDOMIZED SEARCH")
print("="*70)
# Read the counts from the search object so the banner cannot drift from the
# configuration above.
print(f"Total iterations: {random_search.n_iter}")
print(f"Cross-validation: {random_search.cv}-fold")
print(f"Scoring metric: R¬≤")
print(f"\nüöÄ Starting search... (this will take several minutes)\n")

# Run the search; with the default refit=True the best configuration is
# refitted on the full training split afterwards.
random_search.fit(X_train_scaled, y_train)

print("\n" + "="*70)
print("‚úÖ HYPERPARAMETER TUNING COMPLETED!")
print("="*70)
print(f"\nüèÜ Best Parameters:")
for param, value in random_search.best_params_.items():
    print(f"   {param:<20s}: {value}")
print(f"\nüéØ Best Cross-Validation R¬≤ Score: {random_search.best_score_:.4f}")
print("="*70)


In [None]:
# The search refits the winning configuration on the whole training split;
# that refitted estimator is the tuned model evaluated here.
best_xgb_model = random_search.best_estimator_

y_train_pred_tuned = best_xgb_model.predict(X_train_scaled)
y_test_pred_tuned = best_xgb_model.predict(X_test_scaled)

# R-squared, RMSE (square root of the MSE) and MAE for each split.
train_r2_tuned, test_r2_tuned = (
    r2_score(y_train, y_train_pred_tuned),
    r2_score(y_test, y_test_pred_tuned),
)
train_rmse_tuned = np.sqrt(mean_squared_error(y_train, y_train_pred_tuned))
test_rmse_tuned = np.sqrt(mean_squared_error(y_test, y_test_pred_tuned))
train_mae_tuned = mean_absolute_error(y_train, y_train_pred_tuned)
test_mae_tuned = mean_absolute_error(y_test, y_test_pred_tuned)

# Side-by-side metric table.
heavy_rule = "=" * 70
light_rule = "-" * 70
print(heavy_rule)
print("üìä TUNED XGBOOST MODEL PERFORMANCE")
print(heavy_rule)
print(f"\n{'Metric':<20} {'Training Set':<20} {'Test Set':<20}")
print(light_rule)
print(f"{'R¬≤ Score':<20} {train_r2_tuned:<20.4f} {test_r2_tuned:<20.4f}")
print(f"{'RMSE':<20} {train_rmse_tuned:<20.4f} {test_rmse_tuned:<20.4f}")
print(f"{'MAE':<20} {train_mae_tuned:<20.4f} {test_mae_tuned:<20.4f}")
print(light_rule)

# The train-minus-test R-squared gap is used as a simple overfitting signal:
# above 0.1 is flagged, above 0.05 is "mild", anything else is accepted.
overfit_r2_tuned = train_r2_tuned - test_r2_tuned
print("\nüîç Overfitting Analysis:")
print(f"   R¬≤ difference (train - test): {overfit_r2_tuned:.4f}")
if overfit_r2_tuned > 0.1:
    verdict_tuned = "   ‚ö†Ô∏è  Potential overfitting detected!"
elif overfit_r2_tuned > 0.05:
    verdict_tuned = "   ‚ö° Mild overfitting"
else:
    verdict_tuned = "   ‚úÖ Good generalization!"
print(verdict_tuned)
print(heavy_rule)


## 📊 Step 8: Compare Baseline vs Tuned Model


In [None]:
# One row per model, one column per metric and split.
comparison_df = pd.DataFrame({
    'Model': ['Baseline XGBoost', 'Tuned XGBoost'],
    'Train R¬≤': [train_r2, train_r2_tuned],
    'Test R¬≤': [test_r2, test_r2_tuned],
    'Train RMSE': [train_rmse, train_rmse_tuned],
    'Test RMSE': [test_rmse, test_rmse_tuned],
    'Train MAE': [train_mae, train_mae_tuned],
    'Test MAE': [test_mae, test_mae_tuned]
})

rule_80 = "=" * 80
print(rule_80)
print("üìä MODEL COMPARISON: Baseline vs Tuned")
print(rule_80)
print(comparison_df.to_string(index=False))
print(rule_80)

# Relative change on the test set, in percent of the baseline value. The sign
# is arranged so that positive always means "tuned is better": R-squared
# should go up, the error metrics should go down. A zero baseline would make
# the ratio undefined, so the improvement is reported as 0 in that case.
if test_r2 != 0:
    r2_improvement = (test_r2_tuned - test_r2) / abs(test_r2) * 100
else:
    r2_improvement = 0

if test_rmse != 0:
    rmse_improvement = (test_rmse - test_rmse_tuned) / test_rmse * 100
else:
    rmse_improvement = 0

if test_mae != 0:
    mae_improvement = (test_mae - test_mae_tuned) / test_mae * 100
else:
    mae_improvement = 0

print("\n‚ú® Improvements on Test Set:")
print(f"   R¬≤ improvement: {r2_improvement:+.2f}%")
print(f"   RMSE improvement: {rmse_improvement:+.2f}%")
print(f"   MAE improvement: {mae_improvement:+.2f}%")
print(rule_80)


In [None]:
# Bar charts comparing baseline and tuned model on the test set, one panel
# per metric.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metrics = ['R¬≤', 'RMSE', 'MAE']
baseline_vals = [test_r2, test_rmse, test_mae]
tuned_vals = [test_r2_tuned, test_rmse_tuned, test_mae_tuned]

# Loop-invariant bar labels and colours. They are deliberately NOT named
# x / y: this code runs in the notebook's global namespace, and rebinding
# `y` here would overwrite the target Series defined earlier, breaking any
# cell that is (re-)run afterwards.
bar_labels = ['Baseline', 'Tuned']
bar_colors = ['steelblue', 'forestgreen']

for ax, metric, baseline, tuned in zip(axes, metrics, baseline_vals, tuned_vals):
    bars = ax.bar(bar_labels, [baseline, tuned],
                  color=bar_colors, edgecolor='black', alpha=0.7)
    ax.set_ylabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric} Comparison (Test Set)', fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

    # Print each bar's value just above its top edge.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.4f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()


## 🎯 Step 9: Classification Performance (Confusion Matrix)

Convert continuous predictions to categories:
- **Low**: pKi < 6.0
- **Medium**: 6.0 ≤ pKi < 8.0
- **High**: pKi ≥ 8.0


In [None]:
# Define classification function
def classify_pKi(pKi_values):
    """Convert continuous pKi values to binding-strength categories.

    Thresholds: values below 6.0 are 'Low', values from 6.0 up to (but not
    including) 8.0 are 'Medium', and everything else is 'High'. NaN compares
    false against both thresholds and therefore also ends up as 'High'.

    Args:
        pKi_values: Any one-dimensional iterable of numbers (list, NumPy
            array, pandas Series, generator, ...).

    Returns:
        A NumPy string array with one category label per input value. The
        result is a string array even for empty input.
    """
    # fromiter accepts every iterable the previous element-wise loop did.
    values = np.fromiter(pKi_values, dtype=float)
    # Vectorised thresholds; evaluated in the same order as an if/elif chain.
    return np.where(values < 6.0, 'Low',
                    np.where(values < 8.0, 'Medium', 'High'))

# Bin the true test targets and both models' test predictions into the
# Low / Medium / High categories.
y_test_categorical, y_test_pred_baseline_cat, y_test_pred_tuned_cat = map(
    classify_pKi, (y_test, y_test_pred, y_test_pred_tuned)
)

# Class balance of the true test labels.
rule_60 = "=" * 60
print(rule_60)
print("üìä TEST SET CLASSIFICATION DISTRIBUTION")
print(rule_60)
print(f"Low Binders (pKi < 6.0): {np.count_nonzero(y_test_categorical == 'Low')} compounds")
print(f"Medium Binders (6.0 ‚â§ pKi < 8.0): {np.count_nonzero(y_test_categorical == 'Medium')} compounds")
print(f"High Binders (pKi ‚â• 8.0): {np.count_nonzero(y_test_categorical == 'High')} compounds")
print(rule_60)


In [None]:
# Fixed class order so that rows/columns line up between both matrices, the
# heatmap tick labels and the text reports.
class_labels = ['Low', 'Medium', 'High']

cm_baseline = confusion_matrix(y_test_categorical, y_test_pred_baseline_cat, labels=class_labels)
cm_tuned = confusion_matrix(y_test_categorical, y_test_pred_tuned_cat, labels=class_labels)

# Accuracy in percent: correctly classified compounds (the matrix diagonal)
# over all compounds.
baseline_acc = np.trace(cm_baseline) / cm_baseline.sum() * 100
tuned_acc = np.trace(cm_tuned) / cm_tuned.sum() * 100

# Two heatmaps side by side: baseline on the left, tuned on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
baseline_ax, tuned_ax = axes

heatmap_options = dict(
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'},
)
axis_label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=13, fontweight='bold')

sns.heatmap(cm_baseline, cmap='Blues', ax=baseline_ax, **heatmap_options)
baseline_ax.set_xlabel('Predicted Category', **axis_label_font)
baseline_ax.set_ylabel('Actual Category', **axis_label_font)
baseline_ax.set_title(f'Baseline XGBoost - Test Set\nAccuracy: {baseline_acc:.2f}%',
                      **title_font)

sns.heatmap(cm_tuned, cmap='Greens', ax=tuned_ax, **heatmap_options)
tuned_ax.set_xlabel('Predicted Category', **axis_label_font)
tuned_ax.set_ylabel('Actual Category', **axis_label_font)
tuned_ax.set_title(f'Tuned XGBoost - Test Set\nAccuracy: {tuned_acc:.2f}%',
                   **title_font)

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 for both models.
rule_70 = "=" * 70
print("\n" + rule_70)
print("üìä CLASSIFICATION REPORTS")
print(rule_70)
print("\nüîµ Baseline XGBoost:")
print(classification_report(y_test_categorical, y_test_pred_baseline_cat, labels=class_labels))
print("\nüü¢ Tuned XGBoost:")
print(classification_report(y_test_categorical, y_test_pred_tuned_cat, labels=class_labels))
print(rule_70)


## 📊 Step 10: Final Summary & Conclusions


In [None]:
# Final report: dataset sizes, tuned-model test metrics, the effect of tuning,
# the top-ranked features, and a few summary statements.
print("="*80)
print("üéØ FINAL SUMMARY - XGBOOST REGRESSION (NO PCA)")
print("="*80)
print(f"\nüìä Dataset:")
print(f"   Total compounds: {len(df_rdkit)}")
print(f"   Training set: {len(X_train)} (80%)")
print(f"   Test set: {len(X_test)} (20%)")
print(f"   Features: {X.shape[1]} RDKit descriptors (no PCA)")

print(f"\nüèÜ Best Model Performance (Test Set):")
print(f"   R¬≤ Score: {test_r2_tuned:.4f}")
print(f"   RMSE: {test_rmse_tuned:.4f}")
print(f"   MAE: {test_mae_tuned:.4f}")
print(f"   Classification Accuracy: {tuned_acc:.2f}%")

print(f"\n‚ú® Improvements from Tuning:")
print(f"   R¬≤ improvement: {r2_improvement:+.2f}%")
print(f"   RMSE improvement: {rmse_improvement:+.2f}%")
print(f"   MAE improvement: {mae_improvement:+.2f}%")

# feature_importance is already sorted in descending order, so its first
# rows are the top-ranked features; head(3) copes with fewer than 3 rows.
print(f"\nüîç Top 3 Most Important Features:")
top_features = feature_importance.head(3)
for rank, (feat_name, feat_score) in enumerate(
        zip(top_features['Feature'], top_features['Importance']), start=1):
    print(f"   {rank}. {feat_name}: {feat_score:.4f}")

# Describe the tuning outcome truthfully: a negative R^2 change means the
# tuned model is worse on the test set and must not be reported as
# "maintained".
if r2_improvement > 0:
    tuning_effect = 'improved'
elif r2_improvement < 0:
    tuning_effect = 'reduced'
else:
    tuning_effect = 'maintained'

# Same thresholds as the overfitting analysis of the evaluation cells:
# a train/test R^2 gap above 0.1 is flagged, above 0.05 is mild.
if overfit_r2_tuned > 0.1:
    generalization = 'poorly'
elif overfit_r2_tuned > 0.05:
    generalization = 'reasonably'
else:
    generalization = 'well'

print(f"\nüí° Key Insights:")
print(f"   ‚Ä¢ XGBoost with 80/20 split allows proper test set evaluation")
print(f"   ‚Ä¢ Early stopping prevents overfitting")
print(f"   ‚Ä¢ Hyperparameter tuning {tuning_effect} performance")
print(f"   ‚Ä¢ Model generalizes {generalization} to unseen data")

print("\n" + "="*80)
print("‚úÖ Analysis Complete!")
print("="*80)
