## **Modeling Trials**

### **Modeling without optimization discarded due to time constraints**

In [None]:
# Model Training and Optimization - 1st Iteration
# ============================================================================

def train_optimized_model(model_name, objective_func, n_trials=50):
    """Run an Optuna minimisation study for one model and report progress.

    Args:
        model_name: Label used in console messages and in the study name.
        objective_func: Callable that receives an Optuna trial and returns the
            value to minimise (printed here as an MSE).
        n_trials: Number of trials handed to ``study.optimize``.

    Returns:
        The Optuna study after ``optimize`` has finished.

    Raises:
        ValueError: From ``study.best_value`` in the closing summary when no
            trial completed successfully.
    """
    print(f"\n🔄 Optimizing {model_name}...")
    print(f"   Running {n_trials} optimization trials...")

    # Seeded TPE sampler so repeated runs propose the same trial sequence
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(seed=42),
        study_name=f"{model_name}_optimization"
    )

    def progress_callback(study, trial):
        """Print the running best value every 10th trial."""
        if trial.number % 10 != 0:
            return
        # `study.best_value` raises ValueError until at least one trial has
        # completed. This callback fires on trial 0, so a first trial that is
        # marked failed (e.g. the objective returned NaN) would otherwise
        # abort the whole study from inside a progress message.
        try:
            best_so_far = study.best_value
        except ValueError:
            print(f"   Trial {trial.number}: no completed trial yet")
            return
        print(f"   Trial {trial.number}: Best MSE = {best_so_far:.4f}")

    # Run optimization and time it
    start_time = time.time()
    study.optimize(objective_func, n_trials=n_trials, callbacks=[progress_callback])
    optimization_time = time.time() - start_time

    print(f"✅ {model_name} optimization completed in {optimization_time:.1f} seconds")
    print(f"🏆 Best CV MSE: {study.best_value:.4f}")

    return study

# XGBoost: hyper-parameter search -> refit on the training split -> test metrics
print(f"\n{'='*60}", "🚀 TRAINING XGBOOST MODEL", f"{'='*60}", sep="\n")

xgb_study = train_optimized_model("XGBoost", objective_xgboost, n_trials=50)

# Refit with the winning hyper-parameters plus the device-specific settings
print("🔧 Training final XGBoost model...")
best_xgb_params = dict(xgb_study.best_params)
best_xgb_params.update(
    {'tree_method': 'gpu_hist', 'gpu_id': 0}
    if gpu_config['xgboost_gpu']
    else {'tree_method': 'hist', 'n_jobs': -1}
)

final_xgb = xgb.XGBRegressor(**best_xgb_params)
final_xgb.fit(X_train_processed, y_train)

# Score the refit model on the test split
xgb_pred = final_xgb.predict(X_test_processed)
xgb_mse, xgb_mae, xgb_r2 = (
    metric(y_test, xgb_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"📊 XGBoost Final Results:",
    f"   Test MSE: {xgb_mse:.4f}",
    f"   Test MAE: {xgb_mae:.4f}",
    f"   Test R²:  {xgb_r2:.4f}",
    sep="\n",
)

# LightGBM: hyper-parameter search -> refit on the training split -> test metrics
print(f"\n{'='*60}", "🚀 TRAINING LIGHTGBM MODEL", f"{'='*60}", sep="\n")

lgb_study = train_optimized_model("LightGBM", objective_lightgbm, n_trials=50)

# Refit using exactly the parameters recorded by the best trial
print("🔧 Training final LightGBM model...")
final_lgb = lgb.LGBMRegressor(**lgb_study.best_params)
final_lgb.fit(X_train_processed, y_train)

# Score the refit model on the test split
lgb_pred = final_lgb.predict(X_test_processed)
lgb_mse, lgb_mae, lgb_r2 = (
    metric(y_test, lgb_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"📊 LightGBM Final Results:",
    f"   Test MSE: {lgb_mse:.4f}",
    f"   Test MAE: {lgb_mae:.4f}",
    f"   Test R²:  {lgb_r2:.4f}",
    sep="\n",
)

# CatBoost: hyper-parameter search -> refit on the training split -> test metrics
print(f"\n{'='*60}", "🚀 TRAINING CATBOOST MODEL", f"{'='*60}", sep="\n")

cb_study = train_optimized_model("CatBoost", objective_catboost, n_trials=40)

# Refit with the winning hyper-parameters plus the device-specific settings
print("🔧 Training final CatBoost model...")
best_cb_params = dict(cb_study.best_params)
best_cb_params.update(
    {'task_type': 'GPU'}
    if gpu_config['catboost_gpu']
    else {'task_type': 'CPU', 'thread_count': -1}
)

final_cb = cb.CatBoostRegressor(**best_cb_params)
final_cb.fit(X_train_processed, y_train)

# Score the refit model on the test split
cb_pred = final_cb.predict(X_test_processed)
cb_mse, cb_mae, cb_r2 = (
    metric(y_test, cb_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"📊 CatBoost Final Results:",
    f"   Test MSE: {cb_mse:.4f}",
    f"   Test MAE: {cb_mae:.4f}",
    f"   Test R²:  {cb_r2:.4f}",
    sep="\n",
)


🚀 TRAINING XGBOOST MODEL

🔄 Optimizing XGBoost...
   Running 50 optimization trials...
   Trial 0: Best MSE = 30.8204


[W 2025-06-04 21:56:05,274] Trial 10 failed with parameters: {'n_estimators': 1446, 'max_depth': 4, 'learning_rate': 0.06162202818095362, 'subsample': 0.7044904689037048, 'colsample_bytree': 0.8051010020460994, 'reg_alpha': 1.963435505233873, 'reg_lambda': 4.857589904335454, 'min_child_weight': 7} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-20-87c2f878690b>", line 29, in objective_xgboost
    cv_scores = cross_val_score(
                ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 684, in cross_val_score
    cv_results = 

KeyboardInterrupt: 

In [None]:
# Model Training and Optimization - 2nd Iteration
# ============================================================================
# OPTIMIZED FAST MODEL TRAINING (Space + Speed + GPU Optimized)
# ============================================================================

import gc
import warnings

# Suppress every Python warning raised from this point on in the session
warnings.filterwarnings('ignore')

print("🚀 Starting optimized fast training...")
print("⚡ Optimizations: Reduced trials, GPU-first, memory efficient")

# ============================================================================
# GPU Detection and Verification
# ============================================================================

def verify_gpu_acceleration():
    """Probe whether XGBoost and CatBoost can train with their GPU settings.

    Each library fits a tiny model (5 boosting rounds on a 20-element slice of
    the training data) with GPU training requested. A probe that raises is
    reported on the console and leaves that library flagged ``False``.

    Returns:
        dict with boolean entries ``'xgboost'`` and ``'catboost'``.
    """
    status = {'xgboost': False, 'catboost': False}

    # --- XGBoost probe ---
    try:
        xgb_probe = xgb.XGBRegressor(tree_method='gpu_hist', gpu_id=0, n_estimators=5)
        xgb_probe.fit(X_train_processed[:20], y_train[:20])
    except Exception as err:
        # Only the first 50 characters of the message are shown
        print(f"⚠️  XGBoost GPU failed: {str(err)[:50]}...")
    else:
        status['xgboost'] = True
        print("✅ XGBoost GPU verified")
        del xgb_probe

    # --- CatBoost probe ---
    try:
        cb_probe = cb.CatBoostRegressor(task_type='GPU', iterations=5, verbose=False)
        cb_probe.fit(X_train_processed[:20], y_train[:20])
    except Exception as err:
        print(f"⚠️  CatBoost GPU failed: {str(err)[:50]}...")
    else:
        status['catboost'] = True
        print("✅ CatBoost GPU verified")
        del cb_probe

    # Collect the discarded probe models before the real training starts
    gc.collect()
    return status

gpu_status = verify_gpu_acceleration()

# ============================================================================
# Optimized Objective Functions (Reduced Search Space)
# ============================================================================

def fast_xgboost_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of an XGBoost regressor.

    Hyper-parameters are drawn from a small, mostly categorical grid. The
    training device follows the module-level ``gpu_status['xgboost']`` flag.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    # Suggestions are made in a fixed order so the seeded sampler stays reproducible
    searched = {
        'n_estimators': trial.suggest_categorical('n_estimators', [600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.85, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8, 0.85, 0.9]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.5, 1.0, 2.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1.0, 2.0, 3.0]),
    }

    # GPU histogram builder when the probe succeeded, multi-threaded CPU otherwise
    if gpu_status['xgboost']:
        device = {'tree_method': 'gpu_hist', 'gpu_id': 0}
    else:
        device = {'tree_method': 'hist', 'n_jobs': -1}

    regressor = xgb.XGBRegressor(**searched, random_state=42, **device)

    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    neg_mse_per_fold = cross_val_score(
        regressor, X_train_processed, y_train,
        cv=3, scoring='neg_mean_squared_error', n_jobs=1,
    )
    return -neg_mse_per_fold.mean()

def fast_lightgbm_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of a LightGBM regressor.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'num_leaves': trial.suggest_categorical('num_leaves', [40, 60, 80]),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.85, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8, 0.85, 0.9]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.5, 1.0, 2.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1.0, 2.0, 3.0]),
        # LightGBM ignores `subsample` unless `subsample_freq` > 0 (default 0),
        # so without this the `subsample` search dimension has no effect.
        # It is registered as a single-value suggestion so that it is recorded
        # in `study.best_params` and carried into any model refit from them.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 1),
        'random_state': 42,
        'verbose': -1,
        'n_jobs': -1
    }

    model = lgb.LGBMRegressor(**params)
    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    cv_scores = cross_val_score(model, X_train_processed, y_train, cv=3,
                               scoring='neg_mean_squared_error', n_jobs=1)
    return -cv_scores.mean()

def fast_catboost_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of a CatBoost regressor.

    The training device follows the module-level ``gpu_status['catboost']``
    flag.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    searched = {
        'iterations': trial.suggest_categorical('iterations', [600, 800, 1000]),
        'depth': trial.suggest_int('depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'l2_leaf_reg': trial.suggest_categorical('l2_leaf_reg', [2, 3, 5]),
    }

    # GPU when the probe succeeded, otherwise CPU with thread_count=-1
    if gpu_status['catboost']:
        device = {'task_type': 'GPU'}
    else:
        device = {'task_type': 'CPU', 'thread_count': -1}

    regressor = cb.CatBoostRegressor(**searched, random_seed=42, verbose=False, **device)

    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    neg_mse_per_fold = cross_val_score(
        regressor, X_train_processed, y_train,
        cv=3, scoring='neg_mean_squared_error', n_jobs=1,
    )
    return -neg_mse_per_fold.mean()

# ============================================================================
# Fast Training Function (Reduced Trials)
# ============================================================================

def train_fast_model(model_name, objective_func, n_trials=20):
    """Run a short Optuna minimisation study and report progress.

    Args:
        model_name: Label used in console messages.
        objective_func: Callable that receives an Optuna trial and returns the
            value to minimise (printed here as an MSE).
        n_trials: Number of trials handed to ``study.optimize``.

    Returns:
        The Optuna study after ``optimize`` has finished.

    Raises:
        ValueError: From ``study.best_value`` in the closing summary when no
            trial completed successfully.
    """
    print(f"\n🔄 Fast optimization: {model_name} ({n_trials} trials)")

    # Seeded TPE sampler so repeated runs propose the same trial sequence
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))

    def fast_callback(study, trial):
        """Print the running best value every 5th trial (skipping trial 0)."""
        if trial.number == 0 or trial.number % 5 != 0:
            return
        # `study.best_value` raises ValueError while no trial has completed
        # (e.g. every trial so far returned NaN and was marked failed); a
        # progress message must not abort the study in that case.
        try:
            best_so_far = study.best_value
        except ValueError:
            print(f"   Trial {trial.number}: no completed trial yet")
            return
        print(f"   Trial {trial.number}: Best MSE = {best_so_far:.4f}")

    start_time = time.time()
    study.optimize(objective_func, n_trials=n_trials, callbacks=[fast_callback])
    elapsed = time.time() - start_time

    print(f"✅ {model_name} optimized in {elapsed:.1f}s (Best: {study.best_value:.4f})")
    return study

# ============================================================================
# XGBoost Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST XGBOOST TRAINING")
print(f"{'='*50}")

xgb_study = train_fast_model("XGBoost", fast_xgboost_objective, n_trials=20)

print("🔧 Training final XGBoost...")
best_xgb_params = xgb_study.best_params.copy()

# `best_params` only contains values suggested through the trial. The seed
# that fast_xgboost_objective fixes by hand is re-applied here so the final
# model is fit with the same configuration that was cross-validated.
best_xgb_params['random_state'] = 42

if gpu_status['xgboost']:
    best_xgb_params.update({'tree_method': 'gpu_hist', 'gpu_id': 0})
    print("   Using GPU acceleration")
else:
    best_xgb_params.update({'tree_method': 'hist', 'n_jobs': -1})
    print("   Using CPU (GPU unavailable)")

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_xgb = xgb.XGBRegressor(**best_xgb_params)
final_xgb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
xgb_pred = final_xgb.predict(X_test_processed)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 XGBoost Results (trained in {train_time:.1f}s):")
print(f"   MSE: {xgb_mse:.4f} {'🎯' if 2.0 <= xgb_mse <= 3.0 else '❌'}")
print(f"   MAE: {xgb_mae:.4f}")
print(f"   R²:  {xgb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del xgb_study
gc.collect()

# ============================================================================
# LightGBM Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST LIGHTGBM TRAINING")
print(f"{'='*50}")

lgb_study = train_fast_model("LightGBM", fast_lightgbm_objective, n_trials=20)

print("🔧 Training final LightGBM...")
# `best_params` only contains values suggested through the trial. The settings
# that fast_lightgbm_objective fixes by hand (seed, silent logging, all cores)
# are re-applied so the final model matches the cross-validated configuration.
best_lgb_params = lgb_study.best_params.copy()
best_lgb_params.update({'random_state': 42, 'verbose': -1, 'n_jobs': -1})

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_lgb = lgb.LGBMRegressor(**best_lgb_params)
final_lgb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
lgb_pred = final_lgb.predict(X_test_processed)
lgb_mse = mean_squared_error(y_test, lgb_pred)
lgb_mae = mean_absolute_error(y_test, lgb_pred)
lgb_r2 = r2_score(y_test, lgb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 LightGBM Results (trained in {train_time:.1f}s):")
print(f"   MSE: {lgb_mse:.4f} {'🎯' if 2.0 <= lgb_mse <= 3.0 else '❌'}")
print(f"   MAE: {lgb_mae:.4f}")
print(f"   R²:  {lgb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del lgb_study
gc.collect()

# ============================================================================
# CatBoost Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST CATBOOST TRAINING")
print(f"{'='*50}")

cb_study = train_fast_model("CatBoost", fast_catboost_objective, n_trials=15)  # Even fewer for CatBoost

print("🔧 Training final CatBoost...")
best_cb_params = cb_study.best_params.copy()

# `best_params` only contains values suggested through the trial. The seed and
# silent logging that fast_catboost_objective fixes by hand are re-applied so
# the final model matches the cross-validated configuration and does not print
# a log line per boosting iteration.
best_cb_params.update({'random_seed': 42, 'verbose': False})

if gpu_status['catboost']:
    best_cb_params['task_type'] = 'GPU'
    print("   Using GPU acceleration")
else:
    best_cb_params.update({'task_type': 'CPU', 'thread_count': -1})
    print("   Using CPU (GPU unavailable)")

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_cb = cb.CatBoostRegressor(**best_cb_params)
final_cb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
cb_pred = final_cb.predict(X_test_processed)
cb_mse = mean_squared_error(y_test, cb_pred)
cb_mae = mean_absolute_error(y_test, cb_pred)
cb_r2 = r2_score(y_test, cb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 CatBoost Results (trained in {train_time:.1f}s):")
print(f"   MSE: {cb_mse:.4f} {'🎯' if 2.0 <= cb_mse <= 3.0 else '❌'}")
print(f"   MAE: {cb_mae:.4f}")
print(f"   R²:  {cb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del cb_study
gc.collect()

# ============================================================================
# Fast Ensemble
# ============================================================================

print(f"\n{'='*50}")
print("🎯 FAST ENSEMBLE")
print(f"{'='*50}")

# Test-split MSE per model, kept for reporting only
models_mse = {'XGBoost': xgb_mse, 'LightGBM': lgb_mse, 'CatBoost': cb_mse}

# Performance-based weighting.
# The weights come from 3-fold cross-validated MSE on the TRAINING split, not
# from test-split MSE: choosing the blend on the same data that is then used to
# score it leaks test information and makes the ensemble metric optimistic.
# Cost: three extra fits per model. `error_score='raise'` surfaces a failed
# fold instead of silently producing NaN weights.
final_models = {'XGBoost': final_xgb, 'LightGBM': final_lgb, 'CatBoost': final_cb}
models_cv_mse = {
    name: -cross_val_score(
        model, X_train_processed, y_train, cv=3,
        scoring='neg_mean_squared_error', n_jobs=1, error_score='raise'
    ).mean()
    for name, model in final_models.items()
}
total_inv_mse = sum(1/mse for mse in models_cv_mse.values())
weights = {name: (1/mse)/total_inv_mse for name, mse in models_cv_mse.items()}

print("📊 Ensemble weights:")
for name, weight in weights.items():
    print(f"   {name}: {weight:.3f}")

# Create ensemble: weighted sum of the three test-split prediction vectors
ensemble_pred = (weights['XGBoost'] * xgb_pred +
                weights['LightGBM'] * lgb_pred +
                weights['CatBoost'] * cb_pred)

ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"\n📊 Ensemble Results:")
print(f"   MSE: {ensemble_mse:.4f} {'🎯' if 2.0 <= ensemble_mse <= 3.0 else '❌'}")
print(f"   MAE: {ensemble_mae:.4f}")
print(f"   R²:  {ensemble_r2:.4f}")

# ============================================================================
# Final Summary
# ============================================================================

# Test-split MSE of every candidate, keyed by display name
all_results = {
    'XGBoost': xgb_mse,
    'LightGBM': lgb_mse,
    'CatBoost': cb_mse,
    'Ensemble': ensemble_mse
}

# Winner = entry with the smallest MSE
best_model, best_mse = min(all_results.items(), key=lambda item: item[1])

print(f"\n{'='*50}", "🏆 FAST TRAINING SUMMARY", f"{'='*50}", sep="\n")

# Leaderboard, best (lowest MSE) first
print("📊 All Results:")
for model in sorted(all_results, key=all_results.get):
    mse = all_results[model]
    if 2.0 <= mse <= 3.0:
        status = "🎯 TARGET!"
    else:
        status = "❌ Miss"
    print(f"   {model:10}: {mse:.4f} {status}")

print(
    f"\n🏆 Best: {best_model} ({best_mse:.4f})",
    f"🎯 Target: {'✅ ACHIEVED' if 2.0 <= best_mse <= 3.0 else '❌ MISSED'}",
    f"⚡ Total trials: 55 (vs 140 in full optimization)",
    f"🖥️  GPU used: {'✅ YES' if any(gpu_status.values()) else '❌ NO'}",
    sep="\n",
)

# Quick save
# Best-effort persistence: a failed save is reported but does not abort the
# cell, because every result is still available in memory.
try:
    fast_results = {
        'models': {'xgb': final_xgb, 'lgb': final_lgb, 'cb': final_cb},
        'predictions': {'xgb': xgb_pred, 'lgb': lgb_pred, 'cb': cb_pred, 'ensemble': ensemble_pred},
        'results': all_results,
        'best_model': best_model,
        'best_mse': best_mse,
        'y_test': y_test
    }
    joblib.dump(fast_results, "/content/fast_training_results.joblib")
    print(f"💾 Results saved to: /content/fast_training_results.joblib")
except Exception as e:
    # `Exception` rather than a bare `except:` so a kernel interrupt still
    # propagates, and the reason for the failure is shown instead of hidden.
    print(f"⚠️  Save failed: {type(e).__name__}: {e}")

print(f"\n🚀 Fast training complete! Ready for evaluation.")

🚀 Starting optimized fast training...
⚡ Optimizations: Reduced trials, GPU-first, memory efficient
✅ XGBoost GPU verified
✅ CatBoost GPU verified

🚀 FAST XGBOOST TRAINING

🔄 Fast optimization: XGBoost (20 trials)
   Trial 5: Best MSE = 31.5478
   Trial 10: Best MSE = 30.3644


In [None]:
# ============================================================================
# OPTIMIZED FAST MODEL TRAINING (Space + Speed + GPU Optimized)
# ============================================================================

import gc
import warnings

# Suppress every Python warning raised from this point on in the session
warnings.filterwarnings('ignore')

print("🚀 Starting optimized fast training...")
print("⚡ Optimizations: Reduced trials, GPU-first, memory efficient")

# ============================================================================
# GPU Detection and Verification
# ============================================================================

def verify_gpu_acceleration():
    """Probe whether XGBoost and CatBoost can train with their GPU settings.

    Each library fits a tiny model (5 boosting rounds on a 20-element slice of
    the training data) with GPU training requested. A probe that raises is
    reported on the console and leaves that library flagged ``False``.

    Returns:
        dict with boolean entries ``'xgboost'`` and ``'catboost'``.
    """
    status = {'xgboost': False, 'catboost': False}

    # --- XGBoost probe ---
    try:
        xgb_probe = xgb.XGBRegressor(tree_method='gpu_hist', gpu_id=0, n_estimators=5)
        xgb_probe.fit(X_train_processed[:20], y_train[:20])
    except Exception as err:
        # Only the first 50 characters of the message are shown
        print(f"⚠️  XGBoost GPU failed: {str(err)[:50]}...")
    else:
        status['xgboost'] = True
        print("✅ XGBoost GPU verified")
        del xgb_probe

    # --- CatBoost probe ---
    try:
        cb_probe = cb.CatBoostRegressor(task_type='GPU', iterations=5, verbose=False)
        cb_probe.fit(X_train_processed[:20], y_train[:20])
    except Exception as err:
        print(f"⚠️  CatBoost GPU failed: {str(err)[:50]}...")
    else:
        status['catboost'] = True
        print("✅ CatBoost GPU verified")
        del cb_probe

    # Collect the discarded probe models before the real training starts
    gc.collect()
    return status

gpu_status = verify_gpu_acceleration()

# ============================================================================
# Optimized Objective Functions (Reduced Search Space)
# ============================================================================

def fast_xgboost_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of an XGBoost regressor.

    Hyper-parameters are drawn from a small, mostly categorical grid. The
    training device follows the module-level ``gpu_status['xgboost']`` flag.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    # Suggestions are made in a fixed order so the seeded sampler stays reproducible
    searched = {
        'n_estimators': trial.suggest_categorical('n_estimators', [600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.85, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8, 0.85, 0.9]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.5, 1.0, 2.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1.0, 2.0, 3.0]),
    }

    # GPU histogram builder when the probe succeeded, multi-threaded CPU otherwise
    if gpu_status['xgboost']:
        device = {'tree_method': 'gpu_hist', 'gpu_id': 0}
    else:
        device = {'tree_method': 'hist', 'n_jobs': -1}

    regressor = xgb.XGBRegressor(**searched, random_state=42, **device)

    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    neg_mse_per_fold = cross_val_score(
        regressor, X_train_processed, y_train,
        cv=3, scoring='neg_mean_squared_error', n_jobs=1,
    )
    return -neg_mse_per_fold.mean()

def fast_lightgbm_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of a LightGBM regressor.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'num_leaves': trial.suggest_categorical('num_leaves', [40, 60, 80]),
        'subsample': trial.suggest_categorical('subsample', [0.8, 0.85, 0.9]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8, 0.85, 0.9]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.5, 1.0, 2.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1.0, 2.0, 3.0]),
        # LightGBM ignores `subsample` unless `subsample_freq` > 0 (default 0),
        # so without this the `subsample` search dimension has no effect.
        # It is registered as a single-value suggestion so that it is recorded
        # in `study.best_params` and carried into any model refit from them.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 1),
        'random_state': 42,
        'verbose': -1,
        'n_jobs': -1
    }

    model = lgb.LGBMRegressor(**params)
    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    cv_scores = cross_val_score(model, X_train_processed, y_train, cv=3,
                               scoring='neg_mean_squared_error', n_jobs=1)
    return -cv_scores.mean()

def fast_catboost_objective(trial):
    """Optuna objective: mean 3-fold CV MSE of a CatBoost regressor.

    The training device follows the module-level ``gpu_status['catboost']``
    flag.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.

    Returns:
        Mean cross-validated MSE on the training data (lower is better).
    """
    searched = {
        'iterations': trial.suggest_categorical('iterations', [600, 800, 1000]),
        'depth': trial.suggest_int('depth', 5, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.05, 0.07]),
        'l2_leaf_reg': trial.suggest_categorical('l2_leaf_reg', [2, 3, 5]),
    }

    # GPU when the probe succeeded, otherwise CPU with thread_count=-1
    if gpu_status['catboost']:
        device = {'task_type': 'GPU'}
    else:
        device = {'task_type': 'CPU', 'thread_count': -1}

    regressor = cb.CatBoostRegressor(**searched, random_seed=42, verbose=False, **device)

    # 3 folds, run sequentially (n_jobs=1); scores come back as negative MSE
    neg_mse_per_fold = cross_val_score(
        regressor, X_train_processed, y_train,
        cv=3, scoring='neg_mean_squared_error', n_jobs=1,
    )
    return -neg_mse_per_fold.mean()

# ============================================================================
# Fast Training Function (Reduced Trials)
# ============================================================================

def train_fast_model(model_name, objective_func, n_trials=20):
    """Run a short Optuna minimisation study and report progress.

    Args:
        model_name: Label used in console messages.
        objective_func: Callable that receives an Optuna trial and returns the
            value to minimise (printed here as an MSE).
        n_trials: Number of trials handed to ``study.optimize``.

    Returns:
        The Optuna study after ``optimize`` has finished.

    Raises:
        ValueError: From ``study.best_value`` in the closing summary when no
            trial completed successfully.
    """
    print(f"\n🔄 Fast optimization: {model_name} ({n_trials} trials)")

    # Seeded TPE sampler so repeated runs propose the same trial sequence
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))

    def fast_callback(study, trial):
        """Print the running best value every 5th trial (skipping trial 0)."""
        if trial.number == 0 or trial.number % 5 != 0:
            return
        # `study.best_value` raises ValueError while no trial has completed
        # (e.g. every trial so far returned NaN and was marked failed); a
        # progress message must not abort the study in that case.
        try:
            best_so_far = study.best_value
        except ValueError:
            print(f"   Trial {trial.number}: no completed trial yet")
            return
        print(f"   Trial {trial.number}: Best MSE = {best_so_far:.4f}")

    start_time = time.time()
    study.optimize(objective_func, n_trials=n_trials, callbacks=[fast_callback])
    elapsed = time.time() - start_time

    print(f"✅ {model_name} optimized in {elapsed:.1f}s (Best: {study.best_value:.4f})")
    return study

# ============================================================================
# XGBoost Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST XGBOOST TRAINING")
print(f"{'='*50}")

xgb_study = train_fast_model("XGBoost", fast_xgboost_objective, n_trials=20)

print("🔧 Training final XGBoost...")
best_xgb_params = xgb_study.best_params.copy()

# `best_params` only contains values suggested through the trial. The seed
# that fast_xgboost_objective fixes by hand is re-applied here so the final
# model is fit with the same configuration that was cross-validated.
best_xgb_params['random_state'] = 42

if gpu_status['xgboost']:
    best_xgb_params.update({'tree_method': 'gpu_hist', 'gpu_id': 0})
    print("   Using GPU acceleration")
else:
    best_xgb_params.update({'tree_method': 'hist', 'n_jobs': -1})
    print("   Using CPU (GPU unavailable)")

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_xgb = xgb.XGBRegressor(**best_xgb_params)
final_xgb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
xgb_pred = final_xgb.predict(X_test_processed)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 XGBoost Results (trained in {train_time:.1f}s):")
print(f"   MSE: {xgb_mse:.4f} {'🎯' if 2.0 <= xgb_mse <= 3.0 else '❌'}")
print(f"   MAE: {xgb_mae:.4f}")
print(f"   R²:  {xgb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del xgb_study
gc.collect()

# ============================================================================
# LightGBM Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST LIGHTGBM TRAINING")
print(f"{'='*50}")

lgb_study = train_fast_model("LightGBM", fast_lightgbm_objective, n_trials=20)

print("🔧 Training final LightGBM...")
# `best_params` only contains values suggested through the trial. The settings
# that fast_lightgbm_objective fixes by hand (seed, silent logging, all cores)
# are re-applied so the final model matches the cross-validated configuration.
best_lgb_params = lgb_study.best_params.copy()
best_lgb_params.update({'random_state': 42, 'verbose': -1, 'n_jobs': -1})

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_lgb = lgb.LGBMRegressor(**best_lgb_params)
final_lgb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
lgb_pred = final_lgb.predict(X_test_processed)
lgb_mse = mean_squared_error(y_test, lgb_pred)
lgb_mae = mean_absolute_error(y_test, lgb_pred)
lgb_r2 = r2_score(y_test, lgb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 LightGBM Results (trained in {train_time:.1f}s):")
print(f"   MSE: {lgb_mse:.4f} {'🎯' if 2.0 <= lgb_mse <= 3.0 else '❌'}")
print(f"   MAE: {lgb_mae:.4f}")
print(f"   R²:  {lgb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del lgb_study
gc.collect()

# ============================================================================
# CatBoost Fast Training
# ============================================================================

print(f"\n{'='*50}")
print("🚀 FAST CATBOOST TRAINING")
print(f"{'='*50}")

cb_study = train_fast_model("CatBoost", fast_catboost_objective, n_trials=15)  # Even fewer for CatBoost

print("🔧 Training final CatBoost...")
best_cb_params = cb_study.best_params.copy()

# `best_params` only contains values suggested through the trial. The seed and
# silent logging that fast_catboost_objective fixes by hand are re-applied so
# the final model matches the cross-validated configuration and does not print
# a log line per boosting iteration.
best_cb_params.update({'random_seed': 42, 'verbose': False})

if gpu_status['catboost']:
    best_cb_params['task_type'] = 'GPU'
    print("   Using GPU acceleration")
else:
    best_cb_params.update({'task_type': 'CPU', 'thread_count': -1})
    print("   Using CPU (GPU unavailable)")

# Time only the final fit, not the hyper-parameter search
start_time = time.time()
final_cb = cb.CatBoostRegressor(**best_cb_params)
final_cb.fit(X_train_processed, y_train)
train_time = time.time() - start_time

# Test-split metrics for the refit model
cb_pred = final_cb.predict(X_test_processed)
cb_mse = mean_squared_error(y_test, cb_pred)
cb_mae = mean_absolute_error(y_test, cb_pred)
cb_r2 = r2_score(y_test, cb_pred)

# 🎯 marks an MSE inside the 2.0-3.0 target band, ❌ anything else
print(f"📊 CatBoost Results (trained in {train_time:.1f}s):")
print(f"   MSE: {cb_mse:.4f} {'🎯' if 2.0 <= cb_mse <= 3.0 else '❌'}")
print(f"   MAE: {cb_mae:.4f}")
print(f"   R²:  {cb_r2:.4f}")

# Memory cleanup: the study is no longer needed once the final model exists
del cb_study
gc.collect()

# ============================================================================
# Fast Ensemble
# ============================================================================

print(f"\n{'='*50}")
print("🎯 FAST ENSEMBLE")
print(f"{'='*50}")

# Test-split MSE per model, kept for reporting only
models_mse = {'XGBoost': xgb_mse, 'LightGBM': lgb_mse, 'CatBoost': cb_mse}

# Performance-based weighting.
# The weights come from 3-fold cross-validated MSE on the TRAINING split, not
# from test-split MSE: choosing the blend on the same data that is then used to
# score it leaks test information and makes the ensemble metric optimistic.
# Cost: three extra fits per model. `error_score='raise'` surfaces a failed
# fold instead of silently producing NaN weights.
final_models = {'XGBoost': final_xgb, 'LightGBM': final_lgb, 'CatBoost': final_cb}
models_cv_mse = {
    name: -cross_val_score(
        model, X_train_processed, y_train, cv=3,
        scoring='neg_mean_squared_error', n_jobs=1, error_score='raise'
    ).mean()
    for name, model in final_models.items()
}
total_inv_mse = sum(1/mse for mse in models_cv_mse.values())
weights = {name: (1/mse)/total_inv_mse for name, mse in models_cv_mse.items()}

print("📊 Ensemble weights:")
for name, weight in weights.items():
    print(f"   {name}: {weight:.3f}")

# Create ensemble: weighted sum of the three test-split prediction vectors
ensemble_pred = (weights['XGBoost'] * xgb_pred +
                weights['LightGBM'] * lgb_pred +
                weights['CatBoost'] * cb_pred)

ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"\n📊 Ensemble Results:")
print(f"   MSE: {ensemble_mse:.4f} {'🎯' if 2.0 <= ensemble_mse <= 3.0 else '❌'}")
print(f"   MAE: {ensemble_mae:.4f}")
print(f"   R²:  {ensemble_r2:.4f}")

# ============================================================================
# Final Summary
# ============================================================================

# Test-split MSE of every candidate, keyed by display name
all_results = {
    'XGBoost': xgb_mse,
    'LightGBM': lgb_mse,
    'CatBoost': cb_mse,
    'Ensemble': ensemble_mse
}

# Winner = entry with the smallest MSE
best_model, best_mse = min(all_results.items(), key=lambda item: item[1])

print(f"\n{'='*50}", "🏆 FAST TRAINING SUMMARY", f"{'='*50}", sep="\n")

# Leaderboard, best (lowest MSE) first
print("📊 All Results:")
for model in sorted(all_results, key=all_results.get):
    mse = all_results[model]
    if 2.0 <= mse <= 3.0:
        status = "🎯 TARGET!"
    else:
        status = "❌ Miss"
    print(f"   {model:10}: {mse:.4f} {status}")

print(
    f"\n🏆 Best: {best_model} ({best_mse:.4f})",
    f"🎯 Target: {'✅ ACHIEVED' if 2.0 <= best_mse <= 3.0 else '❌ MISSED'}",
    f"⚡ Total trials: 55 (vs 140 in full optimization)",
    f"🖥️  GPU used: {'✅ YES' if any(gpu_status.values()) else '❌ NO'}",
    sep="\n",
)

# Quick save
# Best-effort persistence: a failed save is reported but does not abort the
# cell, because every result is still available in memory.
try:
    fast_results = {
        'models': {'xgb': final_xgb, 'lgb': final_lgb, 'cb': final_cb},
        'predictions': {'xgb': xgb_pred, 'lgb': lgb_pred, 'cb': cb_pred, 'ensemble': ensemble_pred},
        'results': all_results,
        'best_model': best_model,
        'best_mse': best_mse,
        'y_test': y_test
    }
    joblib.dump(fast_results, "/content/fast_training_results.joblib")
    print(f"💾 Results saved to: /content/fast_training_results.joblib")
except Exception as e:
    # `Exception` rather than a bare `except:` so a kernel interrupt still
    # propagates, and the reason for the failure is shown instead of hidden.
    print(f"⚠️  Save failed: {type(e).__name__}: {e}")

print(f"\n🚀 Fast training complete! Ready for evaluation.")

🚀 Starting optimized fast training...
⚡ Optimizations: Reduced trials, GPU-first, memory efficient
✅ XGBoost GPU verified
✅ CatBoost GPU verified

🚀 FAST XGBOOST TRAINING

🔄 Fast optimization: XGBoost (20 trials)
   Trial 5: Best MSE = 31.5478
   Trial 10: Best MSE = 30.3644
   Trial 15: Best MSE = 30.0468
✅ XGBoost optimized in 594.8s (Best: 30.0468)
🔧 Training final XGBoost...
   Using GPU acceleration
📊 XGBoost Results (trained in 11.3s):
   MSE: 31.6465 ❌
   MAE: 4.1513
   R²:  0.8495

🚀 FAST LIGHTGBM TRAINING

🔄 Fast optimization: LightGBM (20 trials)
   Trial 5: Best MSE = 29.0465
   Trial 10: Best MSE = 29.0465
   Trial 15: Best MSE = 29.0465
✅ LightGBM optimized in 209.3s (Best: 29.0465)
🔧 Training final LightGBM...
📊 LightGBM Results (trained in 7.9s):
   MSE: 30.8542 ❌
   MAE: 4.3107
   R²:  0.8532

🚀 FAST CATBOOST TRAINING

🔄 Fast optimization: CatBoost (15 trials)


In [None]:
# Advanced Ensemble Creation
# ============================================================================

print(f"\n{'='*60}")
print("🎯 CREATING ADVANCED ENSEMBLE MODEL")
print(f"{'='*60}")

print("🔧 Building weighted ensemble...")

# Store all model results ('mse' is the test-split MSE of each model)
model_results = {
    'XGBoost': {'model': final_xgb, 'mse': xgb_mse, 'predictions': xgb_pred},
    'LightGBM': {'model': final_lgb, 'mse': lgb_mse, 'predictions': lgb_pred},
    'CatBoost': {'model': final_cb, 'mse': cb_mse, 'predictions': cb_pred}
}

# The blend weights must not be chosen from test-split MSE: the ensemble is
# scored on that same split, so doing so leaks test information and makes the
# reported ensemble metric optimistic. Each model is instead re-scored with
# 3-fold cross-validation on the TRAINING split (three extra fits per model).
# `error_score='raise'` surfaces a failed fold instead of yielding NaN weights.
for name, result in model_results.items():
    result['cv_mse'] = -cross_val_score(
        result['model'], X_train_processed, y_train, cv=3,
        scoring='neg_mean_squared_error', n_jobs=1, error_score='raise'
    ).mean()

# Calculate inverse CV-MSE weights (better models get higher weight)
total_inverse_mse = sum(1/result['cv_mse'] for result in model_results.values())
model_weights = {
    name: (1/result['cv_mse']) / total_inverse_mse
    for name, result in model_results.items()
}

print("📊 Model weights based on performance:")
for name, weight in model_weights.items():
    print(f"   {name}: {weight:.3f}")

# Create ensemble prediction: weighted sum accumulated into a float array
ensemble_pred = np.zeros_like(y_test, dtype=float)
for name, result in model_results.items():
    ensemble_pred += model_weights[name] * result['predictions']

# Evaluate ensemble on the test split
ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"\n📊 Ensemble Results:")
print(f"   Test MSE: {ensemble_mse:.4f}")
print(f"   Test MAE: {ensemble_mae:.4f}")
print(f"   Test R²:  {ensemble_r2:.4f}")

In [None]:
# Final Results and Target Achievement
# ============================================================================

print(f"\n{'='*60}", "🏆 FINAL MODEL COMPARISON AND TARGET ASSESSMENT", f"{'='*60}", sep="\n")

# Test-split MSE of every candidate, keyed by display name
all_results = {
    'XGBoost': xgb_mse,
    'LightGBM': lgb_mse,
    'CatBoost': cb_mse,
    'Ensemble': ensemble_mse
}

# Leaderboard, best (lowest MSE) first
print("📊 Final MSE Comparison:")
for model_name in sorted(all_results, key=all_results.get):
    mse = all_results[model_name]
    if 2.0 <= mse <= 3.0:
        target_status = "🎯 TARGET ACHIEVED!"
    else:
        target_status = "❌ Outside target"
    print(f"   {model_name:12}: MSE = {mse:.4f} {target_status}")

# Winner = entry with the smallest MSE
best_model_name, best_mse = min(all_results.items(), key=lambda item: item[1])

print(f"\n🏆 BEST MODEL: {best_model_name}", f"🎯 BEST MSE: {best_mse:.4f}", sep="\n")

# The target is an MSE inside the closed band [2.0, 3.0]
target_achieved = 2.0 <= best_mse <= 3.0
print(f"🎯 TARGET RANGE (2.0-3.0): {'✅ ACHIEVED!' if target_achieved else '❌ Not achieved'}")

if target_achieved:
    print(
        f"\n🎉 SUCCESS! Achieved target MSE of {best_mse:.4f}",
        f"🏆 This performance is competitive for age prediction!",
        sep="\n",
    )
else:
    print(
        f"\n💡 Further optimization suggestions:",
        f"   • Try different feature engineering approaches",
        f"   • Experiment with neural networks",
        f"   • Use more sophisticated ensemble methods",
        f"   • Consider external validation datasets",
        sep="\n",
    )

In [None]:
# Save Best Models
# ============================================================================

print(f"\n💾 Saving trained models...")

# Create models package: everything needed to reload and re-evaluate the run
models_package = {
    'best_model_name': best_model_name,
    'best_mse': best_mse,
    'target_achieved': target_achieved,
    'models': {
        'xgboost': final_xgb,
        'lightgbm': final_lgb,
        'catboost': final_cb
    },
    'model_results': model_results,
    'ensemble_weights': model_weights,
    'predictions': {
        'xgboost': xgb_pred,
        'lightgbm': lgb_pred,
        'catboost': cb_pred,
        'ensemble': ensemble_pred,
        'y_test': y_test
    }
}

# Save models: project directory first, fixed backup path if that fails
try:
    models_save_path = os.path.join(PROJECT_DIR, "trained_models.joblib")
    joblib.dump(models_package, models_save_path)
    print(f"✅ Models saved to: {models_save_path}")
except Exception as e:
    # `Exception` rather than a bare `except:` so a kernel interrupt aborts
    # the cell instead of triggering a second dump, and the reason for falling
    # back is shown. An undefined PROJECT_DIR (NameError) still lands here.
    print(f"⚠️  Primary save failed ({type(e).__name__}: {e}); using backup location")
    backup_models_path = "/content/trained_models.joblib"
    joblib.dump(models_package, backup_models_path)
    print(f"✅ Models saved to backup: {backup_models_path}")

print(f"\n🚀 Advanced model training complete!")
print(f"🎯 Ready for evaluation on external dataset!")

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline Random Forest: 100 trees, fixed seed, fitted on the current split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf:.2f}")

# Actual-vs-predicted figure via plot_results (defined elsewhere in the notebook).
plot_results(y_test, y_pred_rf, "Random Forest - Age Prediction")

# Persist the fitted estimator first, then the per-sample predictions.
rf_model_path = os.path.join(PROJECT_DIR, "RandomForest_model.joblib")
joblib.dump(rf_model, rf_model_path)

rf_predictions = pd.DataFrame({"Actual_Age": y_test.values, "Predicted_Age": y_pred_rf})
rf_predictions_path = os.path.join(PROJECT_DIR, "RandomForest_predictions.csv")
rf_predictions.to_csv(rf_predictions_path, index=False)


In [None]:
# Random Forest Regressor 2.1
# Grid-searched Random Forest trained on log1p(age); predictions are mapped
# back with expm1 before the test MSE is computed, so the metric is in the
# original age units.

# Set your output directory
output_dir = "/content/drive/My Drive/Project/Aging Biomarkers"

# Apply age transformation
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)  # not read again in this cell

# Hyperparameter tuning: 2 * 3 * 2 * 2 = 24 candidates, 5 folds each.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Note: the CV score is the (negated) MSE of the log-transformed target,
# because the search is fitted on y_train_log.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train_log)
best_rf = grid_search.best_estimator_

# Predict & evaluate (back-transform from log space first)
y_pred_rf_log = best_rf.predict(X_test)
y_pred_rf = np.expm1(y_pred_rf_log)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"RandomForest (Tuned) Test MSE: {mse_rf:.4f}")

# Plot: Actual vs Predicted (dashed red line is the identity y = x)
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred_rf, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.title("RandomForest (Tuned): Actual vs Predicted")
plt.grid(True)
plt.show()

# Feature Importance Plot
# Features are labelled by column position because X_train is indexed
# positionally here.
importances = pd.Series(best_rf.feature_importances_, index=range(X_train.shape[1]))
top_features = importances.sort_values(ascending=False).head(20)
importance_ax = top_features.plot(kind="barh", figsize=(6, 8))
# barh draws the first row at the bottom; flip the axis so the most important
# feature is on top, matching the "Random Forest Regressor 2.2" plot.
importance_ax.invert_yaxis()
plt.title("Top 20 Important Features")
plt.xlabel("Importance Score")
plt.show()

# Save model and predictions
joblib.dump(best_rf, os.path.join(output_dir, "RandomForest_model_tuned.joblib"))

pred_df = pd.DataFrame({
    "Actual_Age": y_test,
    "Predicted_Age": y_pred_rf
})
pred_df.to_csv(os.path.join(output_dir, "RandomForest_test_predictions_tuned.csv"), index=False)

print("✅ Tuned model and predictions saved to:", output_dir)

In [None]:
# Random Forest Regressor 2.2
# Rebuilds the feature matrix from the stable_lasso CpGs, re-splits the data
# and fits a Random Forest on log1p(age). This cell rebinds X_train, X_test,
# y_train and y_test, so every later cell sees this split.

# --- 1. Samples x CpGs frame with the age column attached (aligned on index) ---
df = data.dnam.loc[stable_lasso].T.assign(age=data.metadata["age"])

X_feat = df.loc[:, stable_lasso].to_numpy()
y_feat = df["age"].to_numpy()

# --- 2. Model the target in log space ---
y_feat_log = np.log1p(y_feat)

# --- 3. 80/20 split with a fixed seed ---
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X_feat,
    y_feat_log,
    test_size=0.2,
    random_state=42,
)

# Ages in original units, recovered from the log-space split, for evaluation.
y_train, y_test = np.expm1(y_train_log), np.expm1(y_test_log)

print(f"✅ Train shape: X={X_train.shape}, y={y_train.shape}")
print(f"✅ Test  shape: X={X_test.shape}, y={y_test.shape}")

# --- 4. Fit on the log target ---
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_log)

# --- 5. Predict in log space, then map back to years before scoring ---
y_pred_log = rf.predict(X_test)
y_pred = np.expm1(y_pred_log)

mse_rf = mean_squared_error(y_test, y_pred)
print(f"📉 RandomForest Test MSE: {mse_rf:.4f}")

# --- 6. Actual vs predicted, with the identity line as reference ---
identity_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot(identity_span, identity_span, "r--")
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.title("RandomForest: Actual vs Predicted")
plt.grid(True)
plt.show()

# --- 7. Persist the estimator and its test predictions ---
output_dir = "/content/drive/My Drive/Project/Aging Biomarkers"

rf_model_file = os.path.join(output_dir, "RandomForest_model.joblib")
joblib.dump(rf, rf_model_file)

rf_test_predictions = pd.DataFrame({
    "Actual_Age": y_test,
    "Predicted_Age": y_pred
})
rf_test_predictions.to_csv(
    os.path.join(output_dir, "RandomForest_test_predictions.csv"),
    index=False,
)

print("✅ Model and predictions saved to:", output_dir)

# --- 8. Top-20 CpGs by impurity importance, most important on top ---
importances = pd.Series(rf.feature_importances_, index=stable_lasso)
top_features = importances.sort_values(ascending=False).head(20)

plt.figure(figsize=(6, 8))
top_features.plot.barh()
plt.title("Top 20 Important CpG Features (Random Forest)")
plt.xlabel("Importance Score")
plt.gca().invert_yaxis()
plt.grid(True)
plt.show()

## Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline Gradient Boosting regressor on the current train/test split.

# Train
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
print(f"Gradient Boosting MSE: {mse_gb:.2f}")

# Plot (plot_results is defined elsewhere in the notebook)
plot_results(y_test, y_pred_gb, "Gradient Boosting - Age Prediction")

# Save model & predictions
joblib.dump(gb_model, os.path.join(PROJECT_DIR, "GradientBoosting_model.joblib"))
# The "Random Forest Regressor 2.2" cell rebinds y_test to a NumPy array, so
# `y_test.values` would raise AttributeError when the notebook runs top to
# bottom. np.asarray accepts both a Series and an array.
pd.DataFrame({"Actual_Age": np.asarray(y_test), "Predicted_Age": y_pred_gb}).to_csv(
    os.path.join(PROJECT_DIR, "GradientBoosting_predictions.csv"), index=False)

## Support Vector Regression (SVR)

In [None]:
# Where the fitted SVR and its predictions are written.
output_dir = "/content/drive/My Drive/Project/Aging Biomarkers"

# RBF-kernel SVR with fixed C and epsilon (untuned).
svr = SVR(kernel='rbf', C=10, epsilon=0.1).fit(X_train, y_train)

# Held-out performance.
y_pred_svr = svr.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"SVR Test MSE: {mse_svr:.4f}")

# Scatter of actual vs predicted; the dashed red line is y = x.
svr_age_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred_svr, alpha=0.6)
plt.plot(svr_age_span, svr_age_span, "r--")
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.title("SVR: Actual vs Predicted")
plt.grid(True)
plt.show()

# Write the estimator, then the prediction table.
svr_model_file = os.path.join(output_dir, "SVR_model.joblib")
joblib.dump(svr, svr_model_file)

svr_predictions = pd.DataFrame({
    "Actual_Age": y_test,
    "Predicted_Age": y_pred_svr
})
svr_predictions.to_csv(os.path.join(output_dir, "SVR_test_predictions.csv"), index=False)

print("✅ SVR model and predictions saved.")

## RidgeCV

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import joblib
import os
import pandas as pd

# Where the fitted Ridge model and its predictions are written.
output_dir = "/content/drive/My Drive/Project/Aging Biomarkers"

# RidgeCV picks the regularisation strength from this grid by 5-fold CV.
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
ridge = RidgeCV(alphas=alphas, cv=5).fit(X_train, y_train)

# Held-out performance.
y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Ridge Regression Test MSE: {mse_ridge:.4f}")

# Scatter of actual vs predicted; the dashed red line is y = x.
ridge_age_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred_ridge, alpha=0.6)
plt.plot(ridge_age_span, ridge_age_span, "r--")
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.title("Ridge Regression: Actual vs Predicted")
plt.grid(True)
plt.show()

# Write the estimator, then the prediction table.
ridge_model_file = os.path.join(output_dir, "Ridge_model.joblib")
joblib.dump(ridge, ridge_model_file)

ridge_predictions = pd.DataFrame({
    "Actual_Age": y_test,
    "Predicted_Age": y_pred_ridge
})
ridge_predictions.to_csv(
    os.path.join(output_dir, "Ridge_test_predictions.csv"),
    index=False,
)

print("✅ Ridge model and predictions saved to:", output_dir)


In [None]:
import joblib
import os

# Rank the five candidate models by test-set MSE and save the top two.

# Test-set MSE per model, as computed in the cells above
# (mse_enet is defined elsewhere in the notebook).
mse_dict = {
    "ElasticNet":    mse_enet,
    "RandomForest":  mse_rf,
    "GradientBoosting": mse_gb,
    "SVR":           mse_svr,
    "Ridge":         mse_ridge
}

# Sort ascending by MSE and pick best + second best.
# Index directly instead of unpacking into `_`, which IPython reserves for
# the last cell output.
sorted_models = sorted(mse_dict.items(), key=lambda x: x[1])
best_name     = sorted_models[0][0]
second_name   = sorted_models[1][0]

# Model name → fitted object mapping. Each entry must be the estimator that
# produced the matching MSE above.
# - "GradientBoosting" uses gb_model: it is the estimator mse_gb was computed
#   from in the Gradient Boosting cell (the previous `gb` is not that object).
# - NOTE(review): `rf` is fitted on log1p(age) in "Random Forest Regressor
#   2.2", so its raw predict() output is in log space while mse_rf was scored
#   after np.expm1. If RandomForest ranks in the top two, downstream
#   best_mod.predict() calls need the same back-transform — confirm.
model_map = {
    "ElasticNet":    enet,
    "RandomForest":  rf,
    "GradientBoosting": gb_model,
    "SVR":           svr,
    "Ridge":         ridge
}

best_mod   = model_map[best_name]
second_mod = model_map[second_name]

# Report
print(f"🏆 Best Model:   {best_name} (MSE={mse_dict[best_name]:.4f})")
print(f"🥈 Second Best: {second_name} (MSE={mse_dict[second_name]:.4f})")

# Save both to disk (this overwrites any earlier "<name>_model.joblib" in output_dir)
joblib.dump(best_mod,   os.path.join(output_dir, f"{best_name}_model.joblib"))
joblib.dump(second_mod, os.path.join(output_dir, f"{second_name}_model.joblib"))
print("✅ Top two models saved to:", output_dir)


In [None]:
import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd

# Round-trip check: reload the winning model from disk and score it again.
best_model_path = os.path.join(output_dir, f"{best_name}_model.joblib")
best_model = joblib.load(best_model_path)

# Predictions for the held-out split.
y_test_pred = best_model.predict(X_test)

# Test MSE of the reloaded model.
mse_best = mean_squared_error(y_test, y_test_pred)
print(f"✅ {best_name} Test MSE: {mse_best:.4f}")

# Scatter of actual vs predicted; the dashed red line is y = x.
reload_age_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_test_pred, alpha=0.6)
plt.plot(reload_age_span, reload_age_span, "r--", lw=2)
plt.xlabel("Actual Age")
plt.ylabel("Predicted Age")
plt.title(f"{best_name}: Actual vs Predicted")
plt.grid(True)
plt.show()

# Write the prediction table next to the saved model.
pred_df = pd.DataFrame({
    "Actual_Age": y_test,
    "Predicted_Age": y_test_pred
})
best_predictions_path = os.path.join(output_dir, f"{best_name}_test_predictions.csv")
pred_df.to_csv(best_predictions_path, index=False)
print(f"✅ Test predictions saved to: {output_dir}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Larger final figure plus MSE/MAE for the reloaded best model.
y_test_pred = best_model.predict(X_test)

# Identity-line endpoints use the builtin min/max over y_test.
final_age_lo, final_age_hi = min(y_test), max(y_test)

plt.figure(figsize=(10, 8))
plt.scatter(y_test, y_test_pred, alpha=0.7, edgecolors='w', linewidth=0.5)
plt.plot([final_age_lo, final_age_hi], [final_age_lo, final_age_hi], 'k--', lw=3)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title(f'{best_name}: Actual vs. Predicted Age (Test Set)')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

# Squared-error and absolute-error summaries on the test split.
mse_test_final = mean_squared_error(y_test, y_test_pred)
mae_test_final = mean_absolute_error(y_test, y_test_pred)

print(f"✅ Final Test MSE: {mse_test_final:.4f}")
print(f"✅ Final Test MAE: {mae_test_final:.4f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Predict ages for the external evaluation set with the selected best model.

# Align eval features with the training features in one step: reindex selects
# the stable_lasso CpGs in training order and creates any CpG absent from the
# evaluation data as a column of 0.0. This replaces inserting the missing
# columns one at a time, which fragments the frame.
X_eval = (
    evaluation_data.dnam.T
    .reindex(columns=stable_lasso, fill_value=0.0)
    .values
)

# Predict using whichever model the selection cell stored in best_mod
y_eval_pred = best_mod.predict(X_eval)

# Create DataFrame of predictions, indexed by evaluation sample id
eval_pred_df = pd.DataFrame({
    "Predicted_Age": y_eval_pred
}, index=evaluation_data.dnam.columns)

# Preview predictions
print("✅ Eval predictions preview:")
display(eval_pred_df.head())

# Plot histogram of predicted ages
plt.figure(figsize=(7, 5))
plt.hist(y_eval_pred, bins=30, edgecolor="k")
plt.title("Evaluation Set: Predicted Age Distribution")
plt.xlabel("Predicted Age")
plt.ylabel("Count")
plt.grid(True)
plt.show()

# Save predictions to CSV
eval_output_path = os.path.join(output_dir, "GSE157131_predicted_ages.csv")
eval_pred_df.to_csv(eval_output_path)

print(f"✅ Evaluation predictions saved to: {eval_output_path}")

In [None]:
## RidgeCV
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Score the selected best model (best_mod) on the external evaluation set.

# Ensure consistency of feature columns: keep the stable_lasso CpGs in
# training order and add any CpG missing from the evaluation data as zeros.
eval_features = evaluation_data.dnam.T
missing_cpg_sites = list(set(stable_lasso) - set(eval_features.columns))
pruned_data = eval_features.reindex(columns=stable_lasso, fill_value=0.0)

# Predict using best trained model
challenge_results = best_mod.predict(pruned_data)

# Combine predictions with actual ages. The age Series is aligned to the
# sample ids by label here; the predictions are positional (one per row of
# pruned_data, i.e. in evaluation_data.dnam.columns order).
predicted_age_df = pd.DataFrame({
    'predictedAge': challenge_results,
    'ActualAge': evaluation_data.metadata.age
}, index=evaluation_data.dnam.columns)
predicted_age_df.index.name = 'sampleId'

# Use the label-aligned ages for the plot and metrics, so each prediction is
# compared with the age of the same sample even if the metadata rows are
# ordered differently from the methylation columns.
y_pred = challenge_results
y_true = predicted_age_df['ActualAge']

# Plot actual vs predicted (dashed black line is y = x)
plt.figure(figsize=(10, 8))
plt.scatter(y_true, y_pred, alpha=0.7, edgecolors='w', linewidth=0.5)
plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'k--', lw=3)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('Evaluation Set: Actual vs. Predicted Age')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

# Metrics
mse_eval = np.mean((y_true - y_pred) ** 2)
mae_eval = np.mean(np.abs(y_true - y_pred))
print(f"📊 Evaluation MSE: {mse_eval:.4f}")
print(f"📊 Evaluation MAE: {mae_eval:.4f}")

# Save predictions
eval_output_path = os.path.join(output_dir, "EvaluationSet_predictions.csv")
predicted_age_df.to_csv(eval_output_path)
print(f"✅ Evaluation predictions saved to: {eval_output_path}")

In [None]:
## ElasticNet
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Score the ElasticNet model (enet, fitted elsewhere in the notebook) on the
# external evaluation set.

# Ensure consistency of feature columns: keep the stable_lasso CpGs in
# training order and add any CpG missing from the evaluation data as zeros.
eval_features = evaluation_data.dnam.T
missing_cpg_sites = list(set(stable_lasso) - set(eval_features.columns))
pruned_data = eval_features.reindex(columns=stable_lasso, fill_value=0.0)

# Predict using ElasticNet
challenge_results_enet = enet.predict(pruned_data)

# Combine predictions with actual ages. The age Series is aligned to the
# sample ids by label here; the predictions are positional (one per row of
# pruned_data, i.e. in evaluation_data.dnam.columns order).
predicted_age_df_enet = pd.DataFrame({
    'predictedAge': challenge_results_enet,
    'ActualAge': evaluation_data.metadata.age
}, index=evaluation_data.dnam.columns)
predicted_age_df_enet.index.name = 'sampleId'

# Use the label-aligned ages for the plot and metrics, so each prediction is
# compared with the age of the same sample even if the metadata rows are
# ordered differently from the methylation columns.
y_pred_enet = challenge_results_enet
y_true_enet = predicted_age_df_enet['ActualAge']

# Plot actual vs predicted (dashed black line is y = x)
plt.figure(figsize=(10, 8))
plt.scatter(y_true_enet, y_pred_enet, alpha=0.7, edgecolors='w', linewidth=0.5)
plt.plot([min(y_true_enet), max(y_true_enet)], [min(y_true_enet), max(y_true_enet)], 'k--', lw=3)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('ElasticNet: Evaluation Set - Actual vs. Predicted Age')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

# Metrics
mse_eval_enet = np.mean((y_true_enet - y_pred_enet) ** 2)
mae_eval_enet = np.mean(np.abs(y_true_enet - y_pred_enet))
print(f"📊 Evaluation MSE (ElasticNet): {mse_eval_enet:.4f}")
print(f"📊 Evaluation MAE (ElasticNet): {mae_eval_enet:.4f}")

# Save predictions
eval_output_path_enet = os.path.join(output_dir, "ElasticNet_EvaluationSet_predictions.csv")
predicted_age_df_enet.to_csv(eval_output_path_enet)
print(f"✅ Evaluation predictions saved to: {eval_output_path_enet}")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Score the (untuned) Gradient Boosting model on the external evaluation set.

# Step 1: Align evaluation data with training features. reindex keeps the
# stable_lasso CpGs in training order and creates any missing CpG as 0.0,
# without relying on .loc enlargement with a list of new labels.
eval_features = evaluation_data.dnam.T
missing_cpg_sites = list(set(stable_lasso) - set(eval_features.columns))
pruned_data = eval_features.reindex(columns=stable_lasso, fill_value=0.0)

# Step 2: Predict with Gradient Boosting Regressor
# NOTE(review): `gb` is not assigned in the nearby cells (the baseline
# Gradient Boosting cell names its estimator `gb_model`) — confirm which
# fitted object is intended here.
y_eval_pred = gb.predict(pruned_data)

# Step 3: Create DataFrame with predictions and actual ages
# (ages are aligned to the sample ids by label)
predicted_age_df = pd.DataFrame({
    'predictedAge': y_eval_pred,
    'ActualAge': evaluation_data.metadata.age
}, index=evaluation_data.dnam.columns)
predicted_age_df.index.name = 'sampleId'

# Step 4: Plot actual vs predicted (dashed black line is y = x)
plt.figure(figsize=(10, 8))
plt.scatter(predicted_age_df["ActualAge"], predicted_age_df["predictedAge"], alpha=0.7, edgecolors='w', linewidth=0.5)
plt.plot([min(predicted_age_df["ActualAge"]), max(predicted_age_df["ActualAge"])],
         [min(predicted_age_df["ActualAge"]), max(predicted_age_df["ActualAge"])],
         'k--', lw=3)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('Gradient Boosting - Evaluation Set: Actual vs. Predicted Age')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.show()

# Step 5: Compute and print metrics
mse = np.mean((predicted_age_df["ActualAge"] - predicted_age_df["predictedAge"]) ** 2)
mae = np.mean(np.abs(predicted_age_df["ActualAge"] - predicted_age_df["predictedAge"]))
print(f"📊 Evaluation MSE (Gradient Boosting): {mse:.4f}")
print(f"📊 Evaluation MAE (Gradient Boosting): {mae:.4f}")

# Step 6: Show the prediction table
display(predicted_age_df.head())

# Step 7: Plot histogram of predicted ages
plt.figure(figsize=(8, 5))
plt.hist(predicted_age_df["predictedAge"], bins=30, edgecolor='black')
plt.xlabel("Predicted Age")
plt.ylabel("Count")
plt.title("Distribution of Predicted Ages - Evaluation Set (Gradient Boosting)")
plt.grid(True, linestyle='--', linewidth=0.5)
plt.show()

# Step 8: Save results
eval_output_path = os.path.join(output_dir, "EvaluationSet_Predicted_vs_Actual_GB.csv")
predicted_age_df.to_csv(eval_output_path)
print(f"✅ Evaluation predictions saved to: {eval_output_path}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer

# Exhaustive search over 2 * 3 * 3 * 2 * 2 = 72 Gradient Boosting settings,
# each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
)

# Seeded estimator that the search clones for every candidate.
gb_base = GradientBoostingRegressor(random_state=42)

# greater_is_better=False makes the scorer return negated MSE, so that
# "higher is better" holds for the search.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Build the search and fit it on the training split in one expression
# (fit returns the fitted search object).
grid_search = GridSearchCV(
    estimator=gb_base,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Winning configuration, refitted on the whole training split.
best_gb = grid_search.best_estimator_
print(f"✅ Best Params: {grid_search.best_params_}")

# Held-out performance of the tuned model.
y_pred_best_gb = best_gb.predict(X_test)
mse_best_gb = mean_squared_error(y_test, y_pred_best_gb)
print(f"🎯 Tuned Gradient Boosting Test MSE: {mse_best_gb:.4f}")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Score the tuned Gradient Boosting model (best_gb) on the external
# evaluation set.

# Step 1: Align evaluation data with training features. reindex keeps the
# stable_lasso CpGs in training order and creates any missing CpG as 0.0,
# without relying on .loc enlargement with a list of new labels.
eval_features = evaluation_data.dnam.T
missing_cpg_sites = list(set(stable_lasso) - set(eval_features.columns))
pruned_data = eval_features.reindex(columns=stable_lasso, fill_value=0.0)

# Step 2: Predict with the tuned Gradient Boosting Regressor.
# best_gb was fitted on X_train, which is a bare NumPy array after the
# "Random Forest Regressor 2.2" split; passing an array here as well avoids
# scikit-learn's "X has feature names" warning. Column order is already fixed
# by the reindex above.
y_eval_pred = best_gb.predict(pruned_data.to_numpy())

# Step 3: Create DataFrame with predictions and actual ages
# (ages are aligned to the sample ids by label)
predicted_age_df = pd.DataFrame({
    'predictedAge': y_eval_pred,
    'ActualAge': evaluation_data.metadata.age
}, index=evaluation_data.dnam.columns)
predicted_age_df.index.name = 'sampleId'

# Step 4: Plot actual vs predicted (dashed black line is y = x)
plt.figure(figsize=(10, 8))
plt.scatter(predicted_age_df["ActualAge"], predicted_age_df["predictedAge"], alpha=0.7, edgecolors='w', linewidth=0.5)
plt.plot([min(predicted_age_df["ActualAge"]), max(predicted_age_df["ActualAge"])],
         [min(predicted_age_df["ActualAge"]), max(predicted_age_df["ActualAge"])],
         'k--', lw=3)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('Tuned Gradient Boosting - Evaluation Set: Actual vs. Predicted Age')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.show()

# Step 5: Compute and print metrics
mse = np.mean((predicted_age_df["ActualAge"] - predicted_age_df["predictedAge"]) ** 2)
mae = np.mean(np.abs(predicted_age_df["ActualAge"] - predicted_age_df["predictedAge"]))
print(f"📊 Evaluation MSE (Tuned GB): {mse:.4f}")
print(f"📊 Evaluation MAE (Tuned GB): {mae:.4f}")

# Step 6: Show the prediction table
display(predicted_age_df.head())

# Step 7: Plot histogram of predicted ages
plt.figure(figsize=(8, 5))
plt.hist(predicted_age_df["predictedAge"], bins=30, edgecolor='black')
plt.xlabel("Predicted Age")
plt.ylabel("Count")
plt.title("Distribution of Predicted Ages - Evaluation Set (Tuned GB)")
plt.grid(True, linestyle='--', linewidth=0.5)
plt.show()

# Step 8: Save results
eval_output_path = os.path.join(output_dir, "EvaluationSet_Predicted_vs_Actual_TunedGB.csv")
predicted_age_df.to_csv(eval_output_path)
print(f"✅ Evaluation predictions saved to: {eval_output_path}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

# Final recap built from whatever predicted_age_df currently holds (it is
# rebound by each evaluation cell; the labels below say "Tuned GB").

# Per-sample error, computed once and reused for both metrics.
final_age_error = predicted_age_df["ActualAge"] - predicted_age_df["predictedAge"]
mse_final = np.mean(final_age_error ** 2)
mae_final = np.mean(np.abs(final_age_error))

# Headline numbers.
print(f"📉 Final Evaluation MSE (Tuned GB): {mse_final:.4f}")
print(f"📊 Final Evaluation MAE (Tuned GB): {mae_final:.4f}")

# First ten rows of the predicted-vs-actual table.
predicted_age_df.index.name = 'sampleId'
display(predicted_age_df.head(10))

# Distribution of the predicted ages.
plt.figure(figsize=(8, 5))
plt.hist(predicted_age_df["predictedAge"], bins=30, edgecolor='black')
plt.xlabel("Predicted Age")
plt.ylabel("Count")
plt.title("Distribution of Predicted Ages - Evaluation Set (Tuned GB)")
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

# Write the full table under the "Final_" file name.
eval_output_path = os.path.join(
    output_dir, "Final_EvaluationSet_Predicted_vs_Actual_TunedGB.csv"
)
predicted_age_df.to_csv(eval_output_path)
print(f"✅ Final prediction table saved to: {eval_output_path}")
