# Agent Readiness ML: Model Training

**Phase 2:** Train initial Random Forest and XGBoost models  
**Goal:** Beat baseline MAE of 24.04 on validation set  
**Strategy:** Start with reasonable hyperparameters, evaluate, then optimize

---

## Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

# Visualization settings.
# The 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6; earlier
# releases expose the same style sheet as 'seaborn-darkgrid'. Fall back to
# the old name so this cell also runs on an older matplotlib.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

print("‚úì Libraries imported successfully!")

---
## STEP 1: LOAD PREPARED DATA

In [None]:
# Read the train / validation / test splits from ../data/processed and
# report their shapes.
print("=" * 70, "LOADING PREPARED DATA", "=" * 70, sep="\n")

# Targets: each CSV is flattened into a 1-D array.
y_train = np.ravel(pd.read_csv('../data/processed/y_train.csv').values)
y_val = np.ravel(pd.read_csv('../data/processed/y_val.csv').values)
y_test = np.ravel(pd.read_csv('../data/processed/y_test.csv').values)

# Feature matrices stay as DataFrames so the column names remain available.
X_train = pd.read_csv('../data/processed/X_train.csv')
X_val = pd.read_csv('../data/processed/X_val.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')

print("\n‚úì Data loaded successfully!\n")
for heading, tag, X_part, y_part in (
    ("Training Set:", "train", X_train, y_train),
    ("\nValidation Set:", "val", X_val, y_val),
    ("\nTest Set (HELD OUT):", "test", X_test, y_test),
):
    x_label = f"X_{tag}:"
    y_label = f"y_{tag}:"
    print(heading)
    # Labels are padded to a common width so the shapes line up.
    print(f"  {x_label:<8} {X_part.shape} | {y_label:<8} {y_part.shape}")

n_features = X_train.shape[1]
leading_features = list(X_train.columns[:5])
print(f"\nüìã Features: {n_features}")
print(f"üìä Feature names (first 5): {leading_features}")

---
## STEP 2: BASELINE PERFORMANCE

In [None]:
# Reference point for every later model: a constant predictor that always
# outputs the mean of the training target, scored on the validation set.
print("\n" + "=" * 70, "BASELINE MODEL: Mean Prediction", "=" * 70, sep="\n")

# One copy of the training mean per validation sample.
baseline_pred = np.full(y_val.shape[0], y_train.mean())

baseline_r2 = r2_score(y_val, baseline_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_val, baseline_pred))
baseline_mae = mean_absolute_error(y_val, baseline_pred)

print("\nüìä BASELINE METRICS (Validation Set):\n")
print(f"   Strategy: Always predict {y_train.mean():.2f}")
for metric_line in (
    f"   MAE:  {baseline_mae:.2f}",
    f"   RMSE: {baseline_rmse:.2f}",
    f"   R¬≤:   {baseline_r2:.4f}",
):
    print(metric_line)
print(f"\n   üéØ Goal: Our ML models must beat MAE < {baseline_mae:.2f}")

---
## STEP 3: RANDOM FOREST MODEL

### 3.1 Train Random Forest

In [None]:
# Fit the initial Random Forest on the training split with a fixed,
# hand-picked configuration.
print("\n" + "=" * 70, "RANDOM FOREST REGRESSOR", "=" * 70, sep="\n")

rf_model = RandomForestRegressor(
    n_estimators=200, max_depth=15, min_samples_leaf=3,
    random_state=42, n_jobs=-1, verbose=0,
)

# Echo the settings back from the estimator itself.
print("\nüå≤ Hyperparameters:")
for param in ("n_estimators", "max_depth", "min_samples_leaf", "random_state"):
    label = param + ":"
    print(f"   {label:<18}{getattr(rf_model, param)}")

print("\n‚è≥ Training Random Forest...")
rf_model.fit(X_train, y_train)
print("‚úì Training complete!")

### 3.2 Evaluate Random Forest

In [None]:
# Score the fitted Random Forest on the training and validation splits and
# compare its validation MAE with the mean-prediction baseline.
rf_train_pred = rf_model.predict(X_train)
rf_val_pred = rf_model.predict(X_val)

# Metrics, grouped by kind (train first, then validation).
rf_train_mae = mean_absolute_error(y_train, rf_train_pred)
rf_val_mae = mean_absolute_error(y_val, rf_val_pred)

rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_train_pred))
rf_val_rmse = np.sqrt(mean_squared_error(y_val, rf_val_pred))

rf_train_r2 = r2_score(y_train, rf_train_pred)
rf_val_r2 = r2_score(y_val, rf_val_pred)

print("\nüìä RANDOM FOREST PERFORMANCE:\n")
for split_header, split_mae, split_rmse, split_r2 in (
    ("   TRAINING SET:", rf_train_mae, rf_train_rmse, rf_train_r2),
    ("\n   VALIDATION SET:", rf_val_mae, rf_val_rmse, rf_val_r2),
):
    print(split_header)
    print(f"     MAE:  {split_mae:.2f}")
    print(f"     RMSE: {split_rmse:.2f}")
    print(f"     R¬≤:   {split_r2:.4f}")

# Relative reduction in validation MAE, in percent of the baseline MAE.
improvement = ((baseline_mae - rf_val_mae) / baseline_mae) * 100
print(f"\n   üéØ Improvement over baseline: {improvement:.1f}%")
verdict = (
    f"   ‚úÖ SUCCESS! Beat baseline by {baseline_mae - rf_val_mae:.2f} MAE points"
    if rf_val_mae < baseline_mae
    else "   ‚ùå Did not beat baseline"
)
print(verdict)

### 3.3 Random Forest: Predictions vs Actual

In [None]:
# Predicted-vs-actual scatter for the Random Forest: training split on the
# left, validation split on the right. The figure is saved under ../outputs.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, actual, predicted, split name, MAE, R2, extra scatter kwargs).
# The training panel takes the default marker colour; validation is orange.
rf_panels = (
    (axes[0], y_train, rf_train_pred, 'Training Set', rf_train_mae, rf_train_r2, {}),
    (axes[1], y_val, rf_val_pred, 'Validation Set', rf_val_mae, rf_val_r2, {'color': 'orange'}),
)
for ax, actual, predicted, split_name, panel_mae, panel_r2, scatter_extra in rf_panels:
    ax.scatter(actual, predicted, alpha=0.6, s=50, edgecolors='black', linewidth=0.5, **scatter_extra)
    # Diagonal y = x spanning the range of the actual values.
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')
    ax.set_xlabel('Actual Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Score', fontsize=12, fontweight='bold')
    ax.set_title(f'Random Forest: {split_name}\nMAE={panel_mae:.2f}, R¬≤={panel_r2:.3f}', fontsize=13, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/rf_predictions_vs_actual.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/rf_predictions_vs_actual.png")

### 3.4 Random Forest: Feature Importance

In [None]:
# Rank features by the Random Forest's importance scores, plot the top 20 as
# a horizontal bar chart and list the top 10 in text form.
rf_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf_model.feature_importances_}
).sort_values('Importance', ascending=False)

top_20_rf = rf_importances.head(20).copy()
# Shorter tick labels: drop the 'has_' marker from the feature names.
top_20_rf['Feature_Short'] = top_20_rf['Feature'].str.replace('has_', '')

plt.figure(figsize=(12, 8))
bar_positions = range(len(top_20_rf))
bars = plt.barh(bar_positions, top_20_rf['Importance'], color='steelblue', edgecolor='black')

# Three colour tiers: > 0.08, > 0.05, and everything else.
for bar, weight in zip(bars, top_20_rf['Importance']):
    if weight > 0.08:
        tier_color = 'darkgreen'
    elif weight > 0.05:
        tier_color = 'steelblue'
    else:
        tier_color = 'lightblue'
    bar.set_color(tier_color)

plt.yticks(bar_positions, top_20_rf['Feature_Short'])
plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
plt.title('Random Forest: Top 20 Most Important Features', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('../outputs/rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/rf_feature_importance.png")
print("\nüîù TOP 10 FEATURES (Random Forest):\n")
top_10_rf = top_20_rf.head(10)
for rank, (short_name, weight) in enumerate(
    zip(top_10_rf['Feature_Short'], top_10_rf['Importance']), start=1
):
    print(f"   {rank:2d}. {short_name:30s}  {weight:.4f}")

---
## STEP 4: XGBOOST MODEL

### 4.1 Train XGBoost

In [None]:
# Fit the initial XGBoost regressor on the training split with a fixed,
# hand-picked configuration.
print("\n" + "=" * 70, "XGBOOST REGRESSOR", "=" * 70, sep="\n")

xgb_model = xgb.XGBRegressor(
    n_estimators=200, learning_rate=0.05, max_depth=8,
    random_state=42, n_jobs=-1, verbosity=0,
)

# Echo the settings back from the estimator itself.
print("\nüöÄ Hyperparameters:")
for param in ("n_estimators", "learning_rate", "max_depth", "random_state"):
    label = param + ":"
    print(f"   {label:<15}{getattr(xgb_model, param)}")

print("\n‚è≥ Training XGBoost...")
xgb_model.fit(X_train, y_train)
print("‚úì Training complete!")

### 4.2 Evaluate XGBoost

In [None]:
# Score the fitted XGBoost model on the training and validation splits and
# compare its validation MAE with the mean-prediction baseline.
xgb_train_pred = xgb_model.predict(X_train)
xgb_val_pred = xgb_model.predict(X_val)

# Metrics, grouped by kind (train first, then validation).
xgb_train_mae = mean_absolute_error(y_train, xgb_train_pred)
xgb_val_mae = mean_absolute_error(y_val, xgb_val_pred)

xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
xgb_val_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))

xgb_train_r2 = r2_score(y_train, xgb_train_pred)
xgb_val_r2 = r2_score(y_val, xgb_val_pred)

print("\nüìä XGBOOST PERFORMANCE:\n")
for split_header, split_mae, split_rmse, split_r2 in (
    ("   TRAINING SET:", xgb_train_mae, xgb_train_rmse, xgb_train_r2),
    ("\n   VALIDATION SET:", xgb_val_mae, xgb_val_rmse, xgb_val_r2),
):
    print(split_header)
    print(f"     MAE:  {split_mae:.2f}")
    print(f"     RMSE: {split_rmse:.2f}")
    print(f"     R¬≤:   {split_r2:.4f}")

# Relative reduction in validation MAE, in percent of the baseline MAE.
improvement = ((baseline_mae - xgb_val_mae) / baseline_mae) * 100
print(f"\n   üéØ Improvement over baseline: {improvement:.1f}%")
verdict = (
    f"   ‚úÖ SUCCESS! Beat baseline by {baseline_mae - xgb_val_mae:.2f} MAE points"
    if xgb_val_mae < baseline_mae
    else "   ‚ùå Did not beat baseline"
)
print(verdict)

### 4.3 XGBoost: Predictions vs Actual

In [None]:
# Predicted-vs-actual scatter for XGBoost: training split on the left,
# validation split on the right. The figure is saved under ../outputs.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, actual, predicted, split name, MAE, R2, marker colour)
xgb_panels = (
    (axes[0], y_train, xgb_train_pred, 'Training Set', xgb_train_mae, xgb_train_r2, 'green'),
    (axes[1], y_val, xgb_val_pred, 'Validation Set', xgb_val_mae, xgb_val_r2, 'purple'),
)
for ax, actual, predicted, split_name, panel_mae, panel_r2, marker_color in xgb_panels:
    ax.scatter(actual, predicted, alpha=0.6, s=50, edgecolors='black', linewidth=0.5, color=marker_color)
    # Diagonal y = x spanning the range of the actual values.
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')
    ax.set_xlabel('Actual Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Score', fontsize=12, fontweight='bold')
    ax.set_title(f'XGBoost: {split_name}\nMAE={panel_mae:.2f}, R¬≤={panel_r2:.3f}', fontsize=13, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/xgb_predictions_vs_actual.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/xgb_predictions_vs_actual.png")

### 4.4 XGBoost: Feature Importance

In [None]:
# Rank features by XGBoost's importance scores, plot the top 20 as a
# horizontal bar chart and list the top 10 in text form.
xgb_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': xgb_model.feature_importances_}
).sort_values('Importance', ascending=False)

top_20_xgb = xgb_importances.head(20).copy()
# Shorter tick labels: drop the 'has_' marker from the feature names.
top_20_xgb['Feature_Short'] = top_20_xgb['Feature'].str.replace('has_', '')

plt.figure(figsize=(12, 8))
bar_positions = range(len(top_20_xgb))
bars = plt.barh(bar_positions, top_20_xgb['Importance'], color='purple', edgecolor='black')

# Three colour tiers: > 0.08, > 0.05, and everything else.
for bar, weight in zip(bars, top_20_xgb['Importance']):
    if weight > 0.08:
        tier_color = 'darkgreen'
    elif weight > 0.05:
        tier_color = 'purple'
    else:
        tier_color = 'plum'
    bar.set_color(tier_color)

plt.yticks(bar_positions, top_20_xgb['Feature_Short'])
plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
plt.title('XGBoost: Top 20 Most Important Features', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('../outputs/xgb_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/xgb_feature_importance.png")
print("\nüîù TOP 10 FEATURES (XGBoost):\n")
top_10_xgb = top_20_xgb.head(10)
for rank, (short_name, weight) in enumerate(
    zip(top_10_xgb['Feature_Short'], top_10_xgb['Importance']), start=1
):
    print(f"   {rank:2d}. {short_name:30s}  {weight:.4f}")

---
## STEP 5: MODEL COMPARISON

In [None]:
# Tabulate validation metrics for the baseline and both trained models, then
# pick the trained model with the lowest validation MAE.
print("\n" + "=" * 70)
print("MODEL COMPARISON")
print("=" * 70)

# Create comparison table (row 0 is the baseline, rows 1+ are trained models)
comparison = pd.DataFrame({
    'Model': ['Baseline (Mean)', 'Random Forest', 'XGBoost'],
    'MAE': [baseline_mae, rf_val_mae, xgb_val_mae],
    'RMSE': [baseline_rmse, rf_val_rmse, xgb_val_rmse],
    'R¬≤': [baseline_r2, rf_val_r2, xgb_val_r2]
})

print("\nüìä VALIDATION SET PERFORMANCE:\n")
print(comparison.to_string(index=False))

# Find best model among the trained ones (the baseline row is skipped).
# idxmin() returns an index *label*, so the winning row is fetched with
# label-based .loc; positional .iloc would only agree with it while the
# frame keeps its default 0..n-1 index.
best_model_idx = comparison['MAE'].iloc[1:].idxmin()
best_row = comparison.loc[best_model_idx]
best_model_name = best_row['Model']
best_mae = best_row['MAE']

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   MAE: {best_mae:.2f}")
print(f"   Improvement over baseline: {((baseline_mae - best_mae) / baseline_mae * 100):.1f}%")

In [None]:
# Visualize comparison: one bar chart per metric (MAE, RMSE, R2), with the
# better of the trained models highlighted in each chart.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

metrics = ['MAE', 'RMSE', 'R¬≤']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

for idx, metric in enumerate(metrics):
    ax = axes[idx]
    values = comparison[metric].values
    bars = ax.bar(comparison['Model'], values, color=colors[idx], edgecolor='black', linewidth=1.5)

    # Highlight best (lowest MAE/RMSE, highest R2). Position 0 is the
    # baseline, so it is excluded from the search and the offset added back.
    if metric in ['MAE', 'RMSE']:
        best_idx = np.argmin(values[1:]) + 1
    else:
        best_idx = np.argmax(values[1:]) + 1
    bars[best_idx].set_color('gold')
    bars[best_idx].set_edgecolor('darkred')
    bars[best_idx].set_linewidth(3)

    ax.set_ylabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=15)

    # Add value labels just outside the tip of each bar. A negative bar
    # (R2 can be below zero) grows downwards, so its label is anchored by
    # its top edge; otherwise the text would be drawn inside the bar.
    for bar in bars:
        height = bar.get_height()
        label_va = 'bottom' if height >= 0 else 'top'
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{height:.2f}', ha='center', va=label_va, fontweight='bold')

plt.suptitle('Model Performance Comparison (Validation Set)', fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/model_comparison.png")

In [None]:
# Validation-set predicted-vs-actual scatter for both models, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Both panels share the same diagonal: y = x over the validation target range.
val_lo, val_hi = y_val.min(), y_val.max()

# (axis, title, predictions, MAE, R2, marker colour, legend label)
model_panels = (
    (axes[0], 'Random Forest', rf_val_pred, rf_val_mae, rf_val_r2, 'steelblue', 'RF Predictions'),
    (axes[1], 'XGBoost', xgb_val_pred, xgb_val_mae, xgb_val_r2, 'purple', 'XGB Predictions'),
)
for ax, model_title, predicted, panel_mae, panel_r2, marker_color, legend_label in model_panels:
    ax.scatter(y_val, predicted, alpha=0.7, s=60, edgecolors='black', linewidth=0.5, color=marker_color, label=legend_label)
    ax.plot([val_lo, val_hi], [val_lo, val_hi], 'r--', lw=2, label='Perfect')
    ax.set_xlabel('Actual Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Score', fontsize=12, fontweight='bold')
    ax.set_title(f'{model_title}\nMAE={panel_mae:.2f}, R¬≤={panel_r2:.3f}', fontsize=13, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Predictions vs Actual: Model Comparison (Validation Set)', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/predictions_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/predictions_comparison.png")

---
## STEP 6: ERROR ANALYSIS

### 6.1 Identify Worst Predictions

In [None]:
# Collect per-sample absolute errors on the validation set for both models
# and print the five largest misses of each.
print("\n" + "=" * 70, "ERROR ANALYSIS", "=" * 70, sep="\n")

rf_errors = np.abs(y_val - rf_val_pred)
xgb_errors = np.abs(y_val - xgb_val_pred)

# One row per validation sample.
error_df = pd.DataFrame({
    'Actual': y_val,
    'RF_Predicted': rf_val_pred,
    'RF_Error': rf_errors,
    'XGB_Predicted': xgb_val_pred,
    'XGB_Error': xgb_errors,
})

rf_worst = error_df.nlargest(5, 'RF_Error')
xgb_worst = error_df.nlargest(5, 'XGB_Error')

# The printed "Website #" is the row's index label in error_df plus one.
for heading, worst_rows, pred_col, err_col in (
    ("\nüî¥ RANDOM FOREST - 5 WORST PREDICTIONS:\n", rf_worst, 'RF_Predicted', 'RF_Error'),
    ("\nüî¥ XGBOOST - 5 WORST PREDICTIONS:\n", xgb_worst, 'XGB_Predicted', 'XGB_Error'),
):
    print(heading)
    for row_label, actual, predicted, abs_err in zip(
        worst_rows.index, worst_rows['Actual'], worst_rows[pred_col], worst_rows[err_col]
    ):
        print(f"   Website #{row_label+1}: Actual={actual:.1f}, Predicted={predicted:.1f}, Error={abs_err:.1f}")

### 6.2 Error Patterns

In [None]:
# Analyze error patterns by score range: bucket each validation sample by its
# actual score, then summarise the absolute error per bucket for each model.
# pd.cut builds right-closed intervals by default, so without
# include_lowest=True an actual score of exactly 0 would fall outside
# (0, 30], become NaN and silently drop out of the tables below.
error_df['Score_Range'] = pd.cut(
    error_df['Actual'],
    bins=[0, 30, 60, 80, 100],
    labels=['Low (0-30)', 'Med (30-60)', 'High (60-80)', 'Very High (80-100)'],
    include_lowest=True,
)

# observed=False keeps every score range in the output (count 0 when empty),
# independent of the pandas version's default for categorical groupers.
print("\nüìä ERROR PATTERNS BY SCORE RANGE:\n")
print("Random Forest:")
print(error_df.groupby('Score_Range', observed=False)['RF_Error'].agg(['mean', 'max', 'count']))
print("\nXGBoost:")
print(error_df.groupby('Score_Range', observed=False)['XGB_Error'].agg(['mean', 'max', 'count']))

### 6.3 Residual Plots

In [None]:
# Residuals (actual - predicted) against the predicted score on the
# validation set, one panel per model; the red dashed line marks zero.
xgb_residuals = y_val - xgb_val_pred
rf_residuals = y_val - rf_val_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, model name, predictions, residuals, marker colour)
residual_panels = (
    (axes[0], 'Random Forest', rf_val_pred, rf_residuals, 'steelblue'),
    (axes[1], 'XGBoost', xgb_val_pred, xgb_residuals, 'purple'),
)
for ax, model_title, predicted, residuals, marker_color in residual_panels:
    ax.scatter(predicted, residuals, alpha=0.6, s=50, edgecolors='black', linewidth=0.5, color=marker_color)
    ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    ax.set_xlabel('Predicted Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Residual (Actual - Predicted)', fontsize=12, fontweight='bold')
    # The title reports the signed mean residual.
    ax.set_title(f'{model_title}: Residual Plot\nMean Error={residuals.mean():.2f}', fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/residual_plots.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: outputs/residual_plots.png")

---
## STEP 7: SAVE MODELS

In [None]:
# Persist both fitted models with joblib. Each file holds a dict with the
# estimator, the hyperparameters it was built with, its train/validation
# metrics and the ordered list of feature names it was fitted on.
import os

print("\n" + "=" * 70)
print("SAVING MODELS")
print("=" * 70)

# joblib.dump does not create missing parent directories; make sure the
# target folder exists so the fitted models are not lost to a path error.
os.makedirs('../models', exist_ok=True)

# Save Random Forest
# Hyperparameters are read back from the estimator rather than retyped, so
# the stored metadata cannot drift from the model that was actually fitted.
rf_metadata = {
    'model': rf_model,
    'hyperparameters': {
        'n_estimators': rf_model.n_estimators,
        'max_depth': rf_model.max_depth,
        'min_samples_leaf': rf_model.min_samples_leaf,
        'random_state': rf_model.random_state
    },
    'performance': {
        'train_mae': rf_train_mae,
        'train_r2': rf_train_r2,
        'val_mae': rf_val_mae,
        'val_r2': rf_val_r2
    },
    'feature_names': list(X_train.columns)
}
joblib.dump(rf_metadata, '../models/random_forest_initial.joblib')
print("\n‚úì Saved: models/random_forest_initial.joblib")
print(f"   Val MAE: {rf_val_mae:.2f}, Val R¬≤: {rf_val_r2:.3f}")

# Save XGBoost (same layout as the Random Forest bundle)
xgb_metadata = {
    'model': xgb_model,
    'hyperparameters': {
        'n_estimators': xgb_model.n_estimators,
        'learning_rate': xgb_model.learning_rate,
        'max_depth': xgb_model.max_depth,
        'random_state': xgb_model.random_state
    },
    'performance': {
        'train_mae': xgb_train_mae,
        'train_r2': xgb_train_r2,
        'val_mae': xgb_val_mae,
        'val_r2': xgb_val_r2
    },
    'feature_names': list(X_train.columns)
}
joblib.dump(xgb_metadata, '../models/xgboost_initial.joblib')
print("\n‚úì Saved: models/xgboost_initial.joblib")
print(f"   Val MAE: {xgb_val_mae:.2f}, Val R¬≤: {xgb_val_r2:.3f}")

print("\n" + "=" * 70)
print("‚úÖ MODEL TRAINING COMPLETE!")
print("=" * 70)

---
## Summary & Next Steps

In [None]:
# Closing recap: goal, validation results per model, the winning model, the
# generated artifacts and the planned follow-up work.
print("\n" + "=" * 70, "TRAINING SUMMARY", "=" * 70, sep="\n")

# Percentage reduction in validation MAE relative to the baseline.
rf_gain_pct = (baseline_mae - rf_val_mae) / baseline_mae * 100
xgb_gain_pct = (baseline_mae - xgb_val_mae) / baseline_mae * 100

print(f"\nüéØ GOAL: Beat baseline MAE of {baseline_mae:.2f}")
print("\nüìä RESULTS (Validation Set):\n")
print(f"   Baseline:      MAE={baseline_mae:.2f}  R¬≤={baseline_r2:.4f}")
print(f"   Random Forest: MAE={rf_val_mae:.2f}  R¬≤={rf_val_r2:.4f}  ({rf_gain_pct:.1f}% improvement)")
print(f"   XGBoost:       MAE={xgb_val_mae:.2f}  R¬≤={xgb_val_r2:.4f}  ({xgb_gain_pct:.1f}% improvement)")

print(f"\nüèÜ WINNER: {best_model_name} (MAE={best_mae:.2f})")

print("\nüìÅ OUTPUTS GENERATED:")
for artifact in (
    "rf_predictions_vs_actual.png",
    "xgb_predictions_vs_actual.png",
    "rf_feature_importance.png",
    "xgb_feature_importance.png",
    "model_comparison.png",
    "predictions_comparison.png",
    "residual_plots.png",
    "random_forest_initial.joblib",
    "xgboost_initial.joblib",
):
    print(f"   ‚Ä¢ {artifact}")

print("\nüöÄ NEXT STEPS:")
for step_no, step in enumerate(
    (
        "Hyperparameter tuning (Grid Search / Random Search)",
        "Feature engineering (interactions, polynomial features)",
        "Ensemble methods (stacking, blending)",
        "Final evaluation on test set",
    ),
    start=1,
):
    print(f"   {step_no}. {step}")

print("\n" + "=" * 70)