# Model Optimization & Selection

Comprehensive model training, hyperparameter tuning, and selection for house price prediction.

## Overview
- Linear models: Ridge, Lasso, ElasticNet with regularization
- Tree-based models: LightGBM, XGBoost
- 5-Fold Cross-Validation for model selection
- Comprehensive evaluation metrics

## Data
- Train: 1239 samples (85%)
- Test: 219 samples (15%)
- Features: 176 numeric features (already encoded and scaled)
- Target: SalePrice (log-transformed)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

# Tree-based models are optional dependencies: record whether each library
# could be imported so that later cells can skip the matching training step.
try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("LightGBM not available. Install with: pip install lightgbm")

try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the legacy 'seaborn' name. Use whichever one is
# installed rather than failing with OSError on an unknown style name. If
# neither is present the Matplotlib default style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("✅ Libraries imported successfully")
print(f"LightGBM available: {LIGHTGBM_AVAILABLE}")
print(f"XGBoost available: {XGBOOST_AVAILABLE}")


In [None]:
# Read the pre-processed train/test splits from disk
train_df = pd.read_csv('../data/processed/train_encoded.csv')
test_df = pd.read_csv('../data/processed/test_encoded.csv')

# Every column except the target counts as a feature
n_feature_columns = train_df.shape[1] - 1

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Features: {n_feature_columns}")
print("Target: SalePrice")

# Split each frame into its target column and the remaining feature matrix
y_train = train_df['SalePrice']
X_train = train_df.drop(columns='SalePrice')
y_test = test_df['SalePrice']
X_test = test_df.drop(columns='SalePrice')

print("\n✅ Data loaded successfully")
for _label, _data in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f"{_label}: {_data.shape}")

# Report how many missing cells each piece contains
print("\nMissing values:")
for _label, _missing in (
    ('X_train', X_train.isnull().sum().sum()),
    ('y_train', y_train.isnull().sum()),
    ('X_test', X_test.isnull().sum().sum()),
    ('y_test', y_test.isnull().sum()),
):
    print(f"{_label}: {_missing}")


In [None]:
# Define evaluation metrics
def calculate_metrics(y_true, y_pred, model_name="Model"):
    """Calculate comprehensive evaluation metrics.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict with the keys ``'Model'``, ``'RMSE'``, ``'MAE'``, ``'R²'`` and
        ``'RMSLE'``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # RMSLE (Root Mean Squared Log Error) - commonly used for house price
    # prediction. No extra log is applied here: the value is deliberately the
    # same number as RMSE, so it is reused instead of being recomputed.
    # NOTE(review): this is only a true RMSLE of the raw prices if the inputs
    # are already on the log scale (the notebook header says the target is
    # log-transformed) - confirm against the preprocessing step.
    rmsle = rmse

    return {
        'Model': model_name,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'RMSLE': rmsle
    }

def cross_validate_model(model, X, y, cv_folds=5, model_name="Model"):
    """Perform shuffled K-fold cross-validation and return summary scores.

    A fresh copy of ``model`` is fitted on every fold, so the object passed in
    by the caller is never fitted or otherwise modified.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature table; must support positional ``.iloc`` indexing.
        y: Target values; must support positional ``.iloc`` indexing.
        cv_folds: Number of folds passed to ``KFold``.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict with ``'Model'`` plus the mean and standard deviation across
        folds of RMSE, MAE and R² (keys ``'CV_RMSE_mean'``, ``'CV_RMSE_std'``,
        ``'CV_MAE_mean'``, ``'CV_MAE_std'``, ``'CV_R²_mean'``, ``'CV_R²_std'``).
    """
    # Imported here so the function does not depend on another cell's imports.
    from sklearn.base import clone

    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    rmse_scores = []
    mae_scores = []
    r2_scores = []

    for train_idx, val_idx in kf.split(X):
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

        # safe=False falls back to a deep copy for objects that are not
        # scikit-learn estimators, so any fit/predict object is still accepted.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_fold_train, y_fold_train)
        y_pred = fold_model.predict(X_fold_val)

        rmse_scores.append(np.sqrt(mean_squared_error(y_fold_val, y_pred)))
        mae_scores.append(mean_absolute_error(y_fold_val, y_pred))
        r2_scores.append(r2_score(y_fold_val, y_pred))

    return {
        'Model': model_name,
        'CV_RMSE_mean': np.mean(rmse_scores),
        'CV_RMSE_std': np.std(rmse_scores),
        'CV_MAE_mean': np.mean(mae_scores),
        'CV_MAE_std': np.std(mae_scores),
        'CV_R²_mean': np.mean(r2_scores),
        'CV_R²_std': np.std(r2_scores)
    }

print("✅ Evaluation functions defined")


## 1. Linear Models with Regularization


In [None]:
# Search spaces for the regularised linear models: each key is an estimator
# parameter name and each list holds the candidate values to try.
ridge_params = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 50, 100])

lasso_params = dict(alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10])

elasticnet_params = dict(
    alpha=[0.001, 0.01, 0.1, 1],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

# Untuned base estimators; Lasso and ElasticNet get a raised iteration cap
ridge_model, lasso_model, elasticnet_model = (
    Ridge(random_state=42),
    Lasso(max_iter=2000, random_state=42),
    ElasticNet(max_iter=2000, random_state=42),
)

print("✅ Linear models initialized")


In [None]:
# Settings shared by all three grid searches: 5-fold CV scored by negated MSE,
# run on all cores with progress output.
_grid_search_options = dict(
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

# Ridge
print("Training Ridge Regression...")
ridge_search = GridSearchCV(ridge_model, ridge_params, **_grid_search_options)
ridge_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so flip the sign before reporting it
print(f"Best Ridge alpha: {ridge_search.best_params_['alpha']}")
print(f"Best Ridge CV score: {-ridge_search.best_score_:.6f}")

# Lasso
print("\nTraining Lasso Regression...")
lasso_search = GridSearchCV(lasso_model, lasso_params, **_grid_search_options)
lasso_search.fit(X_train, y_train)

print(f"Best Lasso alpha: {lasso_search.best_params_['alpha']}")
print(f"Best Lasso CV score: {-lasso_search.best_score_:.6f}")

# ElasticNet
print("\nTraining ElasticNet...")
elasticnet_search = GridSearchCV(
    elasticnet_model, elasticnet_params, **_grid_search_options
)
elasticnet_search.fit(X_train, y_train)

print(f"Best ElasticNet params: {elasticnet_search.best_params_}")
print(f"Best ElasticNet CV score: {-elasticnet_search.best_score_:.6f}")


In [None]:
# Score every tuned linear model on the held-out test set.
# Each search object predicts with the estimator it selected.
ridge_pred = ridge_search.predict(X_test)
lasso_pred = lasso_search.predict(X_test)
elasticnet_pred = elasticnet_search.predict(X_test)

# One result row per model: test metrics followed by the winning parameters
# and the cross-validation MSE (best_score_ is negated, hence the sign flip).
ridge_metrics = {
    **calculate_metrics(y_test, ridge_pred, "Ridge"),
    'Best_Params': ridge_search.best_params_,
    'CV_Score': -ridge_search.best_score_,
}
lasso_metrics = {
    **calculate_metrics(y_test, lasso_pred, "Lasso"),
    'Best_Params': lasso_search.best_params_,
    'CV_Score': -lasso_search.best_score_,
}
elasticnet_metrics = {
    **calculate_metrics(y_test, elasticnet_pred, "ElasticNet"),
    'Best_Params': elasticnet_search.best_params_,
    'CV_Score': -elasticnet_search.best_score_,
}

linear_results = [ridge_metrics, lasso_metrics, elasticnet_metrics]

# Tabulate and show the headline columns
linear_df = pd.DataFrame(linear_results)
_summary_columns = ['Model', 'RMSE', 'MAE', 'R²', 'CV_Score']
print("\n📊 Linear Models Results:")
print(linear_df[_summary_columns].round(4))


## 2. Tree-Based Models


In [None]:
# Candidate values sampled by the randomized searches for the tree-based
# models. A grid is only defined when its library was importable.
if LIGHTGBM_AVAILABLE:
    lgb_params = dict(
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        num_leaves=[31, 50, 100, 200],
        max_depth=[3, 5, 7, 10],
        min_child_samples=[20, 50, 100],
        subsample=[0.8, 0.9, 1.0],
        colsample_bytree=[0.8, 0.9, 1.0],
    )

if XGBOOST_AVAILABLE:
    xgb_params = dict(
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        max_depth=[3, 5, 7, 10],
        min_child_weight=[1, 3, 5],
        subsample=[0.8, 0.9, 1.0],
        colsample_bytree=[0.8, 0.9, 1.0],
        gamma=[0, 0.1, 0.2],
    )

print("✅ Tree-based model parameters defined")


In [None]:
# Collects one result row (test metrics, best params, CV score) per trained
# tree-based model; stays empty when neither library is installed.
tree_results = []

# Train LightGBM
if LIGHTGBM_AVAILABLE:
    print("Training LightGBM...")
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default subsample_freq=0 the `subsample` values in
    # the search space would be ignored. subsample_freq=1 re-samples rows
    # at every boosting iteration, so `subsample` is actually tuned.
    lgb_model = lgb.LGBMRegressor(random_state=42, verbose=-1, subsample_freq=1)

    # 30 random parameter draws, each scored with 5-fold CV on negated MSE
    lgb_search = RandomizedSearchCV(
        lgb_model, lgb_params,
        n_iter=30, cv=5, scoring='neg_mean_squared_error',
        n_jobs=-1, random_state=42, verbose=1
    )
    lgb_search.fit(X_train, y_train)

    # best_score_ is a negated MSE, so flip the sign before reporting it
    print(f"Best LightGBM params: {lgb_search.best_params_}")
    print(f"Best LightGBM CV score: {-lgb_search.best_score_:.6f}")

    # Evaluate on the held-out test set
    lgb_pred = lgb_search.predict(X_test)
    lgb_metrics = calculate_metrics(y_test, lgb_pred, "LightGBM")
    lgb_metrics.update({
        'Best_Params': lgb_search.best_params_,
        'CV_Score': -lgb_search.best_score_
    })
    tree_results.append(lgb_metrics)
else:
    print("⚠️ LightGBM not available")

# Train XGBoost
if XGBOOST_AVAILABLE:
    print("\nTraining XGBoost...")
    xgb_model = xgb.XGBRegressor(random_state=42, verbosity=0)

    # Same search budget and scoring as the LightGBM search above
    xgb_search = RandomizedSearchCV(
        xgb_model, xgb_params,
        n_iter=30, cv=5, scoring='neg_mean_squared_error',
        n_jobs=-1, random_state=42, verbose=1
    )
    xgb_search.fit(X_train, y_train)

    print(f"Best XGBoost params: {xgb_search.best_params_}")
    print(f"Best XGBoost CV score: {-xgb_search.best_score_:.6f}")

    # Evaluate on the held-out test set
    xgb_pred = xgb_search.predict(X_test)
    xgb_metrics = calculate_metrics(y_test, xgb_pred, "XGBoost")
    xgb_metrics.update({
        'Best_Params': xgb_search.best_params_,
        'CV_Score': -xgb_search.best_score_
    })
    tree_results.append(xgb_metrics)
else:
    print("⚠️ XGBoost not available")


In [None]:
# Show the tree-based leaderboard, or say why there is nothing to show
if not tree_results:
    print("\n⚠️ No tree-based models trained (dependencies not available)")
else:
    tree_df = pd.DataFrame(tree_results)
    _tree_summary_columns = ['Model', 'RMSE', 'MAE', 'R²', 'CV_Score']
    print("\n📊 Tree-Based Models Results:")
    print(tree_df[_tree_summary_columns].round(4))


## 3. Model Comparison & Analysis


In [None]:
# Combine all results
all_results = linear_results + tree_results
results_df = pd.DataFrame(all_results)

# Rank by cross-validation score (MSE on the training folds, lower is better).
# The winner is chosen from CV only, so the test-set metrics shown next to it
# remain an unbiased estimate instead of being the selection criterion.
results_df = results_df.sort_values('CV_Score').reset_index(drop=True)

print("\n🏆 FINAL MODEL COMPARISON:")
print("=" * 60)
print(results_df[['Model', 'RMSE', 'MAE', 'R²', 'CV_Score']].round(4))

# Identify best model: the first row after sorting. RMSE and R² reported for
# it are its test-set values.
best_model_name = results_df.iloc[0]['Model']
best_rmse = results_df.iloc[0]['RMSE']
best_r2 = results_df.iloc[0]['R²']

print(f"\n🥇 BEST MODEL: {best_model_name}")
print(f"   RMSE: {best_rmse:.4f}")
print(f"   R²: {best_r2:.4f}")


In [None]:
# Visualization: one bar chart per metric, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Each entry: (target axes, results_df column, bar colour, title, y label)
_panel_specs = [
    (axes[0, 0], 'RMSE', 'skyblue',
     'RMSE Comparison (Lower is Better)', 'RMSE'),
    (axes[0, 1], 'R²', 'lightgreen',
     'R² Comparison (Higher is Better)', 'R²'),
    (axes[1, 0], 'MAE', 'orange',
     'MAE Comparison (Lower is Better)', 'MAE'),
    (axes[1, 1], 'CV_Score', 'pink',
     'Cross-Validation Score (Lower is Better)', 'CV Score (MSE)'),
]

for _panel, _column, _colour, _title, _y_label in _panel_specs:
    _panel.bar(results_df['Model'], results_df[_column], color=_colour, alpha=0.7)
    _panel.set_title(_title, fontsize=12, fontweight='bold')
    _panel.set_ylabel(_y_label)
    # Tilt the model names so they do not overlap
    _panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Feature importance for whichever tree-based searches were actually run.
# The `in locals()` guard skips a model whose training cell was not executed.
if LIGHTGBM_AVAILABLE and 'lgb_search' in locals():
    print("\n🔍 LightGBM Feature Importance (Top 20):")
    feature_names = X_train.columns
    feature_importance = lgb_search.best_estimator_.feature_importances_

    # Pair each feature with its score, most important first
    importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importance}
    ).sort_values('importance', ascending=False)

    top_features = importance_df.head(20)
    print(top_features)

    # Horizontal bar chart of the top 20
    bar_positions = range(len(top_features))
    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, top_features['importance'])
    plt.yticks(bar_positions, top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('LightGBM Feature Importance (Top 20)', fontweight='bold')
    # Flip the axis so the highest-ranked feature is drawn at the top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

if XGBOOST_AVAILABLE and 'xgb_search' in locals():
    print("\n🔍 XGBoost Feature Importance (Top 20):")
    feature_names = X_train.columns
    feature_importance = xgb_search.best_estimator_.feature_importances_

    # Same ranking table as above; XGBoost gets the table only, no chart
    importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importance}
    ).sort_values('importance', ascending=False)

    print(importance_df.head(20))


In [None]:
# Residual Analysis for best model
def plot_residuals(y_true, y_pred, model_name, ax):
    """Draw a residuals-versus-predictions scatter plot on ``ax``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Name used as the prefix of the plot title.
        ax: Axes-like object that receives the drawing calls.
    """
    # Residual = observed minus predicted
    errors = y_true - y_pred

    ax.scatter(y_pred, errors, alpha=0.6)
    # Dashed zero line marks where a perfect prediction would fall
    ax.axhline(y=0, color='red', linestyle='--')

    ax.set_title(f'{model_name} Residuals')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.grid(True, alpha=0.3)

# Get the stored test-set predictions that belong to the best model
if best_model_name == 'Ridge':
    best_predictions = ridge_pred
elif best_model_name == 'Lasso':
    best_predictions = lasso_pred
elif best_model_name == 'ElasticNet':
    best_predictions = elasticnet_pred
elif best_model_name == 'LightGBM' and LIGHTGBM_AVAILABLE:
    best_predictions = lgb_pred
elif best_model_name == 'XGBoost' and XGBOOST_AVAILABLE:
    best_predictions = xgb_pred
else:
    # Fail here with a clear message instead of hitting a NameError (or
    # silently reusing predictions left over from an earlier run) below.
    raise ValueError(
        f"No stored predictions for best model {best_model_name!r}"
    )

# Two side-by-side diagnostics for the best model
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left: residuals against predicted values
plot_residuals(y_test, best_predictions, best_model_name, axes[0])

# Right: actual vs predicted, with the dashed y = x reference line spanning
# the observed target range (points on the line are perfect predictions)
axes[1].scatter(y_test, best_predictions, alpha=0.6)
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[1].set_xlabel('Actual Values')
axes[1].set_ylabel('Predicted Values')
axes[1].set_title(f'{best_model_name} Actual vs Predicted')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Persist the comparison table and the winning configuration to ../models
import json
from pathlib import Path

# Output folder; created on first run, reused afterwards
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Full leaderboard as CSV
comparison_path = models_dir / 'model_comparison.csv'
results_df.to_csv(comparison_path, index=False)

# The first row of results_df is the best model; convert its metrics to
# plain floats so they can be serialised as JSON
best_model_info = results_df.iloc[0]
performance = {
    metric: float(best_model_info[metric])
    for metric in ('RMSE', 'MAE', 'R²', 'CV_Score')
}
hyperparams = {
    'best_model': best_model_name,
    'best_params': best_model_info['Best_Params'],
    'performance': performance,
}

config_path = models_dir / 'best_model_config.json'
with open(config_path, 'w') as f:
    json.dump(hyperparams, f, indent=2)

print(f"\n💾 Results saved to {models_dir}/")
for _saved_file in ("model_comparison.csv", "best_model_config.json"):
    print(f"   - {_saved_file}")

print("\n🎯 Next Steps:")
print(f"   1. Best model identified: {best_model_name}")
print("   2. Implement ModelTrainer class in src/Modeling.py")
print("   3. Update app.py to integrate modeling pipeline")
print("   4. Generate comprehensive ModelReport.md")
