In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Model Libraries
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Notebook-wide display settings
sns.set_style(style="whitegrid")  # seaborn theme used by every plot below
warnings.filterwarnings(action='ignore')  # silences ALL warnings for the rest of the session

In [None]:
# --- LOAD THE DATA ---
# Every input is a pre-processed Parquet file living under this directory.
data_path = '../data/processed/'

print("‚è≥ Loading data from Parquet files...")

# Feature matrices (train first, then test)
X_train = pd.read_parquet(data_path + 'X_train_processed.parquet')
X_test = pd.read_parquet(data_path + 'X_test_processed.parquet')

# Targets: each file is read as a frame and reduced to its 'Price' column (a Series).
# The files are read in the listed order: train, then test.
y_train_log, y_test_log = (
    pd.read_parquet(data_path + target_file)['Price']
    for target_file in ('y_train_log.parquet', 'y_test_log.parquet')
)

# Dollar-scale test target, used later to report errors in real money.
# expm1 is the inverse of log1p -- assumes the targets were log1p-transformed upstream (TODO confirm).
y_test_real = np.expm1(y_test_log)

print("‚úÖ Data Loaded Successfully.")
for shape_line in (
    f"   Training Data Shape: {X_train.shape}",
    f"   Test Data Shape:     {X_test.shape}",
):
    print(shape_line)

In [None]:
# --- DEFINE MODEL DICTIONARY ---
# One candidate per algorithm "family", keyed by the name shown on the leaderboard.

# Settings shared by both gradient-boosting candidates:
# many trees combined with a small learning rate, all cores, fixed seed.
boosting_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)

models = {
    # Baseline: plain linear fit
    "Linear Regression": LinearRegression(),

    # Bagging ensemble of trees
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),

    # Gradient boosting (XGBoost implementation)
    "XGBoost": XGBRegressor(**boosting_params),

    # Gradient boosting (LightGBM implementation); verbose=-1 is passed to quiet its logging
    "LightGBM": LGBMRegressor(**boosting_params, verbose=-1),
}

In [None]:
from sklearn.model_selection import cross_val_score

# --- TRAINING LOOP (CROSS-VALIDATION) ---
results = []          # one leaderboard row (dict) per candidate
trained_models = {}   # display name -> estimator fitted on the full training set

print("üöÄ Starting Model Selection with Cross-Validation...\n")

for name, model in models.items():
    print(f"   Evaluating {name}...")

    # Step A: 5-fold cross-validation on the training data only.
    # The scorer returns NEGATIVE MAE (sklearn scorers are "higher is better"),
    # and the target is the log price, so each fold score is a negated log-scale MAE.
    cv_scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train_log,
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
    )

    # Flip the sign so the averaged error is a positive number.
    avg_log_mae = -cv_scores.mean()

    # Step B: fit on the whole training set; later cells predict with this instance.
    # Note: this fits the very object stored in `models` (no copy is made).
    model.fit(X_train, y_train_log)
    trained_models[name] = model

    # Step C: record the leaderboard row.
    row = {"Model": name, "CV MAE (Log)": avg_log_mae}
    results.append(row)

print("\n‚úÖ Cross-Validation Complete.")

In [None]:
# --- DISPLAY RESULTS ---
# Leaderboard: one row per model, best (lowest CV error) first.
results_df = pd.DataFrame(results).sort_values(by="CV MAE (Log)", ascending=True)

print("\nüèÜ MODEL SELECTION LEADERBOARD (Based on CV) üèÜ")
# Style the dataframe for easy reading.
# Greens_r (reversed Greens) because lower error is better.
display(results_df.style.background_gradient(cmap="Greens_r", subset=["CV MAE (Log)"]))

# --- VISUALIZATION ---
# Bar chart of the CV error per model (lower is better).
fig, ax = plt.subplots(figsize=(12, 6))

# `palette` without `hue` is deprecated in seaborn 0.13 and scheduled for removal in 0.14
# (the FutureWarning is hidden by the global warnings filter). Colouring by the x variable
# via `hue` gives the same one-colour-per-bar result; dodge=False keeps bars full width
# and centred on older seaborn versions, where hue would otherwise dodge them.
sns.barplot(
    data=results_df,
    x="Model",
    y="CV MAE (Log)",
    hue="Model",
    palette="viridis",
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the x tick labels. Remove it after the fact instead of
# passing legend=False, which older seaborn versions do not accept.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

ax.set_title("Model Comparison: Cross-Validation Error (Log Scale)")
ax.set_ylabel("Mean Absolute Error (Log Units)")
ax.set_xlabel("Model Name")
plt.show()

In [None]:
# --- FINAL EVALUATION OF THE WINNER ---
# results_df is sorted by ascending CV error, so its first row holds the winner.
best_model_name = results_df['Model'].iloc[0]
best_model = trained_models[best_model_name]

print(f"üåü The Winner (based on CV) is: {best_model_name}")

# Only the winner is scored against the held-out test set.
y_pred_log = best_model.predict(X_test)   # predictions on the log scale
y_pred_final = np.expm1(y_pred_log)       # back to dollars

# Dollar-scale metrics: MAE first, then R2.
final_mae, final_r2 = (
    metric(y_test_real, y_pred_final)
    for metric in (mean_absolute_error, r2_score)
)

print(f"üí∞ Final Test Set MAE: ${final_mae:,.0f}")
print(f"üìä Final Test Set R2:  {final_r2:.4f}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone

# --- HYPERPARAMETER TUNING ---
# Runs a randomized search over a per-family parameter grid for the CV winner only,
# then reports the tuned model's dollar-scale test MAE next to the untuned one.
print(f"üîß Starting Hyperparameter Tuning for: {best_model_name}")

# 1. Define Parameter Grids for each model family
#    (Linear Regression has no entry, so it is skipped below.)
param_grids = {
    "Random Forest": {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "XGBoost": {
        'n_estimators': [500, 1000, 2000],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    },
    "LightGBM": {
        'n_estimators': [500, 1000, 2000],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50, 100],
        'max_depth': [-1, 10, 20],
        'subsample': [0.7, 0.8, 0.9],
        # LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
        # (bagging_freq) is non-zero; its default is 0. Without this entry the
        # `subsample` values above would be ignored and the search would spend
        # iterations on effectively duplicate candidates.
        'subsample_freq': [1]
    }
}

# 2. Select the Grid
if best_model_name in param_grids:
    param_grid = param_grids[best_model_name]
    # Unfitted copy with the same constructor parameters as the original candidate,
    # so the search does not touch the already-fitted instance.
    base_model = clone(models[best_model_name])

    # 3. Setup Randomized Search
    # n_iter=20 means we try 20 random combinations (adjust for speed vs accuracy)
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_grid,
        n_iter=20,
        scoring='neg_mean_absolute_error', # Optimize for MAE (on the log-price target)
        cv=3, # 3-Fold Cross Validation
        verbose=1,
        random_state=42,
        n_jobs=-1
    )

    # 4. Run Search (training data only)
    print("   Running RandomizedSearchCV (this may take a while)...")
    random_search.fit(X_train, y_train_log)

    # 5. Get Best Results
    # best_estimator_ is the best parameter combination refit on all of X_train.
    tuned_model = random_search.best_estimator_
    print(f"\n‚úÖ Tuning Complete. Best Params: {random_search.best_params_}")

    # 6. Evaluate Tuned Model on the test set, in dollars
    y_pred_log_tuned = tuned_model.predict(X_test)
    y_pred_tuned = np.expm1(y_pred_log_tuned)

    mae_tuned = mean_absolute_error(y_test_real, y_pred_tuned)

    # NOTE(review): this compares both models on the test set; if the comparison is used
    # to pick the final model, the reported test MAE becomes an optimistic estimate.
    print(f"\nüìä Performance Comparison:")
    print(f"   Original {best_model_name} MAE: ${final_mae:,.0f}")
    print(f"   Tuned {best_model_name} MAE:    ${mae_tuned:,.0f}")

    if mae_tuned < final_mae:
        print("üéâ Improvement! The tuned model is better.")
    else:
        print("ü§∑ No improvement. The default parameters were already quite good.")

else:
    print(f"Skipping tuning for {best_model_name} (No grid defined or Linear Regression).")

In [None]:
import joblib
import os

# --- SAVE THE BEST MODEL ---
print("üíæ Saving the best model...")

# Make sure the output directory exists (no error if it already does).
model_dir = '../models/'
os.makedirs(model_dir, exist_ok=True)

# Pick between the tuned and the original winner by their test-set MAE.
# The tuning cell defines `tuned_model` / `mae_tuned` only when it actually ran a search,
# so first check that both names exist before comparing scores.
tuning_ran = {'tuned_model', 'mae_tuned'}.issubset(locals())

if not (tuning_ran and mae_tuned < final_mae):
    # No tuned candidate, or tuning did not lower the test MAE: keep the original.
    final_model = best_model
    final_name = best_model_name
    print(f"   Selecting Original {best_model_name} (MAE: ${final_mae:,.0f})")
else:
    final_model = tuned_model
    final_name = f"{best_model_name}_tuned"
    print(f"   Selecting Tuned {best_model_name} (MAE: ${mae_tuned:,.0f})")

# Persist the chosen estimator. It was fit on the log-price target, so anything that
# loads it must apply np.expm1 to its predictions to get dollars.
save_path = f'{model_dir}house_price_model.joblib'
joblib.dump(final_model, save_path)

print(f"‚úÖ Model saved successfully to: {save_path}")