# House Price Prediction - Model Training and Evaluation

This notebook demonstrates the complete model training and evaluation pipeline using the project's modular components from `../src`: the data generator, the data preprocessor, and the model trainer.

## Table of Contents
1. [Setup and Data Loading](#setup)
2. [Data Preprocessing](#preprocessing)
3. [Model Training](#training)
4. [Model Evaluation](#evaluation)
5. [Model Comparison](#comparison)
6. [Feature Importance Analysis](#feature-importance)
7. [Results Summary](#summary)

In [None]:
# Import necessary libraries
# Notebook bootstrap: third-party imports, project imports from ../src, and
# plotting defaults. Statement order matters here (see the notes below).
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Add src directory to path
# The path is relative, so the imports below only resolve when the notebook
# runs from a directory that has ../src next to it.
sys.path.append('../src')

# Import custom modules
# These must come after the sys.path.append call above.
from data_generator import HousePriceDataGenerator
from data_preprocessor import HousePricePreprocessor
from model_trainer import HousePriceModelTrainer

# Set style
# NOTE(review): 'seaborn-v0_8' is the style name used by recent matplotlib
# releases; older versions may not ship it - confirm the minimum version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All modules imported successfully!")

## 1. Setup and Data Loading {#setup}

In [None]:
# Reuse the dataset on disk when it is there; otherwise generate it first.
data_path = '../data/house_data_more_logic.xlsx'
data_file = Path(data_path)

if data_file.exists():
    print("📁 Dataset already exists, loading...")
else:
    print("📊 Generating synthetic dataset...")
    generator = HousePriceDataGenerator(random_seed=42)
    dataset = generator.generate_dataset(
        num_samples=300,
        save_path=data_path,
    )
    print("✅ Dataset generated successfully!")

# Either way, the file at data_path is loaded through the preprocessor.
preprocessor = HousePricePreprocessor()
data = preprocessor.load_data(data_path)

print(f"\nDataset shape: {data.shape}")
# Last expression of the cell, so the notebook renders the preview.
data.head()

## 2. Data Preprocessing {#preprocessing}

In [None]:
# Ask the preprocessor for its quality report and print the headline numbers.
quality_report = preprocessor.check_data_quality(data)

print("Data Quality Report:")
print(f"Shape: {quality_report['shape']}")
# 'missing_values' is summed over its .values() - presumably per-column counts.
missing_total = sum(quality_report['missing_values'].values())
print(f"Missing values: {missing_total}")
print(f"Duplicates: {quality_report['duplicates']}")

In [None]:
# Clean the raw frame, then derive the feature matrix X and the target y
# (engineered features are requested via include_engineered=True).
data_clean = preprocessor.handle_missing_values(data)
X, y = preprocessor.prepare_features(data_clean, include_engineered=True)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nFeature columns:")
# Number the columns from 1 for readability.
for position, column in enumerate(X.columns, start=1):
    print(f"{position:2d}. {column}")

In [None]:
# Run the preprocessor's fit_transform on the features, then split the result
# with test_size=0.2 and a fixed random_state.
# NOTE(review): fit_transform is applied to the full feature set before the
# split. If it learns statistics from the data (e.g. scaling), the test rows
# influence the fit - confirm, and consider splitting first.
X_processed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = preprocessor.split_data(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
n_processed_features = X_processed.shape[1]
print(f"Processed features: {n_processed_features}")

## 3. Model Training {#training}

In [None]:
# Create the trainer and list the model names it exposes.
trainer = HousePriceModelTrainer()

print("Available models:")
# Iterating available_models yields its keys (it is used as a mapping here).
for number, name in enumerate(trainer.available_models, start=1):
    print(f"{number}. {name}")

In [None]:
# Fit every model on the training split; hyperparameter tuning is switched
# off to keep this cell fast.
print("🤖 Training all models...")
models = trainer.train_all_models(
    X_train,
    y_train,
    tune_hyperparameters=False,
)
trained_count = len(models)
print(f"\n✅ Successfully trained {trained_count} models!")

## 4. Model Evaluation {#evaluation}

In [None]:
# Score every trained model on the held-out split.
print("📈 Evaluating all models...")
results_df = trainer.evaluate_all_models(X_test, y_test)

print("\n🏆 Model Performance Results:")
print("=" * 80)

# Format a copy for display so results_df keeps its numeric values.
# Dollar metrics are rounded to whole units with thousands separators.
metric_formats = {
    'rmse': "${:,.0f}".format,
    'mae': "${:,.0f}".format,
    'r2': "{:.4f}".format,
    'mape': "{:.2f}%".format,
}
results_display = results_df.copy()
for metric, render in metric_formats.items():
    results_display[metric] = results_display[metric].apply(render)

print(results_display)

In [None]:
# Cross-validate the models on the training split with 5 folds.
print("🔄 Performing cross-validation...")
cv_results = trainer.cross_validate_models(X_train, y_train, cv_folds=5)

print("\n📊 Cross-Validation Results:")
print("=" * 50)

# Format a copy for display so cv_results keeps its numeric values.
as_dollars = "${:,.0f}".format
cv_display = cv_results.copy()
for rmse_column in ('CV_RMSE_Mean', 'CV_RMSE_Std'):
    cv_display[rmse_column] = cv_display[rmse_column].apply(as_dollars)
print(cv_display)

## 5. Model Comparison {#comparison}

In [None]:
# Render the trainer's model-comparison plot on a 15x10 figure.
comparison_figsize = (15, 10)
trainer.plot_model_comparison(figsize=comparison_figsize)

In [None]:
# Announce which model the trainer marks as best, then plot predictions
# on the test split.
best_name = trainer.best_model_name
print(f"📊 Plotting predictions for best model: {best_name}")
trainer.plot_predictions(
    X_test,
    y_test,
    figsize=(15, 10),
)

## 6. Feature Importance Analysis {#feature-importance}

In [None]:
# Best-effort feature-importance report: any failure is reported with a
# message instead of stopping the notebook.
try:
    feature_names = preprocessor.get_feature_names()
    print(f"Total features after preprocessing: {len(feature_names)}")

    # Only these two models are inspected; models the trainer does not hold
    # are skipped.
    tree_models = ['random_forest', 'gradient_boosting']

    for model_name in tree_models:
        if model_name not in trainer.models:
            continue

        print(f"\n🌳 Feature importance for {model_name}:")
        importance_df = trainer.get_feature_importance(
            feature_names, model_name, top_n=15
        )
        # A None result means there is nothing to plot or list for this model.
        if importance_df is None:
            continue

        trainer.plot_feature_importance(
            feature_names, model_name, top_n=15, figsize=(10, 8)
        )

        # List the first ten rows of the importance table, numbered from 1.
        print("\nTop 10 Most Important Features:")
        leading_rows = importance_df.head(10)
        for rank, (_, row) in enumerate(leading_rows.iterrows(), start=1):
            print(f"{rank:2d}. {row['feature']:30s}: {row['importance']:.4f}")

except Exception as exc:
    print(f"Could not analyze feature importance: {exc}")

## 7. Results Summary {#summary}

In [None]:
# Final summary: dataset sizes, number of trained models, and the metrics of
# the model the trainer marks as best. Relies on variables from earlier cells
# (data, X, X_processed, X_train, X_test, trainer).
print("🎯 FINAL RESULTS SUMMARY")
print("=" * 60)

print("📊 Dataset Information:")
print(f"   • Total samples: {len(data)}")
# NOTE(review): X was built with include_engineered=True, so this count
# presumably includes engineered columns - confirm the "Original" label.
print(f"   • Original features: {len(X.columns)}")
print(f"   • Processed features: {X_processed.shape[1]}")
print(f"   • Training samples: {X_train.shape[0]}")
print(f"   • Test samples: {X_test.shape[0]}")

print("\n🤖 Model Training:")
print(f"   • Models trained: {len(trainer.models)}")
print(f"   • Best model: {trainer.best_model_name}")

# Metrics of the best model, looked up by name in trainer.results.
best_metrics = trainer.results[trainer.best_model_name]
print(f"\n🏆 Best Model Performance ({trainer.best_model_name}):")
print(f"   • RMSE: ${best_metrics['rmse']:,.2f}")
print(f"   • R² Score: {best_metrics['r2']:.4f}")
print(f"   • MAE: ${best_metrics['mae']:,.2f}")
print(f"   • MAPE: {best_metrics['mape']:.2f}%")

print("\n📈 Model Interpretation:")
print(f"   • The model explains {best_metrics['r2']*100:.1f}% of price variance")
# The average size of a prediction error is the MAE. RMSE weights large
# errors more heavily and is already reported above under its own name.
print(f"   • Average prediction error: ${best_metrics['mae']:,.0f}")
print(f"   • Typical error percentage: {best_metrics['mape']:.1f}%")

print("\n✅ Analysis completed successfully!")
print("=" * 60)

In [None]:
# Save results for future reference.
# to_csv does not create missing directories, so make sure ../results exists
# first (a no-op when it is already there).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

results_df.to_csv(results_dir / 'model_results.csv')
cv_results.to_csv(results_dir / 'cv_results.csv')
print("💾 Results saved to ../results/ directory")