# House Price Prediction - Model Training
## Building and Training the Regression Model

### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plot defaults: seaborn's 'whitegrid' style and a (12, 6)
# default figure size for matplotlib.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

### 2. Load Cleaned Data

In [None]:
# Read the cleaned dataset from a CSV file (path is relative to the current
# working directory).
df = pd.read_csv('house_data_cleaned.csv')

# Confirm the load, report the frame's shape, then preview the first rows
# (the trailing expression is what the notebook cell displays).
print("Dataset loaded successfully!", f"Shape: {df.shape}", sep="\n")
df.head()

### 3. Feature Engineering

In [None]:
# Work on a copy so the loaded frame `df` is left untouched.
df_model = df.copy()

# Derived features:
#   Age           - 2025 (fixed reference year) minus YearBuilt
#   TotalRooms    - Bedrooms + Bathrooms
#   Area_per_Room - Area divided by TotalRooms
df_model['Age'] = 2025 - df_model['YearBuilt']
df_model['TotalRooms'] = df_model['Bedrooms'] + df_model['Bathrooms']

# A row with zero total rooms would make Area / TotalRooms infinite (or NaN
# when Area is zero as well), and the StandardScaler step further down
# rejects non-finite input. Such rows are treated as having a single room so
# the ratio stays finite; rows with a non-zero room count are computed
# exactly as before.
df_model['Area_per_Room'] = (
    df_model['Area'] / df_model['TotalRooms'].replace(0, 1)
)

print("New features created:")
print("- Age (house age)")
print("- TotalRooms (bedrooms + bathrooms)")
print("- Area_per_Room")

# Preview the frame with the new columns (displayed by the notebook cell).
df_model.head()

### 4. Encode Categorical Variables

In [None]:
# Columns with object dtype are treated as categorical.
categorical_cols = list(df_model.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}")

# Fit one LabelEncoder per categorical column, overwrite the column with its
# integer codes, and keep the fitted encoder keyed by column name.
label_encoders = {}

for column_name in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df_model[column_name])
    df_model[column_name] = encoded_values
    label_encoders[column_name] = encoder
    print(f"\nEncoded {column_name}:", f"Classes: {encoder.classes_}", sep="\n")

# Persist the fitted encoders so the same mapping can be reused later.
joblib.dump(label_encoders, 'label_encoders.pkl')
print("\nLabel encoders saved as 'label_encoders.pkl'")

### 5. Prepare Features and Target

In [None]:
# Target is the 'Price' column; every other column of df_model is a feature.
y = df_model['Price']
X = df_model.drop(columns='Price')

print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")
print(f"\nFeature columns: {list(X.columns)}")

### 6. Split Data into Train and Test Sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"Training set size: {X_train.shape[0]} samples",
    f"Test set size: {X_test.shape[0]} samples",
    f"\nTrain-Test split ratio: 80-20",
    sep="\n",
)

### 7. Feature Scaling

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the identical transform can be reapplied later.
joblib.dump(scaler, 'scaler.pkl')
print(
    "Features scaled using StandardScaler",
    "Scaler saved as 'scaler.pkl'",
    sep="\n",
)

### 8. Train Multiple Models

In [None]:
# Candidate regressors keyed by display name. The two ensemble models use
# 100 estimators and a fixed random_state.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Metrics per model name; each value maps a metric label to its score.
results = {}

for name, model in models.items():
    print(f"\n{'='*60}", f"Training {name}...", '='*60, sep="\n")

    # Fit on the scaled training features.
    model.fit(X_train_scaled, y_train)

    # Score the fitted model on the training split first, then the test
    # split, collecting (MSE, RMSE, MAE, R²) for each.
    per_split = []
    for features, target in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
        predicted = model.predict(features)
        mse = mean_squared_error(target, predicted)
        per_split.append((
            mse,
            np.sqrt(mse),  # RMSE is the square root of the MSE
            mean_absolute_error(target, predicted),
            r2_score(target, predicted),
        ))
    (train_mse, train_rmse, train_mae, train_r2), \
        (test_mse, test_rmse, test_mae, test_r2) = per_split

    # Key order here determines the column order of the comparison table
    # built from `results`.
    results[name] = {
        'Train MSE': train_mse,
        'Test MSE': test_mse,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train MAE': train_mae,
        'Test MAE': test_mae,
        'Train R²': train_r2,
        'Test R²': test_r2,
    }

    print(
        f"\nTraining Metrics:",
        f"  MSE:  {train_mse:,.2f}",
        f"  RMSE: {train_rmse:,.2f}",
        f"  MAE:  {train_mae:,.2f}",
        f"  R²:   {train_r2:.4f}",
        sep="\n",
    )
    print(
        f"\nTest Metrics:",
        f"  MSE:  {test_mse:,.2f}",
        f"  RMSE: {test_rmse:,.2f}",
        f"  MAE:  {test_mae:,.2f}",
        f"  R²:   {test_r2:.4f}",
        sep="\n",
    )

### 9. Compare Model Performance

In [None]:
# One row per model, one column per metric: `results` is keyed by model
# name, so the frame built from it is transposed.
results_df = pd.DataFrame(results).transpose()
print("\nModel Comparison:", results_df, sep="\n")

In [None]:
# 2x2 grid: three test-set bar charts (R², RMSE, MAE) plus a grouped
# train-vs-test R² chart in the bottom-right panel.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, results_df column, panel title, y-axis label) for the test-set panels.
test_panels = [
    (axes[0, 0], 'Test R²', 'R² Score Comparison (Test Set)', 'R² Score'),
    (axes[0, 1], 'Test RMSE', 'RMSE Comparison (Test Set)', 'RMSE'),
    (axes[1, 0], 'Test MAE', 'MAE Comparison (Test Set)', 'MAE'),
]
for panel, metric, panel_title, y_label in test_panels:
    panel.bar(results_df.index, results_df[metric], color=['blue', 'green', 'orange'])
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_ylabel(y_label)
    panel.tick_params(axis='x', rotation=45)

# Only the R² panel gets a fixed 0-1 y-range.
axes[0, 0].set_ylim([0, 1])

# Grouped bars: train and test R² side by side for each model.
overfit_panel = axes[1, 1]
positions = np.arange(len(results_df.index))
bar_width = 0.35
grouped_series = (
    (-bar_width / 2, 'Train R²', 'Train', 'lightblue'),
    (bar_width / 2, 'Test R²', 'Test', 'lightcoral'),
)
for offset, metric, legend_label, bar_color in grouped_series:
    overfit_panel.bar(
        positions + offset,
        results_df[metric],
        bar_width,
        label=legend_label,
        color=bar_color,
    )
overfit_panel.set_title('Train vs Test R² Score', fontsize=12, fontweight='bold')
overfit_panel.set_ylabel('R² Score')
overfit_panel.set_xticks(positions)
overfit_panel.set_xticklabels(results_df.index)
overfit_panel.tick_params(axis='x', rotation=45)
overfit_panel.legend()

plt.tight_layout()
plt.show()

### 10. Select Best Model and Save

In [None]:
# The winner is the model with the highest R² on the test split.
best_model_name = results_df['Test R²'].idxmax()
best_model = models[best_model_name]
best_scores = results_df.loc[best_model_name]

print(
    f"\n{'='*60}",
    f"BEST MODEL: {best_model_name}",
    '='*60,
    f"Test R² Score: {best_scores['Test R²']:.4f}",
    f"Test RMSE: {best_scores['Test RMSE']:,.2f}",
    f"Test MAE: {best_scores['Test MAE']:,.2f}",
    sep="\n",
)

# Persist the fitted winner (it was trained on the scaled features).
joblib.dump(best_model, 'house_price_model.pkl')
print(f"\nBest model saved as 'house_price_model.pkl'")

### 11. Feature Importance (for tree-based models)

In [None]:
# Rank the features by how the selected model weights them. Models exposing
# `feature_importances_` are ranked by importance; otherwise the model's
# `coef_` values are used. Both cases share the table and bar-chart code.
if hasattr(best_model, 'feature_importances_'):
    value_column = 'Importance'
    feature_weights = best_model.feature_importances_
    bar_palette = 'viridis'
    chart_title = f'Feature Importance - {best_model_name}'
    print("\nFeature Importance:")
else:
    value_column = 'Coefficient'
    bar_palette = 'coolwarm'
    chart_title = 'Feature Coefficients - Linear Regression'
    print("\nFeature importance not available for Linear Regression")
    print("Showing coefficients instead:")
    feature_weights = best_model.coef_

# One row per feature, largest value first.
ranked_features = pd.DataFrame({
    'Feature': X.columns,
    value_column: feature_weights,
}).sort_values(value_column, ascending=False)
print(ranked_features)

# Horizontal bar chart of the ranked values.
plt.figure(figsize=(10, 6))
sns.barplot(x=value_column, y='Feature', data=ranked_features, palette=bar_palette)
plt.title(chart_title, fontsize=14, fontweight='bold')
plt.xlabel(value_column)
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

### 12. Prediction Analysis

In [None]:
# Score the scaled test features with the selected model.
y_pred = best_model.predict(X_test_scaled)

# Per-sample error table: signed difference (actual - predicted), its
# magnitude, and that magnitude relative to the actual price in percent.
actual_prices = y_test.values
signed_error = actual_prices - y_pred
prediction_df = pd.DataFrame({
    'Actual Price': actual_prices,
    'Predicted Price': y_pred,
    'Difference': signed_error,
    'Absolute Error': np.abs(signed_error),
    'Percentage Error': np.abs(signed_error / actual_prices) * 100,
})

print("Sample Predictions:", prediction_df.head(10), sep="\n")

In [None]:
# Two side-by-side diagnostics for the test-set predictions.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
scatter_ax, residual_ax = axes

# Left: actual vs predicted, with a dashed red y = x reference line spanning
# the range of actual prices.
lowest_price, highest_price = y_test.min(), y_test.max()
scatter_ax.scatter(y_test, y_pred, alpha=0.5)
scatter_ax.plot(
    [lowest_price, highest_price],
    [lowest_price, highest_price],
    'r--',
    lw=2,
)
scatter_ax.set_xlabel('Actual Price', fontsize=12)
scatter_ax.set_ylabel('Predicted Price', fontsize=12)
scatter_ax.set_title('Actual vs Predicted Prices', fontsize=14, fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

# Right: residuals (actual - predicted) against the predicted price, with a
# dashed red zero line.
residuals = y_test - y_pred
residual_ax.scatter(y_pred, residuals, alpha=0.5)
residual_ax.axhline(y=0, color='r', linestyle='--', lw=2)
residual_ax.set_xlabel('Predicted Price', fontsize=12)
residual_ax.set_ylabel('Residuals', fontsize=12)
residual_ax.set_title('Residual Plot', fontsize=14, fontweight='bold')
residual_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Histograms (30 bins each) of the absolute and percentage errors stored in
# prediction_df, drawn side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (prediction_df column, bar colour, x-axis label, panel title) per panel.
histogram_specs = [
    ('Absolute Error', 'skyblue', 'Absolute Error',
     'Distribution of Absolute Errors'),
    ('Percentage Error', 'lightcoral', 'Percentage Error (%)',
     'Distribution of Percentage Errors'),
]
for panel, (column, bar_color, x_label, panel_title) in zip(axes, histogram_specs):
    panel.hist(prediction_df[column], bins=30, color=bar_color, edgecolor='black')
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_ylabel('Frequency', fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 13. Save Feature Names

In [None]:
# Persist the ordered feature column names for the app, matching the column
# order of X that the scaler and models were fitted on.
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')
print(f"Feature names saved: {feature_names}")

### 14. Summary

In [None]:
# Final report: dataset and split sizes, the feature list, the selected
# model's test metrics, the mean of the per-sample percentage errors, and
# the artefact files written earlier in the notebook.
best_scores = results_df.loc[best_model_name]
summary_lines = [
    "\n" + "="*70,
    "MODEL TRAINING SUMMARY",
    "="*70,
    f"\nDataset: {len(df)} samples",
    f"Training set: {len(X_train)} samples",
    f"Test set: {len(X_test)} samples",
    f"\nNumber of features: {X.shape[1]}",
    f"Features: {list(X.columns)}",
    f"\nBest Model: {best_model_name}",
    f"\nPerformance Metrics:",
    f"  R² Score: {best_scores['Test R²']:.4f}",
    f"  RMSE: ${best_scores['Test RMSE']:,.2f}",
    f"  MAE: ${best_scores['Test MAE']:,.2f}",
    f"\nMean Absolute Percentage Error: {prediction_df['Percentage Error'].mean():.2f}%",
    f"\nFiles Saved:",
    "  - house_price_model.pkl (trained model)",
    "  - scaler.pkl (feature scaler)",
    "  - label_encoders.pkl (categorical encoders)",
    "  - feature_names.pkl (feature names)",
    "\n" + "="*70,
]
print(*summary_lines, sep="\n")