In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Notebook-wide display settings
# White background with grid lines for every seaborn/matplotlib figure below.
sns.set_style(style="whitegrid")
# Render floats with two decimals whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Load the held-out test set and the trained model from disk.
# Paths are relative to the notebook's working directory; `data_path` must keep
# its trailing slash because file names are appended by plain string formatting.
data_path = '../data/processed/'
model_path = '../models/house_price_model.joblib'


# Test features
X_test = pd.read_parquet(f'{data_path}X_test_processed.parquet')

# Test targets (log scale) -- only the 'Price' column is used
y_test_log = pd.read_parquet(f'{data_path}y_test_log.parquet')['Price']

# Fail fast if features and targets do not line up: later cells pair
# predictions with targets by position, so a row-count mismatch would make
# every downstream metric and plot meaningless.
if len(X_test) != len(y_test_log):
    raise ValueError(
        f"Feature/target row count mismatch: {len(X_test)} rows in X_test "
        f"vs {len(y_test_log)} rows in y_test_log"
    )

# Convert targets back to real dollars.
# expm1 is the inverse of log1p -- assumes the target was log1p-transformed
# when the processed data was written; TODO confirm against the training notebook.
y_test_real = np.expm1(y_test_log)

# Load the trained model.
# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# loading; only point model_path at artifacts produced by a trusted pipeline.
model = joblib.load(model_path)

print(f"Loaded Model: {type(model).__name__}")
print(f"Test Data Shape: {X_test.shape}")

In [None]:
# Score the test set and assemble a per-sample evaluation table

# Raw model output, on the same log scale as y_test_log
y_pred_log = model.predict(X_test)

# Undo the log transform so predictions are comparable with y_test_real
y_pred = np.expm1(y_pred_log)

# One row per test sample. Error is signed (Actual - Predicted), so a positive
# value means the model under-predicted; Abs_Error is its magnitude.
results_df = (
    pd.DataFrame({'Actual': y_test_real, 'Predicted': y_pred})
    .assign(
        Error=lambda frame: frame['Actual'] - frame['Predicted'],
        Abs_Error=lambda frame: frame['Error'].abs(),
    )
)

print("Predictions complete.")
results_df.head(10)

In [None]:
# Headline regression metrics, computed on dollar-scale prices (not log scale)
r2 = r2_score(y_test_real, y_pred)
mae = mean_absolute_error(y_test_real, y_pred)
# RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred))

# Emit the report in one call; each entry lands on its own line.
print(
    "FINAL TEST METRICS",
    f"MAE:  ${mae:,.0f}",
    f"RMSE: ${rmse:,.0f}",
    f"R2:   {r2:.4f}",
    sep="\n",
)

In [None]:
# Scatter of actual vs predicted prices with a y = x reference line
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=results_df,
    x='Actual',
    y='Predicted',
    alpha=0.5,
    color='blue',
    edgecolor=None,
    ax=ax,
)

# Diagonal where predicted == actual; it spans the observed range of actual prices
actual_lo = results_df['Actual'].min()
actual_hi = results_df['Actual'].max()
ax.plot(
    [actual_lo, actual_hi],
    [actual_lo, actual_hi],
    'r--',
    lw=3,
    label='Perfect Prediction',
)

ax.set_title("Actual vs Predicted Prices")
ax.set_xlabel("Actual Price ($)")
ax.set_ylabel("Predicted Price ($)")
ax.legend()
plt.show()

In [None]:
# Residual plot: signed error against the predicted price

# Points should scatter evenly around the zero line; visible structure
# (curves, funnels) would indicate a systematic pattern in the errors.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=results_df,
    x='Predicted',
    y='Error',
    alpha=0.5,
    color='purple',
    edgecolor=None,
    ax=ax,
)
# Zero-error reference line
ax.axhline(0, color='red', linestyle='--', lw=2)

ax.set_title("Residual Plot (Predicted vs Error)")
ax.set_xlabel("Predicted Price ($)")
ax.set_ylabel("Error (Actual - Predicted)")
plt.show()

In [None]:
# Histogram of signed prediction errors with a KDE overlay

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df['Error'], bins=50, kde=True, color='orange', ax=ax)
ax.set_title("Distribution of Prediction Errors")
ax.set_xlabel("Error ($)")
plt.show()

In [None]:
# Bar chart of the 20 largest feature importances

# Only estimators that expose `feature_importances_` can be plotted;
# anything else gets a short message instead of a chart.
if not hasattr(model, 'feature_importances_'):
    print("Model does not support feature importance.")
else:
    importances = model.feature_importances_
    features = X_test.columns

    # Pair each column name with its importance and keep the 20 largest.
    feat_df = (
        pd.DataFrame({'Feature': features, 'Importance': importances})
        .sort_values(by='Importance', ascending=False)
        .head(20)
    )

    fig, ax = plt.subplots(figsize=(10, 8))
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is
    # passed without `hue`; consider hue='Feature', legend=False once the
    # installed seaborn version is confirmed.
    sns.barplot(data=feat_df, y='Feature', x='Importance', palette='magma', ax=ax)
    ax.set_title("Top 20 Features Driving House Prices")
    plt.show()

In [None]:
# Inspect the ten test samples with the largest absolute error
print("\nTOP 10 WORST PREDICTIONS (High Error)")
worst_predictions = results_df.sort_values(by='Abs_Error', ascending=False).head(10)

# Show every column as whole dollars with thousands separators
dollar_columns = ['Actual', 'Predicted', 'Error', 'Abs_Error']
dollar_format = dict.fromkeys(dollar_columns, '${:,.0f}')
display(worst_predictions.style.format(dollar_format))