In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
import os

In [2]:
# Read the house-price CSV into `df`.
# A missing file is reported with a readable hint and then re-raised, so the
# notebook stops here instead of failing later on an undefined `df`.
try:
    df = pd.read_csv(filepath_or_buffer='dataset/house_price.csv')
except FileNotFoundError:
    print("The dataset 'house_price.csv' was not found. Please check the file path.")
    raise

In [3]:
# Integer-encode the categorical columns, keeping each fitted encoder in
# `label_encoders` keyed by column name.
# NOTE: the encoded values overwrite the original columns of `df` in place,
# so re-running this cell fits the encoders on already-encoded values.
label_encoders = {}
for column in ['Location', 'Condition', 'Garage']:
    if column not in df.columns:
        # Absent columns are reported and skipped rather than treated as fatal.
        print(f"Warning: Column '{column}' not found in the dataset.")
        continue

    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

In [4]:
# Split the frame into the feature matrix X and the target vector y.
# 'Id' and 'Price' are removed from the features when present; a missing
# column is tolerated by drop() because of errors='ignore'.
X = df.drop(columns=['Id', 'Price'], errors='ignore')

# The target is mandatory: without a 'Price' column there is nothing to fit.
if 'Price' in df.columns:
    y = df['Price']
else:
    y = None
    print("Error: 'Price' column is missing in the dataset.")
    raise ValueError("'Price' column is required for the target variable.")

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Candidate regressors, keyed by the display name used in the metrics table
# and plots. Insertion order is kept: Random Forest first, then Gradient Boosting.
models = {}
models["Random Forest"] = RandomForestRegressor(random_state=42, n_estimators=100)
models["Gradient Boosting"] = GradientBoostingRegressor(random_state=42)

In [7]:
# Per-model evaluation metrics, filled in by the training loop:
# model name -> {'MSE': ..., 'R2': ...}
performance = dict()

In [8]:
# Make sure the output folder for the saved figures exists.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(name='static/plots', exist_ok=True)

In [9]:
# Fit every candidate on the training split and score it on the held-out
# split, recording MSE and R2 under the model's display name.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    performance[name] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }

In [10]:
# Tabulate the metrics: one row per model, with the model name in a 'Model'
# column followed by the metric columns.
performance_df = (
    pd.DataFrame(performance)
    .transpose()                         # models become rows, metrics become columns
    .reset_index()                       # move the model names out of the index
    .rename(columns={'index': 'Model'})
)

In [11]:
# MSE Bar Plot (Seaborn): one bar per model, saved to static/plots.
plt.figure(figsize=(10, 5))
# Passing `palette` without `hue` is deprecated in seaborn and slated for
# removal in v0.14. Mapping `hue` to the same variable as `x` and turning the
# legend off gives the same per-bar colouring without the warning.
sns.barplot(
    x='Model',
    y='MSE',
    hue='Model',
    data=performance_df,
    palette='coolwarm',
    legend=False,
)
plt.title('Model Comparison - Mean Squared Error (MSE)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('static/plots/mse_bar_plot.png')
# Close the figure so it is not rendered inline and its memory is released.
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Model', y='MSE', data=performance_df, palette='coolwarm')


In [12]:
# Pick the model with the smallest test-set MSE.
# min() returns the first minimal entry, so on a tie the model that was
# inserted into `performance` first wins.
best_model_name, _ = min(performance.items(), key=lambda entry: entry[1]['MSE'])
best_model = models[best_model_name]

In [13]:
# Scatter Plot: Actual vs Predicted Prices for the best model (Seaborn)
y_pred_best = best_model.predict(X_test)

# Compute the price range once with the vectorised Series reductions instead
# of calling the Python built-ins min()/max() (which iterate element by
# element) twice each.
price_min, price_max = y_test.min(), y_test.max()

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_best, alpha=0.7, edgecolor='k')
# Red diagonal from (min, min) to (max, max): points on this line are
# predictions that equal the actual price.
plt.plot([price_min, price_max], [price_min, price_max], color='red', linewidth=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title(f'{best_model_name} - Actual vs Predicted Prices')
plt.tight_layout()
plt.savefig('static/plots/scatter_plot_actual_vs_predicted.png')
# Close the figure so it is not rendered inline and its memory is released.
plt.close()

In [14]:
# KDE Plot: Price distribution (Seaborn)
plt.figure(figsize=(8, 6))
# `shade` is deprecated in favour of `fill` and becomes an error in seaborn
# v0.14; `fill=True` draws the same filled density curve.
sns.kdeplot(df['Price'], fill=True)
plt.title('Price Distribution KDE')
plt.tight_layout()
plt.savefig('static/plots/price_distribution_kde.png')
# Close the figure so it is not rendered inline and its memory is released.
plt.close()


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df['Price'], shade=True)


In [15]:
# Persist the winning estimator to disk with joblib and report which model
# was chosen.
# NOTE(review): only the estimator is written here; the fitted
# `label_encoders` are not saved by this cell — confirm whether inference
# code needs them to encode raw categorical inputs.
joblib.dump(value=best_model, filename='house_price_predictor_model.pkl')
print(f"The best model is {best_model_name} and has been saved.")

The best model is Gradient Boosting and has been saved.
