In [54]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [55]:
# Set random seed for reproducibility
np.random.seed(42)

In [56]:
# Create directories if they don’t exist
os.makedirs('Images', exist_ok=True)
os.makedirs('Models', exist_ok=True)

In [57]:
# Load dataset
try:
    df = pd.read_csv('C:/Users/macie/OneDrive/Pulpit/Praca/Housing.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Could not find Housing.csv at the specified path.")
    exit(1)

Dataset loaded successfully.


In [58]:
# Dataset exploration
print("\nDataset Info:")
print(df.info())

print("\nFirst Few Rows:")
print(df.head())

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None

First Few Rows:
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2

In [59]:
# --- Graph 1: Raw Price Skewness ---
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], kde=True, color='blue')
plt.title('Raw Price Distribution (Right-Skewed)')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.savefig('Images/raw_price_skewness.png')
plt.close()
print("Raw price skewness graph saved to images/raw_price_skewness.png")

Raw price skewness graph saved to images/raw_price_skewness.png


In [60]:
# Encoding categorical variables
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for col in binary_columns:
    df[col] = df[col].apply(lambda x: 1 if x == 'yes' else 0)

df['furnishingstatus'] = df['furnishingstatus'].apply(
    lambda x: 1 if x == 'semi-furnished' else (2 if x == 'furnished' else 0)
)

In [61]:
# Log transform target variable
df['log_price'] = np.log(df['price'])
df.drop(columns=['price'], inplace=True)

In [62]:
# One-hot encode "furnishingstatus" while avoiding multicollinearity
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

In [63]:
# Convert furnishingstatus columns to integers (1s & 0s)
furnishing_columns = [col for col in df.columns if 'furnishingstatus' in col]
df[furnishing_columns] = df[furnishing_columns].astype(int)

In [64]:
# Check for NaN/Inf
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [65]:
# Dataset exploration after preproccessing
print("\nDataset Info:")
print(df.info())

print("\nFirst Few Rows:")
print(df.head())

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   area                545 non-null    int64  
 1   bedrooms            545 non-null    int64  
 2   bathrooms           545 non-null    int64  
 3   stories             545 non-null    int64  
 4   mainroad            545 non-null    int64  
 5   guestroom           545 non-null    int64  
 6   basement            545 non-null    int64  
 7   hotwaterheating     545 non-null    int64  
 8   airconditioning     545 non-null    int64  
 9   parking             545 non-null    int64  
 10  prefarea            545 non-null    int64  
 11  log_price           545 non-null    float64
 12  furnishingstatus_1  545 non-null    int32  
 13  furnishingstatus_2  545 non-null    int32  
dtypes: float64(1), int32(2), int64(11)
memory usage: 55.5 KB
None

First Few Rows:
   area  bed

In [66]:
# Splitting features and target (before feature engineering)
X_before = df.drop('log_price', axis=1)
y = df['log_price']

In [67]:
# Train-Test split (before feature engineering)
X_train_before, X_test_before, y_train, y_test = train_test_split(X_before, y, test_size=0.2, random_state=42)

In [68]:
# Standardize features
scaler = StandardScaler()
X_train_before_scaled = scaler.fit_transform(X_train_before)
X_test_before_scaled = scaler.transform(X_test_before)
X_train_before = pd.DataFrame(X_train_before_scaled, columns=X_before.columns, index=X_train_before.index)
X_test_before = pd.DataFrame(X_test_before_scaled, columns=X_before.columns, index=X_test_before.index)

In [69]:
# Train initial models before feature engineering
rf_before_initial = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_before_initial.fit(X_train_before, y_train)
xgb_before_initial = XGBRegressor(random_state=42)
xgb_before_initial.fit(X_train_before, y_train)

In [70]:
# Define hyperparameter space
param_dist_rf = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'max_depth': [3, 5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['sqrt', 'log2']
}

param_dist_xgb = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1],
    'max_depth': [2, 3, 5, 7, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5],
    'lambda': [1, 1.5, 2, 3]
}

In [71]:
# RandomizedSearchCV (before feature engineering)
random_search_rf_before = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42), param_dist_rf, n_iter=30, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
random_search_rf_before.fit(X_train_before, y_train)
best_rf_before = random_search_rf_before.best_estimator_

random_search_xgb_before = RandomizedSearchCV(XGBRegressor(random_state=42), param_dist_xgb, n_iter=30, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
random_search_xgb_before.fit(X_train_before, y_train)
best_xgb_before = random_search_xgb_before.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [72]:
# Apply feature engineering
df["area_per_bedroom"] = df["area"] / df["bedrooms"]
df["bathrooms_per_bedroom"] = df["bathrooms"] / df["bedrooms"]
df["total_rooms"] = df["bedrooms"] + df["bathrooms"]

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [73]:
# --- Graph 2b: Correlation Heatmap (All Features) ---
numerical_cols_all = [col for col in df.columns if col != 'log_price']
plt.figure(figsize=(12, 10))
sns.heatmap(df[numerical_cols_all].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of All Features After Engineering')
plt.savefig('images/correlation_heatmap_all.png')
plt.close()
print("Correlation heatmap (all features) saved to Images/correlation_heatmap_all.png")

Correlation heatmap (all features) saved to images/correlation_heatmap_all.png


In [74]:
# Splitting features and target (After Feature Engineering)
X_after = df.drop('log_price', axis=1)
y = df['log_price']

X_train_after, X_test_after, y_train, y_test = train_test_split(X_after, y, test_size=0.2, random_state=42)

In [75]:
# Standardize features
X_train_after_scaled = scaler.fit_transform(X_train_after)
X_test_after_scaled = scaler.transform(X_test_after)
X_train_after = pd.DataFrame(X_train_after_scaled, columns=X_after.columns, index=X_train_after.index)
X_test_after = pd.DataFrame(X_test_after_scaled, columns=X_after.columns, index=X_test_after.index)

In [76]:
# Train initial models after feature engineering
rf_after_initial = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_after_initial.fit(X_train_after, y_train)
xgb_after_initial = XGBRegressor(random_state=42)
xgb_after_initial.fit(X_train_after, y_train)

In [77]:
# RandomizedSearchCV (After Feature Engineering)
random_search_rf_after = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42), param_dist_rf, n_iter=30, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
random_search_rf_after.fit(X_train_after, y_train)
best_rf_after = random_search_rf_after.best_estimator_

random_search_xgb_after = RandomizedSearchCV(XGBRegressor(random_state=42), param_dist_xgb, n_iter=30, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
random_search_xgb_after.fit(X_train_after, y_train)
best_xgb_after = random_search_xgb_after.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [78]:
# Define models and test sets
models = {
    "RF Before Features (Initial)": (rf_before_initial, X_test_before),
    "RF Before Features (Tuned)": (best_rf_before, X_test_before),
    "RF After Features (Initial)": (rf_after_initial, X_test_after),
    "RF After Features (Tuned)": (best_rf_after, X_test_after),
    "XGB Before Features (Initial)": (xgb_before_initial, X_test_before),
    "XGB Before Features (Tuned)": (best_xgb_before, X_test_before),
    "XGB After Features (Initial)": (xgb_after_initial, X_test_after),
    "XGB After Features (Tuned)": (best_xgb_after, X_test_after)
}

In [79]:
# Evaluate models
results = {}
for name, (model, X_test) in models.items():
    y_pred = model.predict(X_test)
    rmse_log = np.sqrt(mean_squared_error(y_test, y_pred))
    y_pred_original = np.exp(y_pred)
    y_test_original = np.exp(y_test)
    rmse_original = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
    r2 = r2_score(y_test, y_pred)
    rmse_mean_ratio = (rmse_original / y_test_original.mean()) * 100
    results[name] = {
        "R²": r2,
        "RMSE (Log)": rmse_log,
        "RMSE (Original)": rmse_original,
        "RMSE / Mean Price (%)": rmse_mean_ratio
    }

results_df = pd.DataFrame(results).T
print("\nFinal Model Comparison:")
print(results_df.round(3))


Final Model Comparison:
                                  R²  RMSE (Log)  RMSE (Original)  \
RF Before Features (Initial)   0.619       0.271      1426153.219   
RF Before Features (Tuned)     0.629       0.268      1450327.404   
RF After Features (Initial)    0.616       0.272      1419026.448   
RF After Features (Tuned)      0.628       0.268      1437218.040   
XGB Before Features (Initial)  0.572       0.288      1492328.349   
XGB Before Features (Tuned)    0.671       0.252      1339130.140   
XGB After Features (Initial)   0.583       0.284      1468310.082   
XGB After Features (Tuned)     0.646       0.262      1397023.093   

                               RMSE / Mean Price (%)  
RF Before Features (Initial)                  28.480  
RF Before Features (Tuned)                    28.963  
RF After Features (Initial)                   28.338  
RF After Features (Tuned)                     28.701  
XGB Before Features (Initial)                 29.802  
XGB Before Features (Tu

In [80]:
# --- Graph 3: Feature Importance ---
plt.figure(figsize=(10, 6))
importances = best_xgb_before.feature_importances_
feature_names = X_test_before.columns
sns.barplot(x=importances, y=feature_names)
plt.title('Feature Importance (XGBoost Before Features Tuned)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.savefig('images/feature_importance.png')
plt.close()
print("Feature importance graph saved to images/feature_importance.png")

Feature importance graph saved to Images/feature_importance.png


In [81]:
# --- Graph 4: Predicted vs Actual ---
y_pred = best_xgb_before.predict(X_test_before)
plt.figure(figsize=(10, 6))
plt.scatter(np.exp(y_test), np.exp(y_pred), alpha=0.5, color='blue')
plt.plot([1.75e6, 1.33e7], [1.75e6, 1.33e7], 'r--')
plt.title('Predicted vs Actual Prices (XGBoost Before Features Tuned)')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.savefig('images/predicted_vs_actual.png')
plt.close()
print("Predicted vs actual prices graph saved to Images/predicted_vs_actual.png")

Predicted vs actual prices graph saved to Images/predicted_vs_actual.png


In [82]:
# --- Graph 5: Residual Plot ---
plt.figure(figsize=(10, 6))
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.5, color='purple')
plt.axhline(0, color='red', linestyle='--')
plt.title('Residual Plot (XGBoost Before Features Tuned)')
plt.xlabel('Predicted Log Price')
plt.ylabel('Residuals (Actual - Predicted)')
plt.savefig('images/residual_plot.png')
plt.close()
print("Residual plot saved to Images/residual_plot.png")

Residual plot saved to Images/residual_plot.png


In [83]:
# Save the best model
joblib.dump(best_xgb_before, "Models/best_xgb_model.pkl")
print("Best model saved to Models/best_xgb_model.pkl")

Best model saved to models/best_xgb_model.pkl
