# Housing Price Predictive Model

Importing necessary libraries and Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import category_encoders as ce

Loading the dataset: the Housing Price dataset.

In [None]:
# Read the house sales CSV (path is relative to the notebook's working directory).
h_p = pd.read_csv("Datasets/House_sales_Prices.csv")

# Work on a copy; the frame as loaded stays in `h_p`.
h_p1 = h_p.copy()

Inspecting the Dataset

In [None]:
def inspec_dataset(df):
    """
    Print a quick overview of a DataFrame.

    Writes, in order: the first 10 rows, the ``DataFrame.info()`` report,
    the ``describe()`` summary statistics and the list of column names.
    Null and duplicate counts are not computed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("\n Top Rows of House Sales Dataset")
    print(df.head(10))

    print("\n General Information about House Sales Dataset")
    # info() writes its report itself and returns None; wrapping it in print()
    # would append a stray "None" line to the output.
    df.info()

    print("\n House Sales Dataset Statistical Sumamries")
    print(df.describe())

    print("\n House Sales Dataset Columns:")
    print(df.columns.tolist())

In [None]:
# Print the overview (head, info, describe, column names) for the working copy.
inspec_dataset(h_p1)

# Preprocessing of House Price Dataset

Cleaning null Values

In [None]:
# Percentage of missing values in each column.
# The null-value threshold is pegged at 60%: any column with more than 60%
# missing values is dropped in the cell that follows.
missing_percent = h_p1.isna().sum().div(len(h_p1)).mul(100)
print(missing_percent.sort_values())  # ascending order is the default

In [None]:
threshold = 60
# Labels of every column whose missing share (in percent) exceeds the threshold.
columns_to_drop = missing_percent.loc[missing_percent.gt(threshold)].index
h_p1c = h_p1.drop(columns_to_drop, axis = "columns")

print(f"Dropped {len(columns_to_drop)} columns due to excessive missing values.")

In [None]:
# Separate working copy for the imputation steps.
h_p1c1 = h_p1c.copy()

# Shape after dropping the sparse columns. Kept as the LAST expression of the
# cell: Jupyter only displays the value of a cell's final expression, so placing
# it before the assignment showed nothing.
h_p1c.shape

In [None]:
# Imputing remaining null values in numeric columns with the column mean
# (non-numeric columns are handled separately with the mode).
# NOTE(review): the means are computed over all rows, i.e. before the train/test
# split made later in the notebook, so test rows influence the imputed values.

h_p1c1 = h_p1c1.fillna(h_p1c1.mean(numeric_only = True)) # Imputing null numeric values with the mean

In [None]:
# Display the (rows, columns) shape after the numeric imputation.
h_p1c1.shape

In [None]:
# Fill whatever is still missing with each column's mode. mode() can return
# several rows when values tie, so .iloc[0] keeps the first mode per column.
# NOTE(review): the mode is computed over all rows, i.e. before the train/test
# split made later in the notebook.
h_p1c2 = h_p1c1.fillna(h_p1c1.mode().iloc[0]) # Filling missing categorical values with the mode

In [None]:
# Display the (rows, columns) shape after the mode imputation.
h_p1c2.shape

In [None]:
# Per-column count of remaining nulls as a plain list; every entry should be 0.
h_p1c2.isna().sum().tolist() # confirming the dataset has no missing values

In [None]:
# Identifying duplicate rows
# duplicated() flags rows that repeat an earlier row; the sum is the number of such rows.
h_p1c2.duplicated().sum().tolist() # Apparently shows the dataset has no duplicate values


In [None]:
# Preview the first 10 rows after imputation.
h_p1c2.head(10)

Feature Encoding

In [None]:
# First 10 rows of the frame that is about to be encoded.
h_p1c2.head(10)

In [None]:
# Show the target column.
print(h_p1c2['SalePrice'])

In [None]:
# Object-dtype columns are treated as the categorical features to encode.
cols_category = list(h_p1c2.select_dtypes(include = "object").columns)
print(f"Total categorical columns: {len(cols_category)}")

In [None]:
# Target encoder restricted to the categorical columns; it is fitted in a later
# cell, on the training split only.
encoder = ce.TargetEncoder(cols = cols_category) # Initializing the encoder

In [None]:
# Target vector first, then the feature matrix: every column except the
# row identifier and the target itself.
y = h_p1c2['SalePrice']
X = h_p1c2.drop(['Id', 'SalePrice'], axis = "columns")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Learn the per-category target statistics from the training split and encode it.
X_train_encoded = encoder.fit_transform(X_train, y_train) # Fitting only on training data to avoid data leakage

In [None]:
# Apply the encoding learned on the training split; no refitting on test data.
X_test_encoded = encoder.transform(X_test) # transforming test data

In [None]:
# First rows of the encoded training features.
print(X_train_encoded.head()) # confirming the encoding

# Feature Scaling

In [None]:
# Initialize the StandardScaler; it is fitted on the training features in a later cell.
scaler = StandardScaler()

In [None]:
# Get all column names after encoding
# NOTE(review): `all_columns` is not referenced by any later cell visible here.

all_columns = X_train_encoded.columns.tolist()

In [None]:
# Display the encoded training features.
X_train_encoded

In [None]:
# Fit the scaler on the training features only, then scale them.
X_train_scaled = scaler.fit_transform(X_train_encoded) # scaling the training dataset

In [None]:
# Scale the test features with the scaler already fitted on the training features.
X_test_scaled = scaler.transform(X_test_encoded)

In [None]:
# Random forest configuration; the bare name on the last line displays the estimator.
rf_model = RandomForestRegressor(
    n_estimators = 1000,     # trees in the forest
    max_depth = 100,         # cap on the depth of each tree
    min_samples_split = 10,  # a node needs at least this many samples to be split
    n_jobs = -1,             # run on every available CPU core
    random_state = 42,       # fixed seed
)
rf_model

In [None]:
# Train the forest on the scaled training features.
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Predict sale prices for the scaled test features.
y_pred = rf_model.predict(X_test_scaled)

In [None]:
# Evaluation metrics comparing the test targets with the predictions.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = pow(mse, 0.5)  # RMSE is the square root of the MSE

In [151]:
# One metric per line, each rounded to four decimals.
print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R Squared(R2): {r2:.4f}",
    sep = "\n",
)

Mean Absolute Error (MAE): 17174.1685
Mean Squared Error (MSE): 850757648.4966
Root Mean Squared Error (RMSE): 29167.7501
R Squared(R2): 0.8891


# Business Impact Perspective
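| Metric | Business question | Model's answer |
|---|---|---|
| MAE | How wrong are we typically? | "We're usually off by about \$17k." |
| RMSE | How much do large errors hurt? | "About \$29k. RMSE weights big misses more heavily than MAE, so the gap between the two points to some large individual errors. It is not a bound on the worst error." |
| R² | How much of the price variation do we capture? | "The model explains about 89% of the variance in sale price on the test set." |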