***Imports and Data Loading***

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the already-encoded housing table from the workbook in the working directory.
df_encoded = pd.read_excel(io="housing_encoded.xlsx")

***Target and Feature Selection***

In [6]:
# Predictors are every column except the target; the target is the purchase price.
X = df_encoded.drop(columns="purchase_price")
Y = df_encoded.loc[:, "purchase_price"]

# Sanity check: both objects must have the same number of rows.
print(X.shape, Y.shape, sep="\n")

(99989, 16)
(99989,)


***Train-Test Split***

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, random_state=42, test_size=0.2)
X_train, X_test, Y_train, Y_test = split_parts

# Show the row/column counts of the two feature partitions.
print(X_train.shape, X_test.shape, sep="\n")

(79991, 16)
(19998, 16)


***Feature Scaling***

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information reaches the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

***Linear Regression Model***

In [9]:
# Linear regression is trained and evaluated on the standardized features.
lr_model = LinearRegression().fit(X_train_scaled, Y_train)
Y_pred = lr_model.predict(X_test_scaled)

# Score the held-out predictions; RMSE is derived from the MSE.
lr_mse, lr_r2 = (
    score(Y_test, Y_pred) for score in (mean_squared_error, r2_score)
)
lr_rmse = np.sqrt(lr_mse)

print("Linear Regression MSE:", lr_mse)
print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression R2:", lr_r2)

Linear Regression MSE: 619261515190.5358
Linear Regression RMSE: 786931.7093563683
Linear Regression R2: 0.8025031678597603


***Decision Tree Regressor***

In [10]:
# Depth-limited decision tree, fit on the unscaled training split.
tree = DecisionTreeRegressor(
    random_state=42,
    max_depth=10,
    min_samples_split=20,
).fit(X_train, Y_train)

y_pred = tree.predict(X_test)

# Held-out error metrics for the tree; RMSE is the square root of the MSE.
tree_mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
tree_r2 = r2_score(y_true=Y_test, y_pred=y_pred)
tree_rmse = np.sqrt(tree_mse)

print("Tree MSE:", tree_mse)
print("Tree RMSE:", tree_rmse)
print("Tree R2:", tree_r2)

Tree MSE: 24377448838.107838
Tree RMSE: 156132.7923214974
Tree R2: 0.9922254672653027


***Random Forest Regressor***

In [11]:
# Random forest of 100 depth-limited trees, fit on the unscaled training split.
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
)
rf.fit(X_train, Y_train)

y_pred = rf.predict(X_test)

# Evaluate on the held-out rows; RMSE follows directly from the MSE.
rf_r2 = r2_score(Y_test, y_pred)
rf_mse = mean_squared_error(Y_test, y_pred)
rf_rmse = np.sqrt(rf_mse)

print("rf MSE:", rf_mse)
print("rf RMSE:", rf_rmse)
print("rf R2:", rf_r2)

rf MSE: 8246635709.727509
rf RMSE: 90810.98892605184
rf R2: 0.9973699569753101


***Ensemble Model (Model Averaging / Soft Voting)***

In [12]:
# Fresh test-set predictions from each fitted model. The linear model takes the
# scaled features; the tree and the forest take the unscaled ones.
pred_lr, pred_tree, pred_rf = (
    lr_model.predict(X_test_scaled),
    tree.predict(X_test),
    rf.predict(X_test),
)

# Equal-weight average of the three prediction vectors.
ensemble_pred = (pred_lr + pred_tree + pred_rf) / 3

# Score the averaged prediction with the same metrics as the single models.
ensemble_mse, r2_ensemble = (
    score(Y_test, ensemble_pred) for score in (mean_squared_error, r2_score)
)
rmse_ensemble = np.sqrt(ensemble_mse)

print("Ensemble MSE:", ensemble_mse)
print("Ensemble RMSE:", rmse_ensemble)
print("Ensemble R2:", r2_ensemble)

Ensemble MSE: 81543035522.22261
Ensemble RMSE: 285557.4119546236
Ensemble R2: 0.9739940383768513


In [13]:
# One row per model, so the held-out metrics can be compared side by side.
df = pd.DataFrame(
    [
        ("Linear Regression", lr_mse, lr_rmse, lr_r2),
        ("Decision Tree", tree_mse, tree_rmse, tree_r2),
        ("Random Forest", rf_mse, rf_rmse, rf_r2),
        ("Ensemble", ensemble_mse, rmse_ensemble, r2_ensemble),
    ],
    columns=["Model", "MSE", "RMSE", "R2"],
)
print(df)

               Model           MSE           RMSE        R2
0  Linear Regression  6.192615e+11  786931.709356  0.802503
1      Decision Tree  2.437745e+10  156132.792321  0.992225
2      Random Forest  8.246636e+09   90810.988926  0.997370
3           Ensemble  8.154304e+10  285557.411955  0.973994


## Model Comparison Summary
Based on **RMSE** and **R²** scores, the **Random Forest model** achieved the best performance
(lowest RMSE and highest R² of the four models) and was selected as the final regression model.