In [1]:
# this contain the implementation of house price prediction dataset with streamlit
# this file contian the code for hyper parameter tuning using grid search
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV
import pickle
import statistics as stats

In [2]:
df = pd.read_csv("Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
def stat(array, name="array"):
    mean_val  = np.mean(array)
    median_val = np.median(array)
    std = np.std(array, ddof=1)

    print(f"""For {name}:
    Mean   : {mean_val},
    Median : {median_val},
    Std dev: {std}""")


cols = ["price", "area", "bedrooms", "bathrooms", "stories", "parking"]

for col in cols:
    stat(df[col], name=col)




For price:
    Mean   : 4766729.247706422,
    Median : 4340000.0,
    Std dev: 1870439.6156573922
For area:
    Mean   : 5150.54128440367,
    Median : 4600.0,
    Std dev: 2170.141022508803
For bedrooms:
    Mean   : 2.9651376146788992,
    Median : 3.0,
    Std dev: 0.7380638605685743
For bathrooms:
    Mean   : 1.2862385321100918,
    Median : 1.0,
    Std dev: 0.5024696160532146
For stories:
    Mean   : 1.8055045871559634,
    Median : 2.0,
    Std dev: 0.8674924629255264
For parking:
    Mean   : 0.6935779816513762,
    Median : 0.0,
    Std dev: 0.8615857504605412


In [4]:
X = df.drop(["price"], axis=1) # drop using list
X = pd.get_dummies(X, drop_first=True)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2) 

print(X_train.columns)

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'mainroad_yes',
       'guestroom_yes', 'basement_yes', 'hotwaterheating_yes',
       'airconditioning_yes', 'prefarea_yes',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')


In [5]:
# random forest model 2 that is hyperparameter tuned
# n_estimators, max_depth, max_features, min_samples_split

param_grid = {
    'n_estimators': [100, 300, 500, 800],          # more trees → more stable
    'max_depth': [None, 10, 20, 30, 50],           # deeper trees may capture more patterns
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None]
       # controls feature randomness
}




random_search = RandomizedSearchCV(
    estimator = RandomForestRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=50,             # number of random combos to try
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42
)
print(X_train)
random_search.fit(X_train, y_train)
print(random_search.best_params_)




     area  bedrooms  bathrooms  stories  parking  mainroad_yes  guestroom_yes  \
428  4040         2          1        1        1          True          False   
268  4950         4          1        2        0          True          False   
99   6000         4          1        3        0          True           True   
44   6000         4          2        4        1          True          False   
303  4500         3          1        1        0          True          False   
..    ...       ...        ...      ...      ...           ...            ...   
243  2550         3          1        2        0          True          False   
114  6800         2          1        1        2          True           True   
256  4000         3          1        2        2          True          False   
206  5800         2          1        1        0          True           True   
459  3500         2          1        1        0          True          False   

     basement_yes  hotwater

In [6]:
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

In [7]:
with open("house_price_model.pkl", "wb") as file:
    pickle.dump(best_model, file)