In [1]:
# import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import os
import pickle

In [2]:
# Load the dataset
file_path = "D:\Github\learning\machine-learning\Project\project-testing\project1\dataset\HOUSE_PRICE_PREDICTION.csv"
df = pd.read_csv(file_path)

In [3]:
# Selecting the target variable
target = "Price"

In [4]:
# Dropping non-numeric and identifier columns
drop_columns = ["ID", "City/District", "State", "Locality"]
df_clean = df.drop(columns=[col for col in drop_columns if col in df.columns], errors='ignore')

In [5]:

# Encoding categorical variables
for col in df_clean.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])

In [6]:
# Splitting data into features and target variable
X = df_clean.drop(columns=[target])
y = df_clean[target]

In [8]:
X

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Area of the house(excluding basement),Area of the basement,Built Year,Renovation Year,Postal Code,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,City
0,1,42491,5,2.50,3650,9050,2.0,0,4,5,...,3370,280,1921,0,122003,2880,5400,2,58,8
1,2,42491,4,2.50,2920,4000,1.5,0,0,5,...,1910,1010,1909,0,122004,2470,4000,2,51,3
2,3,42491,5,2.75,2910,9480,1.5,0,0,3,...,2910,0,1939,0,122004,2940,6600,1,53,1
3,4,42491,4,2.50,3310,42998,2.0,0,0,3,...,3310,0,2001,0,122005,3350,42847,3,76,4
4,5,42491,3,2.00,2710,4500,1.5,0,0,4,...,1880,830,1929,0,122006,2060,4500,1,51,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,14616,42734,2,1.50,1556,20000,1.0,0,0,4,...,1556,0,1957,0,122066,2250,17286,3,76,6
14616,14617,42734,3,2.00,1680,7000,1.5,0,0,4,...,1680,0,1968,0,122072,1540,7480,3,59,9
14617,14618,42734,2,1.00,1070,6120,1.0,0,0,3,...,1070,0,1962,0,122056,1130,6120,2,64,0
14618,14619,42734,4,1.00,1030,6621,1.0,0,0,4,...,1030,0,1955,0,122042,1420,6631,3,54,5


In [12]:
y

0        2380000
1        1400000
2        1200000
3         838000
4         805000
          ...   
14615     221700
14616     219200
14617     209000
14618     205000
14619     146000
Name: Price, Length: 14620, dtype: int64

In [9]:
# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [11]:
X_train

array([[-0.20151973, -0.26042715, -0.40412136, ...,  1.2086254 ,
        -1.56126035,  0.87038828],
       [ 0.84718846,  0.80868362,  0.66119676, ..., -0.01498123,
         1.23650522,  0.52223297],
       [-1.40755784, -1.3740842 ,  0.66119676, ..., -1.23858786,
         1.68414771,  0.87038828],
       ...,
       [-0.45481143, -0.48315856, -0.40412136, ..., -0.01498123,
        -1.44934973,  1.21854359],
       [-1.52816166, -1.47802553, -1.46943948, ..., -1.23858786,
        -0.44215412,  1.21854359],
       [-0.00935923, -0.06739326, -0.40412136, ..., -1.23858786,
         1.57223709,  1.21854359]])

In [9]:
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [10]:
# Model evaluation
y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Random Forest Model Performance:\nR² Score: {r2:.4f}\nRMSE: {rmse:.2f}")


Random Forest Model Performance:
R² Score: 0.8102
RMSE: 163522.00


In [11]:
# Save the model and scaler
pickle.dump(rf_model, open(r'D:\Github\learning\machine-learning\Project\project-testing\project1\models\random_forest_model.pkl', 'wb'))
pickle.dump(scaler, open(r'D:\Github\learning\machine-learning\Project\project-testing\project1\models\scaler.pkl', 'wb'))