In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the housing table; the relative path is resolved against the kernel's
# current working directory.
csv_path = "housing.csv"
dataset = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Count the missing entries in each column.
dataset.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


# Imputing missing values and separating the data into features and target

In [26]:
# Numeric columns (int64 / float64): fill gaps with the column median.
# NOTE: the medians and modes below are computed over the whole frame,
# i.e. before any train/validation/test split is made.
num_cols = dataset.select_dtypes(include=["int64", "float64"]).columns
numeric_fill = dataset[num_cols].median()
dataset[num_cols] = dataset[num_cols].fillna(value=numeric_fill)

# Object (string) columns: fill gaps with the most frequent value.
# mode() can return several rows when there are ties; the first row is used.
cat_cols = dataset.select_dtypes(include=["object"]).columns
categorical_fill = dataset[cat_cols].mode().iloc[0]
dataset[cat_cols] = dataset[cat_cols].fillna(value=categorical_fill)


In [27]:
# "price" is the regression target; every other column is a feature.
target_column = "price"
y = dataset[target_column]
X = dataset.drop(columns=[target_column])
print(y)

0      13300000
1      12250000
2      12250000
3      12215000
4      11410000
         ...   
540     1820000
541     1767150
542     1750000
543     1750000
544     1750000
Name: price, Length: 545, dtype: int64


# Split the data into training, validation and test sets

In [28]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: first hold out 20 % of the rows, then
# divide that hold-out in half into a validation part and a test part.
HOLDOUT_FRACTION = 0.2
TEST_SHARE_OF_HOLDOUT = 0.5
SPLIT_SEED = 42

x_train, x_, y_train, y_ = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
x_cv, x_test, y_cv, y_test = train_test_split(
    x_, y_, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SPLIT_SEED
)

In [29]:
# Column groups used by the preprocessing cells below.

# Numeric columns that get standardised.
numerical_features = [
    "area",
    "bedrooms",
    "bathrooms",
    "stories",
    "parking",
]

# yes/no columns that get mapped to 1/0.
binary_features = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
]

# Multi-level categorical columns that get one-hot encoded.
categorical_features = [
    "furnishingstatus",
]


In [30]:
binary_map = {"yes": 1, "no": 0}


def _encode_binary(frame, columns, mapping):
    """Return a copy of ``frame`` with the given yes/no columns encoded.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to encode; it is not modified.
    columns : list of str
        Names of the columns to encode.
    mapping : dict
        Raw label -> numeric code (here ``{"yes": 1, "no": 0}``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` whose ``columns`` hold the numeric codes.

    Raises
    ------
    ValueError
        If a column contains a value (including NaN) that is not a key of
        ``mapping``. Without this check ``Series.map`` would silently turn
        such values into NaN.
    """
    # Work on a copy rather than writing into the object returned by
    # train_test_split, which pandas may flag with SettingWithCopyWarning.
    encoded = frame.copy()
    codes = list(mapping.values())
    for col in columns:
        values = encoded[col]
        if values.isin(codes).all():
            # Column already holds the numeric codes (the cell was re-run).
            # Mapping again would replace every value with NaN.
            continue
        mapped = values.map(mapping)
        unmapped = mapped.isna()
        if unmapped.any():
            raise ValueError(
                f"Column {col!r} has values outside {sorted(mapping)}: "
                f"{values[unmapped].unique().tolist()}"
            )
        encoded[col] = mapped.astype("int64")
    return encoded


x_train = _encode_binary(x_train, binary_features, binary_map)
x_cv = _encode_binary(x_cv, binary_features, binary_map)
x_test = _encode_binary(x_test, binary_features, binary_map)


# One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encode only while every raw categorical column is still present in the
# training frame; otherwise just report and leave the splits untouched.
if all(col in x_train.columns for col in categorical_features):
    ohe = OneHotEncoder(drop="first", sparse_output=False)

    # Fit on the training split, then apply the same encoder to the others.
    train_encoded = ohe.fit_transform(x_train[categorical_features])
    cv_encoded = ohe.transform(x_cv[categorical_features])
    test_encoded = ohe.transform(x_test[categorical_features])

    ohe_cols = ohe.get_feature_names_out(categorical_features)

    # Carry each split's row index over so the concat below aligns row by row.
    x_train_ohe = pd.DataFrame(train_encoded, index=x_train.index, columns=ohe_cols)
    x_cv_ohe = pd.DataFrame(cv_encoded, index=x_cv.index, columns=ohe_cols)
    x_test_ohe = pd.DataFrame(test_encoded, index=x_test.index, columns=ohe_cols)

    # Swap the raw categorical columns for their encoded counterparts.
    x_train = pd.concat(
        [x_train.drop(columns=categorical_features), x_train_ohe], axis=1
    )
    x_cv = pd.concat(
        [x_cv.drop(columns=categorical_features), x_cv_ohe], axis=1
    )
    x_test = pd.concat(
        [x_test.drop(columns=categorical_features), x_test_ohe], axis=1
    )
    print("One-hot encoding completed.")
else:
    print("One-hot encoding already completed or columns missing.")

# Show the resulting training columns.
print(x_train.columns)

One-hot encoding completed.
Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')


# Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the scaling statistics from the training split only, then apply
# them to all three splits. The numeric columns are overwritten in place.
scaler.fit(x_train[numerical_features])
for split in (x_train, x_cv, x_test):
    split[numerical_features] = scaler.transform(split[numerical_features])

In [32]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; the fitted estimator is
# the cell's last expression, so the notebook displays it.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the validation split and report MAE, RMSE and R².
y_cv_pred = model.predict(x_cv)

validation_scores = [
    ("Validation MAE :", mean_absolute_error(y_cv, y_cv_pred)),
    ("Validation RMSE:", np.sqrt(mean_squared_error(y_cv, y_cv_pred))),
    ("Validation R²  :", r2_score(y_cv, y_cv_pred)),
]
for label, value in validation_scores:
    print(label, value)


Validation MAE : 917947.5271673952
Validation RMSE: 1305190.4173620935
Validation R²  : 0.6344222341441875


In [39]:
# Predict on the test split and report the same three metrics.
y_test_pred = model.predict(x_test)

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("Test MAE :", test_mae)
print("Test RMSE:", test_rmse)
print("Test R²  :", test_r2)


Test MAE : 1021192.082913792
Test RMSE: 1343202.0597881707
Test R²  : 0.6679234454252654


In [40]:
# A single hand-written listing to price. The yes/no flags are entered
# already encoded (yes → 1, no → 0); furnishingstatus is left as a raw label.
new_house_record = {
    "area": 3000,
    "bedrooms": 3,
    "bathrooms": 2,
    "stories": 2,
    "mainroad": 1,
    "guestroom": 0,
    "basement": 1,
    "hotwaterheating": 0,
    "airconditioning": 1,
    "parking": 2,
    "prefarea": 1,
    "furnishingstatus": "furnished",
}
new_house = pd.DataFrame([new_house_record])


In [None]:
# Same yes/no → 1/0 lookup as defined for the training splits.
binary_map = dict(yes=1, no=0)


In [42]:
# One-hot encode the new listing with the encoder that was fitted on the
# training split, mirroring the encoding cell used for the splits.
#
# The guard makes the cell safe to re-run: once the raw categorical columns
# have been replaced there is nothing left to encode.
if set(categorical_features).issubset(new_house.columns):
    new_ohe = pd.DataFrame(
        ohe.transform(new_house[categorical_features]),
        columns=ohe.get_feature_names_out(categorical_features),
        # Reuse the listing's own index so concat aligns row by row even
        # when new_house does not carry a default 0..n-1 index.
        index=new_house.index,
    )

    new_house = pd.concat(
        [new_house.drop(columns=categorical_features), new_ohe],
        axis=1,
    )


In [43]:
# Apply the already-fitted scaler to the listing's numeric columns.
# The columns are overwritten in place.
new_house_scaled = scaler.transform(new_house[numerical_features])
new_house[numerical_features] = new_house_scaled


In [44]:
# predict() returns one value per row; new_house has a single row.
predicted_price = model.predict(new_house)
first_prediction = predicted_price[0]
print("Predicted House Price:", first_prediction)


Predicted House Price: 6831289.889876218
