**Import Dependencies**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn import metrics
#from sklearn.datasets import fetch_california_housing

In [13]:
price = pd.read_csv('Housing.csv')
price.head(50)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


# **DATA PRE-PROCESSING**

In [11]:
# Check datatypes and non-null values
print(price.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None


In [12]:
# Get statistics of numerical columns
print(price.describe())

              price          area  ...     stories     parking
count  5.450000e+02    545.000000  ...  545.000000  545.000000
mean   4.766729e+06   5150.541284  ...    1.805505    0.693578
std    1.870440e+06   2170.141023  ...    0.867492    0.861586
min    1.750000e+06   1650.000000  ...    1.000000    0.000000
25%    3.430000e+06   3600.000000  ...    1.000000    0.000000
50%    4.340000e+06   4600.000000  ...    2.000000    0.000000
75%    5.740000e+06   6360.000000  ...    2.000000    1.000000
max    1.330000e+07  16200.000000  ...    4.000000    3.000000

[8 rows x 6 columns]


In [14]:
# label encoding for binary cols
binary_cols = ['mainroad' , 'guestroom' , 'basement' , 'hotwaterheating' , 'airconditioning' , 'prefarea']
for col in binary_cols:
  price[col] = price[col].map({'yes' : 1, 'no' : 0})

In [15]:
price.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [16]:
# hot label encoding for cols having option other than ys or no
price = pd.get_dummies(price , columns=['furnishingstatus'], drop_first=True)


In [21]:
print(price.head())
print(price.dtypes)


      price  ...  furnishingstatus_unfurnished
0  13300000  ...                             0
1  12250000  ...                             0
2  12250000  ...                             0
3  12215000  ...                             0
4  11410000  ...                             0

[5 rows x 14 columns]
price                              int64
area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
mainroad                           int64
guestroom                          int64
basement                           int64
hotwaterheating                    int64
airconditioning                    int64
parking                            int64
prefarea                           int64
furnishingstatus_semi-furnished    int64
furnishingstatus_unfurnished       int64
dtype: object


In [22]:
# Convert boolean columns to integers (0 and 1)
price['furnishingstatus_semi-furnished'] = price['furnishingstatus_semi-furnished'].astype(int)
price['furnishingstatus_unfurnished'] = price['furnishingstatus_unfurnished'].astype(int)


In [23]:
print(price.head())
print(price.dtypes)


      price  ...  furnishingstatus_unfurnished
0  13300000  ...                             0
1  12250000  ...                             0
2  12250000  ...                             0
3  12215000  ...                             0
4  11410000  ...                             0

[5 rows x 14 columns]
price                              int64
area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
mainroad                           int64
guestroom                          int64
basement                           int64
hotwaterheating                    int64
airconditioning                    int64
parking                            int64
prefarea                           int64
furnishingstatus_semi-furnished    int64
furnishingstatus_unfurnished       int64
dtype: object


# **Split your dataset into features and target**

In [25]:
x = price.drop('price', axis = 1) # features exceptpric of the house
y = price['price'] # target value

# **Train-Test Split**

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [27]:
model = LinearRegression()
model.fit(X_train, Y_train)

In [29]:
# Make Predictions
y_pred = model.predict(X_test)


In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("MAE:", mean_absolute_error(Y_test, y_pred))
print("MSE:", mean_squared_error(Y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(Y_test, y_pred)))
print("R² Score:", r2_score(Y_test, y_pred))


MAE: 873450.0161445083
MSE: 1313683610862.5576
RMSE: 1146160.377461443
R² Score: 0.6298518608872238
