In [1]:
#imported libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings as wr
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that a blanket filter is intended.
wr.filterwarnings('ignore')

In [2]:
# Load the housing dataset; the relative path means Housing.csv must sit in the current working directory.
data=pd.read_csv('Housing.csv')

In [3]:
# Preview the first rows to check the column layout and value formats.
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [8]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(545, 13)

In [9]:
# Column names as a plain Python list.
data.columns.tolist()

['price',
 'area',
 'bedrooms',
 'bathrooms',
 'stories',
 'mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'parking',
 'prefarea',
 'furnishingstatus']

In [10]:
# Count the missing (NaN) values in each column.
data.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [9]:
# Number of distinct values in each column (a count per column, not the values themselves).
data.nunique()

price               219
area                284
bedrooms              6
bathrooms             4
stories               4
mainroad              2
guestroom             2
basement              2
hotwaterheating       2
airconditioning       2
parking               4
prefarea              2
furnishingstatus      3
dtype: int64

In [11]:
# Wrap the loaded data as `original_df`, then take an independent deep copy
# (`copyData`) that the later cells are free to modify.
original_df = pd.DataFrame(data)
copyData = original_df.copy(deep=True)

In [12]:
# Preview the working copy; it should show the same rows as the loaded data.
copyData.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [13]:
# Build an automated EDA report for the working copy and write it to eda.html.
# NOTE(review): the pandas_profiling package has been renamed upstream (ydata-profiling) — confirm which one this environment provides.
from pandas_profiling import ProfileReport
profile=ProfileReport(copyData, explorative=True, dark_mode=True)
profile.to_file('eda.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

 ## Feature Engineering
 1. Create new features that might be useful for predicting house prices.
 2. Perform feature scaling if necessary.
 3. Encode categorical variables appropriately.
 ## Model Building
 1. Split the dataset into training and testing sets.
 2. Train at least three different regression models.
 3. Evaluate the models using appropriate metrics (e.g., Mean Absolute Error,
 Mean Squared Error, R-squared).
 4. Select the best model based on the evaluation metrics.

In [14]:
# Step 1 of feature engineering: separate the columns by dtype.

# Object (text) columns are collected as a pandas Index, numeric columns as a list.
cat_col = copyData.select_dtypes(include=['object']).columns
num_col = list(copyData.select_dtypes(include=np.number).columns)
print("Cat columns", cat_col, "Num Columns", num_col, sep="\n")

Cat columns
Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')
Num Columns
['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']


In [15]:
# Encode the text (yes/no and furnishing-status) columns as integer codes.
# One fitted LabelEncoder per column is kept in `label_encoder`, keyed by column
# name, so the codes can be mapped back to their original labels later.

from sklearn.preprocessing import LabelEncoder
col_TO_be_encoded=['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
label_encoder={}
for features in col_TO_be_encoded:
    le=LabelEncoder()
    # Fit on the untouched original_df rather than on copyData: this cell overwrites
    # copyData's columns, so re-running it would otherwise fit the encoders on
    # already-encoded integers and lose the original text labels.
    copyData[features]=le.fit_transform(original_df[features])
    label_encoder[features]=le

# every column listed above now holds integer codes instead of text
copyData.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [16]:
# New feature 'bgbs': row-wise sum of bedrooms, bathrooms and stories plus the
# integer-encoded basement and guestroom columns.
copyData['bgbs'] = copyData[['bedrooms', 'basement', 'stories', 'guestroom', 'bathrooms']].sum(axis=1)
# A bare last expression renders the frame as a table; print() wrapped it across several text blocks.
copyData.head()

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus  bgbs  
0                 0     9  
1                 0    12  
2                 1     8  
3                 0     9  
4                 

In [17]:
# Drop the five columns that were summed into 'bgbs':
# bedrooms, bathrooms, stories, guestroom and basement.
copyData = copyData.drop(
    ['bedrooms', 'bathrooms', 'stories', 'guestroom', 'basement'],
    axis=1,
)
copyData.head()

Unnamed: 0,price,area,mainroad,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,bgbs
0,13300000,7420,1,0,1,2,1,0,9
1,12250000,8960,1,0,1,3,0,0,12
2,12250000,9960,1,0,0,2,1,1,8
3,12215000,7500,1,0,1,3,1,0,9
4,11410000,7420,1,0,1,2,0,0,9


In [19]:
# Regenerate the profiling report on the current copyData and save it separately as eda_updated.html.
# Relies on ProfileReport having been imported by an earlier cell.
profile=ProfileReport(copyData, explorative=True, dark_mode=True)
profile.to_file('eda_updated.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]