In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
df.shape

(545, 13)

In [4]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [6]:
df.columns = ['price','area','bd','bt','stories','mr','gr','bs','hwh','ac','parking','pa','fs']
df.head()

Unnamed: 0,price,area,bd,bt,stories,mr,gr,bs,hwh,ac,parking,pa,fs
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
df.dtypes

price       int64
area        int64
bd          int64
bt          int64
stories     int64
mr         object
gr         object
bs         object
hwh        object
ac         object
parking     int64
pa         object
fs         object
dtype: object

In [8]:
df['mr'].value_counts()

yes    468
no      77
Name: mr, dtype: int64

In [9]:
df['gr'].value_counts()

no     448
yes     97
Name: gr, dtype: int64

In [10]:
df['bs'].value_counts()

no     354
yes    191
Name: bs, dtype: int64

In [11]:
df['hwh'].value_counts()

no     520
yes     25
Name: hwh, dtype: int64

In [12]:
df['ac'].value_counts()

no     373
yes    172
Name: ac, dtype: int64

In [13]:
df['pa'].value_counts()

no     417
yes    128
Name: pa, dtype: int64

In [14]:
df['fs'].value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: fs, dtype: int64

#### Handling the Categorical columns

In [16]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [17]:
df['mr'].value_counts()

yes    468
no      77
Name: mr, dtype: int64

In [19]:
df['mr'] = lb.fit_transform(df['mr'])
df['mr'].value_counts()

1    468
0     77
Name: mr, dtype: int64

In [20]:
df['gr'].value_counts()

no     448
yes     97
Name: gr, dtype: int64

In [21]:
df['gr'] = lb.fit_transform(df['gr'])
df['gr'].value_counts()

0    448
1     97
Name: gr, dtype: int64

In [23]:
df['bs'].value_counts()

no     354
yes    191
Name: bs, dtype: int64

In [25]:
df['bs'] = lb.fit_transform(df['bs'])
df['bs'].value_counts()

0    354
1    191
Name: bs, dtype: int64

In [24]:
df['hwh'] = lb.fit_transform(df['hwh'])
df['hwh'].value_counts()

0    520
1     25
Name: hwh, dtype: int64

In [26]:
df['ac'].value_counts()

no     373
yes    172
Name: ac, dtype: int64

In [27]:
df['ac'] = lb.fit_transform(df['ac'])
df['ac'].value_counts()

0    373
1    172
Name: ac, dtype: int64

In [28]:
print(df['pa'].value_counts())
print(df['fs'].value_counts())

no     417
yes    128
Name: pa, dtype: int64
semi-furnished    227
unfurnished       178
furnished         140
Name: fs, dtype: int64


In [29]:
df['pa'] = lb.fit_transform(df['pa'])
df['fs'] = lb.fit_transform(df['fs'])

In [30]:
df['pa'].value_counts()

0    417
1    128
Name: pa, dtype: int64

In [31]:
df['fs'].value_counts()

1    227
2    178
0    140
Name: fs, dtype: int64

In [32]:
df.head()

Unnamed: 0,price,area,bd,bt,stories,mr,gr,bs,hwh,ac,parking,pa,fs
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [33]:
df.dtypes

price      int64
area       int64
bd         int64
bt         int64
stories    int64
mr         int32
gr         int32
bs         int32
hwh        int32
ac         int32
parking    int64
pa         int32
fs         int32
dtype: object

In [35]:
x = df.drop('price',axis=1)
y = df['price']
print(type(x))
print(type(y))
print(x.shape)
print(y.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(545, 12)
(545,)


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(408, 12)
(137, 12)
(408,)
(137,)


In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
m1 = LinearRegression()
m1.fit(x_train,y_train)

In [42]:
print('Training Score',m1.score(x_train,y_train))
print('Testing Score',m1.score(x_test,y_test))

Training Score 0.6727236429961898
Testing Score 0.6878997036129887


In [43]:
ypred_m1 = m1.predict(x_test)

In [44]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [46]:
print('MSE',mean_squared_error(y_test,ypred_m1))
print('RMSE',np.sqrt(mean_squared_error(y_test,ypred_m1)))
print('MAE',mean_absolute_error(y_test,ypred_m1))

MSE 1243882208499.7354
RMSE 1115294.67339342
MAE 824569.5573226173
