# House Price Prediction

In [1]:
import math as m
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt

import seaborn as sns

In [2]:
data = pd.read_csv('./data.csv')
data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [3]:
columns = data.columns
columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [4]:
columns = columns.drop(['date','street','city','statezip','country'])
columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated'],
      dtype='object')

In [5]:
data = data[columns]
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992


In [6]:
data.size

59800

In [7]:
# sns.scatterplot('yr_built',data=data, hue='yr_renovated')
# plt.show()

In [8]:
data[(data['yr_renovated']!=0)].size

24245

In [9]:
data[(data['yr_built']>1900) & (data['yr_built']<1920)].size

3978

In [10]:
# old but renovated
renovated_houses_data = data[((data['yr_built']>1900) & (data['yr_built']<1920)) & (data['yr_renovated']!=0)]
renovated_houses_data.size

2145

In [11]:
# new and old mix
non_renovated_houses_data = data[((data['yr_built']>1900) & (data['yr_built']<1920)) & (data['yr_renovated']==0)]
non_renovated_houses_data.size

1833

In [12]:
renovated_houses_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
11,1400000.0,4.0,2.5,2920,4000,1.5,0,0,5,1910,1010,1909,1988
96,670000.0,3.0,2.5,1680,2000,3.0,0,0,3,1680,0,1909,1998
152,450000.0,2.0,2.0,1100,3000,1.5,0,0,3,1100,0,1912,2005
156,561000.0,3.0,2.0,2000,7000,2.0,0,0,3,2000,0,1916,1986
216,485000.0,3.0,2.0,1420,4080,1.5,0,0,3,1420,0,1905,2013


In [13]:
data[data['sqft_basement']!=0].size

24115

In [14]:
set(data['waterfront'])

{0, 1}

In [15]:
data[data['waterfront']!=0].size

429

In [16]:
set(data['view'])

{0, 1, 2, 3, 4}

In [17]:
data[data['view'] > 3].size

910

In [18]:
set(data['condition'])

{1, 2, 3, 4, 5}

In [19]:
data[data['condition'] == 5].size

5655

In [20]:
# Renovated house = basement compulsary, condition => best (=5), waterfront compulsory, view => more than average (>3) 
# renovated_houses_data = renovated_houses_data[(renovated_houses_data['sqft_basement']!=0) & (renovated_houses_data['view']>3) & (renovated_houses_data['waterfront']!=0) & (renovated_houses_data['condition']==5)]

# renoved house with best condition
renovated_houses_data = renovated_houses_data[renovated_houses_data['condition']>=5]
renovated_houses_data.size

273

In [21]:
# Non renovated houses with best condition
non_renovated_houses_data = non_renovated_houses_data[non_renovated_houses_data['condition']==5]
non_renovated_houses_data.size

767

In [22]:
renovated_houses_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
11,1400000.0,4.0,2.5,2920,4000,1.5,0,0,5,1910,1010,1909,1988
536,630000.0,4.0,2.0,1770,6000,2.0,0,0,5,1770,0,1911,1981
563,415000.0,3.0,1.75,1960,5000,1.0,0,0,5,980,980,1911,1984
1086,360000.0,2.0,0.75,850,7710,1.0,0,2,5,550,300,1909,1988
1287,462000.0,3.0,1.75,1300,2580,1.0,0,0,5,820,480,1919,1934


In [23]:
def convert_to_int(i):
    return int(i)

In [24]:
# renovated_houses_data[renovated_houses_data['bathrooms']%]
set(renovated_houses_data['bedrooms'])

{2.0, 3.0, 4.0, 5.0, 6.0}

In [25]:
# modified_renovated_houses_data = renovated_houses_data[['sqft_living','sqft_lot','sqft_above']]
# modified_renovated_houses_data.size

In [26]:
# modified_renovated_houses_data[] = renovated_houses_data['bedrooms'].apply(convert_to_int)

In [27]:
renovated_houses_data = renovated_houses_data.drop(['waterfront','view','condition','yr_built','yr_renovated'],axis=1)
renovated_houses_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement
11,1400000.0,4.0,2.5,2920,4000,1.5,1910,1010
536,630000.0,4.0,2.0,1770,6000,2.0,1770,0
563,415000.0,3.0,1.75,1960,5000,1.0,980,980
1086,360000.0,2.0,0.75,850,7710,1.0,550,300
1287,462000.0,3.0,1.75,1300,2580,1.0,820,480


In [28]:
non_renovated_houses_data = non_renovated_houses_data.drop(['waterfront','view','condition','yr_built','yr_renovated'],axis=1)
non_renovated_houses_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement
86,549000.0,2.0,1.0,1140,5400,1.0,1140,0
252,3200000.0,7.0,4.5,6210,8856,2.5,4760,1450
298,160000.0,2.0,1.0,1040,13100,1.0,1040,0
338,1039000.0,4.0,1.0,3410,5000,2.0,2190,1220
368,660000.0,3.0,1.75,1320,5750,1.5,1320,0


# Renovated Houses

In [29]:
x1 = renovated_houses_data.drop(['price'],axis=1)
y1 = renovated_houses_data[['price']]

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1,y1,test_size=0.3)

Linear regression

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
model1 = LinearRegression()

In [34]:
model1.fit(x1_train,y1_train)

LinearRegression()

In [35]:
y1_predict = model1.predict(x1_test)

In [36]:
y1_predict[0:5]

array([[ 304568.68368883],
       [ 564827.47563744],
       [-325073.37506146],
       [ 902059.8517574 ],
       [ 633180.46465901]])

In [37]:
y1_test.head(5)

Unnamed: 0,price
1741,850000.0
2227,516200.0
2878,680000.0
4288,725000.0
2323,735000.0


In [38]:
model1.score(x1_train,y1_train)

0.9793098292835107

# Non Renovated Houses

In [39]:
x2 = non_renovated_houses_data.drop(['price'],axis=1)
y2 = non_renovated_houses_data[['price']]

In [40]:
x2_train, x2_test, y2_train, y2_test = train_test_split(x2,y2,test_size=0.3)

In [41]:
model2 = LinearRegression()

In [42]:
model2.fit(x2_train,y2_train)

LinearRegression()

In [43]:
y2_predict = model2.predict(x2_test)
y2_predict[0:5]

array([[479759.67388648],
       [570972.57649582],
       [574079.21098963],
       [850799.0656258 ],
       [622978.4848792 ]])

In [44]:
y2_test.head(5)

Unnamed: 0,price
1151,600000.0
4012,558000.0
913,368000.0
4557,471500.0
2854,311000.0


In [45]:
model2.score(x2_train,y2_train)

0.7946407780869899

In [46]:
print(f"Accuracy of model trained by dataset 1 is {model1.score(x1_train,y1_train)}")
print(f"Accuracy of model trained by dataset 2 is {model2.score(x2_train,y2_train)}")

Accuracy of model trained by dataset 1 is 0.9793098292835107
Accuracy of model trained by dataset 2 is 0.7946407780869899
