# King County House Price Regression Project

Coded by Luna McBride

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split #Split the data into train and test
from sklearn.linear_model import LinearRegression #Add in our linear regression
from sklearn.preprocessing import StandardScaler #Test out scaling
from sklearn.neural_network import MLPRegressor #Add a multilayer perceptron to test regression ability
from sklearn import svm #Add a support vector machine to test regression ability
from sklearn.tree import DecisionTreeRegressor #Add a single tree regressor to test regression ability
from sklearn.ensemble import RandomForestRegressor #Add a forest regressor to test regression ability
from sklearn.ensemble import ExtraTreesRegressor #Add even more trees to test regression ability

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kc-housesales-data/kc_house_data.csv


In [2]:
#Location of the house sales CSV inside the Kaggle input directory
house_csv_path = "../input/kc-housesales-data/kc_house_data.csv"
house = pd.read_csv(house_csv_path) #Load the houses dataset into a dataframe
house.head(5) #Preview the first five rows

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


---

# Check for nulls

In [3]:
#Show the non-null count per column, then whether each column contains any null at all
for column_summary in (house.count(), house.isnull().any()):
    print(column_summary)

id               21597
date             21597
price            21597
bedrooms         21597
bathrooms        21597
sqft_living      21597
sqft_lot         21597
floors           21597
waterfront       21597
view             21597
condition        21597
grade            21597
sqft_above       21597
sqft_basement    21597
yr_built         21597
yr_renovated     21597
zipcode          21597
lat              21597
long             21597
sqft_living15    21597
sqft_lot15       21597
dtype: int64
id               False
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool


There are no null values. It is time to fix it up some more.

---

# Fix up some variables

## Change dates into an age

In [4]:
sale_year = pd.DatetimeIndex(house["date"]).year #Year taken from each row's date column
house["age"] = sale_year - house["yr_built"] #Age of the building = year of the date column minus year built

#Sanity check: print the subtraction for the row labelled 1
print(house.at[1, "date"], " - ", house.at[1, "yr_built"], " = ", house.at[1, "age"])

12/9/2014  -  1951  =  63


## Remove year renovated

In [5]:
house = house.drop("yr_renovated", axis = 1) #Remove the year renovated column
house.head(5) #Preview the first five rows after the drop

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,age
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,98178,47.5112,-122.257,1340,5650,59
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,98125,47.721,-122.319,1690,7639,63
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,98028,47.7379,-122.233,2720,8062,82
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,98136,47.5208,-122.393,1360,5000,49
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,98074,47.6168,-122.045,1800,7503,28


## Drop ID and Date, as they are not helpful here

In [6]:
identifier_columns = ["id", "date"] #Columns that are not used as model inputs
house = house.drop(identifier_columns, axis = 1) #Remove the ID and Date columns
house.head(5) #Preview the first five rows after the drop

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,age
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,98178,47.5112,-122.257,1340,5650,59
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,98125,47.721,-122.319,1690,7639,63
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,98028,47.7379,-122.233,2720,8062,82
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,98136,47.5208,-122.393,1360,5000,49
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,98074,47.6168,-122.045,1800,7503,28


## Drop the first living and lot columns

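More up-to-date versions exist in the sqft_living15 and sqft_lot15 variables, so they are treated here as a better representation of the current lot. (Caveat: some published descriptions of this dataset define the `*15` columns as the average size of the 15 nearest neighbouring homes rather than as updated measurements of the same house, so this assumption is worth verifying against the data source.)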

In [7]:
first_size_columns = ["sqft_living", "sqft_lot"] #The first living/lot size columns; the *15 variants are kept
house = house.drop(first_size_columns, axis = 1) #Remove the first sqft living and lot columns
house.head(5) #Preview the first five rows after the drop

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,age
0,221900.0,3,1.0,1.0,0,0,3,7,1180,0,1955,98178,47.5112,-122.257,1340,5650,59
1,538000.0,3,2.25,2.0,0,0,3,7,2170,400,1951,98125,47.721,-122.319,1690,7639,63
2,180000.0,2,1.0,1.0,0,0,3,6,770,0,1933,98028,47.7379,-122.233,2720,8062,82
3,604000.0,4,3.0,1.0,0,0,5,7,1050,910,1965,98136,47.5208,-122.393,1360,5000,49
4,510000.0,3,2.0,1.0,0,0,3,8,1680,0,1987,98074,47.6168,-122.045,1800,7503,28


---

# Split the data

In [8]:
whole_prices = house["price"].astype(int).to_numpy() #House prices cast to integers, as a numpy array
price = np.log(whole_prices) #Regression target: natural log of the price
print(price) #Print the logged prices

[12.30998211 13.19561384 12.10071213 ... 12.90445858 12.89921983
 12.69158046]


I tried various scaling functions to increase the linear regression score, but after searching the internet for more options I found that, strangely, simply taking the log of the price worked a whole lot better.

In [9]:
characteristics = house.drop(columns = ["price"]) #Every column of the dataframe except the target (price)
chara = pd.get_dummies(characteristics) #One-hot encode any categorical columns before modelling

#Standardise the features with a StandardScaler fitted on all rows.
#NOTE(review): the scaler is fitted before the train/test split, so test rows influence the scaling statistics.
scale = StandardScaler()
chara = scale.fit_transform(chara) #Fit the scaler and transform the features in one step

print(chara) #Print the scaled data

[[-0.40290302 -1.45107268 -0.91555234 ... -0.94367562 -0.26062671
   0.53364623]
 [-0.40290302  0.17448579  0.93743087 ... -0.43288674 -0.18769961
   0.66980901]
 [-1.48249316 -1.45107268 -0.91555234 ...  1.07029196 -0.17219022
   1.31658219]
 ...
 [-1.48249316 -1.77618438  0.93743087 ... -1.41068259 -0.39419807
  -1.30455123]
 [-0.40290302  0.49959749  0.93743087 ... -0.84151784 -0.42059702
  -1.10030707]
 [-1.48249316 -1.77618438  0.93743087 ... -1.41068259 -0.41803046
  -1.27051054]]


In [10]:
#Split the data into train (70%) and test (30%) sets.
#random_state pins the shuffle so the split, and every score computed from it, is the same on each re-run
#(same seed value as the one already used for the neural network).
charaTrain, charaTest, priceTrain, priceTest = train_test_split(
    chara, price, test_size = 0.3, random_state = 1
)
print(priceTest) #Print one of the price splits

[12.95972679 13.79019275 13.21767356 ... 12.27839331 12.64416627
 12.69158046]


---

# Train the linear regression

In [11]:
regression = LinearRegression().fit(charaTrain, priceTrain) #Create the linear regression and fit it on the train split
regression #Display the fitted model

LinearRegression()

In [12]:
linear_test_score = regression.score(charaTest, priceTest) #Score of the linear model on the test split (R^2 for regressors)
print(linear_test_score) #Print the test score
print(regression.coef_) #Print the fitted coefficients, one per feature column

0.7719466842713141
[-0.01355147  0.05345374  0.04186316  0.03086704  0.04740268  0.0384471
  0.19200476  0.11626545  0.06840185  1.3963849  -0.03776133  0.19267019
 -0.01956959  0.06422037  0.00404033  1.50632099]


---

# Test other regressions

In [13]:
#Neural network regressor: two hidden layers (3 units, then 100 units), seeded for repeatable training
neural = MLPRegressor(
    max_iter = 500,
    random_state = 1,
    hidden_layer_sizes = (3, 100),
)
neural.fit(charaTrain, priceTrain) #Fit the network with the train set

MLPRegressor(hidden_layer_sizes=(3, 100), max_iter=500, random_state=1)

In [14]:
svr = svm.SVR().fit(charaTrain, priceTrain) #Support vector regressor with default settings, fitted on the train set
svr #Display the fitted regressor

SVR()

In [15]:
#Build a single decision tree.
#random_state is fixed (same seed as the neural network) so the fitted tree and its score are reproducible on re-run
tree = DecisionTreeRegressor(random_state = 1)
tree.fit(charaTrain, priceTrain) #Fit the tree

DecisionTreeRegressor()

In [16]:
#Build a whole forest of trees.
#random_state is fixed (same seed as the neural network) so the bootstrap samples, the score and the
#feature importances read from this model are reproducible on re-run
forest = RandomForestRegressor(random_state = 1)
forest.fit(charaTrain, priceTrain) #Fit the forest

RandomForestRegressor()

In [17]:
#Build a more random forest (extremely randomized trees).
#random_state is fixed (same seed as the neural network) so the random splits and the score are reproducible on re-run
forestBig = ExtraTreesRegressor(random_state = 1)
forestBig.fit(charaTrain, priceTrain) #Fit the more random forest

ExtraTreesRegressor()

---

# Print Results

In [18]:
#Print the test-set score of every fitted model.
#For scikit-learn regressors, score() returns the coefficient of determination (R^2), not a
#classification accuracy, so the printed labels say R^2.
fitted_models = [
    ("Linear Regression", regression),
    ("Neural Network", neural),
    ("Support Vector", svr),
    ("Single Tree", tree),
    ("Random Forest", forest),
    ("Even more Random Forest", forestBig),
]

#Score each model on the same held-out split so the numbers are directly comparable
for model_label, model in fitted_models:
    print(model_label + " R^2: ", model.score(charaTest, priceTest))

Linear Regression Accuracy:  0.7719466842713141
Neural Network Accuracy:  0.8670089600557246
Support Vector Accuracy:  0.860460434339263
Single Tree Accuracy:  0.7566770647783239
Random Forest Accuracy:  0.8755956756044758
Even more Random Forest Accuracy:  0.877362008666


According to the scores (the R² value returned by each model's `score` method, loosely called "accuracy" here), the strongest regressors for this data were the two forests, both reaching about 0.88. These were followed closely by the neural network and SVM regressions, which both scored roughly 0.86–0.87. Trailing far behind were the Linear Regression and, surprisingly, the single tree. There was a gap of about 9 points between the Linear Regression and the next model above it, the SVM (0.77 vs 0.86). This shows that while the linear regression is the typical go-to in settings like statistics, there are better options out there depending on the data.

In [19]:
attributes = list(zip(characteristics.columns, regression.coef_)) #Pair each tested attribute with its linear coefficient

#Rank by coefficient magnitude: a strongly negative coefficient moves the (log) price as much as a
#strongly positive one, so sorting on the signed value would wrongly push it to the bottom of the list.
#The features were standardised before fitting, which puts the coefficients on a comparable scale.
#NOTE(review): age was derived from yr_built, so those two columns are close to collinear and their
#individual coefficients should be read with caution.
sortAtt = sorted(attributes, key = lambda pair: abs(pair[1]), reverse = True)

print("According to the Linear Regression, the most important factors for pricing are: ") #Start printing the most important labels

#Print only the five highest-ranked attributes
for label, coef in sortAtt[:5]:
    print(label) #Print the label as an important factor

According to the Linear Regression, the most important factors for pricing are: 
age
yr_built
lat
grade
sqft_above


In [20]:
#Pair each tested attribute with the importance the random forest assigned to it
attributes = list(zip(characteristics.columns, forest.feature_importances_))

#Order the pairs from highest importance to lowest
sortAtt = sorted(attributes, key = lambda pair: pair[1], reverse = True)

print("According to the Random Forest (most accurate), the most important factors for pricing are: ") #Start printing the most important labels

#Only the first five entries of the ranking are printed
for label, _importance in sortAtt[:5]:
    print(label) #Print the label as an important factor

According to the Random Forest (most accurate), the most important factors for pricing are: 
grade
lat
sqft_above
sqft_living15
long


In the lists of the most important features contributing to the price, the Linear Regression and the far more accurate Random Forest gave widely different results. Both models ranked grade among the most important features. Grade is King County's own rating of a house, so it makes sense that prices in King County track it closely. Both lists also contain latitude and above-ground size (the size not counting the basement), but their positions in the two lists are very different.

The Forest ranked location (latitude and longitude) and above-ground sizing (the size of the house and the size not counting the basement) as the most important metrics. This makes sense, as different areas tend to have higher or lower prices and size does tend to be an important factor in considering a house.

The Linear Regression ranked age far higher than anything else, which is a bit odd, considering it was not even on the Forest's list. Note that age was computed as the sale year minus yr_built, so these two columns carry almost the same information; with inputs this strongly correlated, the individual linear coefficients can become inflated and should be read with caution. The list also contains latitude and above-ground size, which tells me the Linear Regression was trying to reach a similar conclusion about space and location to the one the Forest reached; it just put more emphasis on age. Pricing by age can make sense, but I feel it only matters much for very old houses. This intuition is supported by the Forest model: I had to raise the loop limit to 9 before age appeared at all, meaning the Forest ranks it as only the 9th most important factor.