In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import numpy as np 
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Notebook-wide display settings: never truncate columns, cap rendered rows at 100.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

# Shared scaler instances, created once here so later cells can reuse them.
min_max  = MinMaxScaler()
standard = StandardScaler()

# Settings for the train/test split used further down.
train_size = .8
random_state = 0

# Read the housing data and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- TODO confirm).
df = pd.read_csv('../Data/kc_house_data.csv').drop(columns = 'Unnamed: 0')

# Express prices in millions so the regression losses stay on a small scale.
df['price'] = df['price'] / 1_000_000

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21593,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21594,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21595,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


## Linear Model

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
# Feature matrix and target (price, in millions) for the baseline model.
x = df[['bathrooms', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']]
y = df[['price']]

# Split BEFORE scaling: fitting the scaler on the full data would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), making the reported test error optimistic.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = train_size, random_state = random_state)

# Fit the scaler on the training split only, then reuse those statistics.
x_train = standard.fit_transform(x_train)
x_test = standard.transform(x_test)

# Keep `x` as a scaled array of every row (as this cell did before), but
# now scaled with the train-only statistics.
x = standard.transform(x)

# Ordinary least squares baseline.
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
lin_predict = linear_model.predict(x_test)

# Held-out mean squared error of the baseline.
linear_mse = mean_squared_error(y_test, lin_predict)
print(f'Linear Model: MSE Loss of {round(linear_mse,3)}')

Linear Model: MSE Loss of 0.055


## Sequential ANN

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.models import model_from_json
from keras.optimizers import Adam

Using TensorFlow backend.


In [5]:
def sequential_AAN(x, y, metrics = None):
    """Build and compile a small fully connected regression network.

    The network stacks three Dense(128, relu) layers, each followed by
    Dropout(0.2), and ends in a single-unit Dense output with no activation
    specified. It is compiled with 'mse' loss and the 'sgd' optimizer.

    Parameters
    ----------
    x : array-like with a ``shape`` attribute
        Only ``x.shape[1]`` is read, to set the input width of the first layer.
    y : any
        Not used by this function; kept so existing calls keep working.
    metrics : list, optional
        Metrics forwarded to ``model.compile``. Defaults to ``['mae']``.

    Returns
    -------
    The compiled (untrained) Sequential model.
    """
    # Use a None sentinel instead of a mutable list default so the default
    # object is never shared between calls.
    if metrics is None:
        metrics = ['mae']

    drop = .2  # dropout rate used after every hidden layer (regularization)
    model = Sequential()

    # 1st hidden layer; input_dim also declares the number of input features
    model.add(Dense(128, input_dim = x.shape[1], activation = 'relu'))
    model.add(Dropout(drop))

    # 2nd hidden layer
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(drop))

    # 3rd hidden layer
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(drop))

    # output layer: one unit for the scalar regression target
    model.add(Dense(1))
    model.compile(loss = 'mse', optimizer = 'sgd', metrics = metrics)
    return model

# Training hyperparameters for the neural network.
epochs = 20
batch_size = 16

# Build the compiled network; only the feature width of x_train is used here.
seq_model = sequential_AAN(x_train, y_train)

# Train with per-epoch progress output (verbose = 1). The held-out split is
# supplied as validation_data; the returned history object is kept for later use.
seq_history = seq_model.fit(
    x_train,
    y_train,
    epochs = epochs,
    batch_size = batch_size,
    validation_data = (x_test, y_test),
    verbose = 1,
)

# The previous message told the reader to "change verbose to 1" although
# verbose is already 1 above; the hint now matches the code.
print('Finished Running Neural Network.  If you want to hide progress next time change verbose to 0')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Finished Running Neural Network.  If you want to see progress next time change verbose to 1


In [6]:
# Persist the trained network to disk under model_weights/.
# The architecture is serialized to JSON and written as text.
ann_json = seq_model.to_json()
with open('model_weights/ANN_Model.json', 'w') as file:
    file.write(ann_json)
    
# NOTE(review): this calls `save`, not `save_weights`, so the .h5 file
# presumably holds more than just the weights despite its name -- confirm
# which of the two is intended before changing loaders that read this file.
# NOTE(review): both writes assume the model_weights/ directory already exists.
seq_model.save('model_weights/ANN_Model_Weights.h5')

In [7]:
# Score the trained network on the held-out split using mean squared error,
# the same metric printed for the linear model.
nn_predict = seq_model.predict(x_test)
nn_mse = mean_squared_error(y_true = y_test, y_pred = nn_predict)
print(f'NN Model: MSE Loss of {round(nn_mse,3)}')

NN Model: MSE Loss of 0.05


In [10]:
# Rows flagged as waterfront (waterfront == 1), ordered by ascending latitude.
df.loc[df['waterfront'].eq(1)].sort_values('lat')

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
8149,1121039059,5/22/2014,0.503,2,1.75,2860,59612,1.0,1.0,4.0,3,8,1510,1350.0,1948,,98023,47.3276,-122.389,2720,59612
9845,121039083,2/6/2015,0.629,3,1.75,1460,12367,2.0,1.0,4.0,4,8,1120,340.0,1970,0.0,98023,47.3311,-122.375,1970,18893
4907,121029034,6/24/2014,0.549,2,1.00,2034,13392,1.0,1.0,4.0,5,7,1159,875.0,1947,0.0,98070,47.3312,-122.503,1156,15961
8270,221029019,4/28/2015,0.400,3,2.50,2090,32718,2.0,1.0,4.0,3,7,1550,540.0,1919,,98070,47.3338,-122.511,1200,192268
4365,4166600115,11/21/2014,1.150,3,2.75,3230,75889,2.0,1.0,4.0,3,7,3230,590.0,1925,1993.0,98023,47.3344,-122.370,2560,72229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15860,7403200050,11/13/2014,1.600,3,2.25,3370,23065,1.0,1.0,4.0,3,10,1920,1450.0,1980,0.0,98028,47.7434,-122.263,3410,19688
2624,7738500731,8/15/2014,4.500,5,5.50,6640,40014,2.0,1.0,4.0,3,12,6350,290.0,2004,0.0,98155,47.7493,-122.280,3030,23408
15498,4030100005,12/9/2014,1.800,5,3.75,4320,39094,2.0,1.0,4.0,3,8,4320,0.0,1938,0.0,98155,47.7519,-122.276,1920,7750
15654,4030100290,10/1/2014,1.680,5,3.50,5170,7197,3.0,1.0,4.0,3,11,3520,1650.0,1998,0.0,98155,47.7561,-122.271,3020,12880


In [11]:
# Rows flagged as non-waterfront (waterfront == 0), ordered by ascending latitude.
df.loc[df['waterfront'].eq(0)].sort_values('lat')

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
3292,9413400165,6/24/2014,0.38000,3,2.25,1860,15559,2.0,0.0,0.0,4,7,1860,0.0,1963,0.0,98022,47.1559,-121.646,1110,11586
15585,619079016,6/2/2014,0.68700,4,3.25,4400,186846,2.0,0.0,0.0,4,9,4400,0.0,1993,0.0,98022,47.1593,-121.957,2280,186846
12993,619079096,4/6/2015,0.75000,3,2.50,2350,715690,1.5,0.0,0.0,4,9,2350,0.0,1979,,98022,47.1622,-121.971,1280,325393
12656,619079061,6/19/2014,0.33500,4,2.00,2030,103672,1.0,0.0,0.0,4,7,2030,0.0,1969,0.0,98022,47.1647,-121.973,1560,325393
7712,2781320100,3/9/2015,0.24500,3,1.75,1670,24650,1.0,0.0,0.0,4,7,1670,0.0,1974,0.0,98022,47.1764,-122.026,1810,19465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6808,507100005,3/10/2015,0.28500,4,2.00,2120,6865,1.0,0.0,0.0,3,7,1060,1060.0,1954,,98133,47.7775,-122.337,1460,7780
6049,507100020,3/9/2015,0.27000,3,1.00,1480,7374,1.0,0.0,0.0,3,6,760,720.0,1954,0.0,98133,47.7775,-122.336,1480,8934
15752,2644300005,4/12/2015,0.40750,4,2.50,1900,9075,2.0,0.0,0.0,3,7,1900,0.0,1988,0.0,98133,47.7776,-122.352,1800,8460
17450,5500200010,10/14/2014,0.38995,3,1.75,1580,9049,1.0,0.0,0.0,3,8,1580,0.0,1966,0.0,98177,47.7776,-122.375,2100,8446
