In [7]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import src.utils as utils

In [2]:
# NN structure
def feedforward_NN(x_train, optimizer, n_nodes, n_layers):
    # initialize model
    model = Sequential()
    
    # add first layer
    model.add(Dense(n_nodes, input_dim=x_train.shape[1], activation='relu'))
    
    # add subsequent layers
    for layer in range(n_layers-1):
        model.add(Dense(n_nodes, activation='relu'))
    
    # add final layer
    model.add(Dense(1, activation='linear')) 
    
    # compile the model using optimizer
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=['mse'])
    return model

# Condos

In [3]:
data_filename = 'data/features/CON_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [None]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

### Train models

In [None]:
train_features = X_train_dict['img']
val_features = X_val_dict['img']

In [None]:
# Ridge
filename = 'models/soldprice/image_models/condo_price_img_ridge.pkl'

# print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

In [None]:
# light gbm
filename = 'models/soldprice/image_models/condo_price_img_LGBM.pkl'

params = { 
          'reg_lambda':[10, 100, 1000]
        
         }

print("Light GBM model: ")
t0 = time.time()

# model
model = model = LGBMRegressor(random_state=9001, n_estimators=512)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

In [None]:
# Feedforward NN
N_NODES = 120
N_LAYERS = 20

OPTIMIZER = Adam(lr=0.001)   
EPOCHS = 100
BATCH_SIZE = 200

filename = 'models/soldprice/image_models/condo_price_img_nn.h5'

# # train and save model
# model = KerasRegressor(build_fn=feedforward_NN, x_train=train_features, optimizer=OPTIMIZER, n_nodes=N_NODES, n_layers=N_LAYERS, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

### Test-Train Split, Response = 'DOM'

In [4]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [5]:
train_features = X_train_dict['img']
val_features = X_val_dict['img']

In [6]:
# Ridge
filename = 'models/dom/image_models/condo_dom_img_ridge.pkl'

print("Ridge model: ")
t0 = time.time()

# train and save model
model = RidgeCV(alphas=(4.5, 5, 5.5, 10, 20, 25))
utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
training time:  29.37293791770935
best alpha:  20.0
----- Training scores -----
R2 on log scale:  0.14320155577778837
MAE on log scale:  0.1919140371289637
MAE on original $ scale:  0.6846854531590018
----- Validation scores -----
R2 on log scale:  0.0971237362055688
MAE on log scale:  0.20030665613906531
MAE on original $ scale:  0.7139421106196474


In [9]:
# lasso
filename = 'models/dom/image_models/condo_dom_img_lasso.pkl'

print("Lasso model: ")
t0 = time.time()

# train and save model
model = LassoCV()
utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Lasso model: 
training time:  275.7376449108124
LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)
----- Training scores -----
R2 on log scale:  0.12720233802827308
MAE on log scale:  0.19353605273144017
MAE on original $ scale:  0.690344213061593
----- Validation scores -----
R2 on log scale:  0.09106463196950354
MAE on log scale:  0.20078574946578254
MAE on original $ scale:  0.7153442592520158


In [10]:
# light gbm
filename = 'models/dom/image_models/condo_dom_img_LGBM.pkl'

params = {
          'reg_lambda':[1000, 1e4, 1e5]
         }

print("Light GBM model: ")
# t0 = time.time()

# # model
# model = model = LGBMRegressor(random_state=9001, n_estimators=512)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=31, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=1000, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.5631033943357304
MAE on log scale:  0.13450057205378732
MAE on original $ scale:  0.48159449139378085
----- Validation scores -----
R2 on log scale:  0.12991383787019128
MAE on log scale:  0.1965076918007279
MAE on original $ scale:  0.7005690933015403


In [14]:
# Feedforward NN
N_NODES = 145
N_LAYERS = 20

OPTIMIZER = Adam(lr=0.001)   
EPOCHS = 100
BATCH_SIZE = 300

filename = 'models/dom/image_models/condo_dom_img_nn.h5'

# # train and save model
# model = KerasRegressor(build_fn=feedforward_NN, x_train=train_features, optimizer=OPTIMIZER, n_nodes=N_NODES, n_layers=N_LAYERS, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Multi Families

In [15]:
data_filename = 'data/features/MF_feats_remarks.pkl'

### Test-Train Split, Response = 'SOLDPRICE'

In [16]:
response_col = 'SOLDPRICE'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [17]:
train_features = X_train_dict['img']
val_features = X_val_dict['img']

In [18]:
# Ridge
filename = 'models/soldprice/image_models/mf_price_img_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(4.5, 5, 5.5, 10, 20, 25))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  20.0
----- Training scores -----
R2 on log scale:  0.41401523541386875
MAE on log scale:  0.414405297184377
MAE on original $ scale:  194441.8784148266
----- Validation scores -----
R2 on log scale:  0.38669631875483423
MAE on log scale:  0.42945810322678607
MAE on original $ scale:  203165.10866322572


In [19]:
# light gbm
filename = 'models/soldprice/image_models/mf_price_img_LGBM.pkl'

params = {
        'learning_rate':[0.01, 0.1, 1]
         }

print("Light GBM model: ")
# t0 = time.time()
# 
# # model
# model = model = LGBMRegressor(random_state=9001, n_estimators=512, reg_lambda=100)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# # train and save model
# utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=31, objective=None, random_state=9001,
       reg_alpha=0.0, reg_lambda=100, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.8503615778854379
MAE on log scale:  0.14116345178186565
MAE on original $ scale:  62808.18661965218
----- Validation scores -----
R2 on log scale:  0.39949081145584464
MAE on log scale:  0.4267294369743406
MAE on original $ scale:  198912.14925032566


In [20]:
# Feedforward NN
N_NODES = 145
N_LAYERS = 20

OPTIMIZER = Adam(lr=0.001)   
EPOCHS = 100
BATCH_SIZE = 200

filename = 'models/soldprice/image_models/mf_price_img_nn.h5'

# # train and save model
# model = KerasRegressor(build_fn=feedforward_NN, x_train=train_features, optimizer=OPTIMIZER, n_nodes=N_NODES, n_layers=N_LAYERS, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

shape of train:  (11965,)
shape of val:  (1330,)
----- Training scores -----
R2 on log scale:  0.39075461943543166
MAE on log scale:  0.4237634170406626
MAE on original $ scale:  197713.70467368368
----- Validation scores -----
R2 on log scale:  0.3951855944311339
MAE on log scale:  0.4281450322056187
MAE on original $ scale:  202488.7017269737


### Test-Train Split, Response = 'DOM'

In [21]:
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data 
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(df=df, response_col=response_col, test_size=test_size, random_state=random_state)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [22]:
train_features = X_train_dict['img']
val_features = X_val_dict['img']

In [23]:
# Ridge
filename = 'models/dom/image_models/mf_dom_img_ridge.pkl'

print("Ridge model: ")
# t0 = time.time()

# # train and save model
# model = RidgeCV(alphas=(5, 5.5, 10, 20, 25, 30, 50, 100, 150))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Ridge model: 
best alpha:  50.0
----- Training scores -----
R2 on log scale:  0.09843796546838024
MAE on log scale:  0.1986521625546102
MAE on original $ scale:  0.7291329521563765
----- Validation scores -----
R2 on log scale:  0.05074708509346437
MAE on log scale:  0.20419429927232965
MAE on original $ scale:  0.7500393569246421


In [26]:
# light gbm
filename = 'models/dom/image_models/mf_dom_img_LGBM.pkl'

params = {
          'reg_alpha':[10, 1e2, 1e3],
        'reg_lambda':[10, 1e2, 1e3]
         }

print("Light GBM model: ")
# t0 = time.time()

# model
model = model = LGBMRegressor(random_state=9001, n_estimators=512)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

Light GBM model: 
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  5.6min finished


training time:  2750.7265799045563
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=512,
       n_jobs=-1, num_leaves=31, objective=None, random_state=9001,
       reg_alpha=10, reg_lambda=1000.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
----- Training scores -----
R2 on log scale:  0.5670559080172752
MAE on log scale:  0.13137585572099392
MAE on original $ scale:  0.4855709954208372
----- Validation scores -----
R2 on log scale:  0.04323318992655367
MAE on log scale:  0.2054394034240479
MAE on original $ scale:  0.7540275479524075


In [25]:
# Feedforward NN
N_NODES = 145
N_LAYERS = 20

OPTIMIZER = Adam(lr=0.001)   
EPOCHS = 100
BATCH_SIZE = 200

filename = 'models/dom/image_models/mf_dom_img_nn.h5'

# # train and save model
# model = KerasRegressor(build_fn=feedforward_NN, x_train=train_features, optimizer=OPTIMIZER, n_nodes=N_NODES, n_layers=N_LAYERS, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)

# load savced model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

shape of train:  (11965,)
shape of val:  (1330,)
----- Training scores -----
R2 on log scale:  0.06732925036810178
MAE on log scale:  0.202069160767427
MAE on original $ scale:  0.741487546101561
----- Validation scores -----
R2 on log scale:  0.040814402828049046
MAE on log scale:  0.204944852432692
MAE on original $ scale:  0.7527548644458774
