In [1]:
import time
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

import src.utils as utils

Using TensorFlow backend.


# Condos

In [2]:
# Feature file for the condo section; read by utils.read_preprocess_df in the cells below.
data_filename = "data/features/CON_feats_remarks.pkl"

### Test-Train Split, Response = 'SOLDPRICE'

In [3]:
# Target column and hold-out configuration for this section.
response_col, test_size, random_state = 'SOLDPRICE', 0.1, 9001

# Read the feature file through the project helper, keyed on the chosen response column.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train / validation split. The X_* results are indexed by feature source
# (e.g. 'zillow') in the cells that follow.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [4]:
# Pick the 'zillow' feature group from the current split for all models in this section.
train_features, val_features = X_train_dict['zillow'], X_val_dict['zillow']

In [6]:
# Ridge regression (RidgeCV)
filename = "models/soldprice/data_models/condo_price_zillow_ridge.pkl"

print("Ridge model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

Ridge model: 
best alpha:  3.5
----- Training scores -----
R2 on log scale:  0.32590789177462154
MAE on log scale:  0.4587657640333033
MAE on original $ scale:  218948.52798658473
----- Validation scores -----
R2 on log scale:  0.30939790240028997
MAE on log scale:  0.4586051205574458
MAE on original $ scale:  240399.85392549518


In [None]:
# XGBoost
filename = "models/soldprice/data_models/condo_price_zillow_XGBoost.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "max_depth": range(21, 28, 2),
    "gamma": [0.0, 0.1, 0.2],
    "reg_alpha": [0.01, 0.1, 1, 10],
}

print("XGBoost model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

In [None]:
# Light GBM
filename = "models/soldprice/data_models/condo_price_zillow_LGBM.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "num_leaves": [30, 100, 200],
    "max_depth": [-1, 16, 32, 64],
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [128, 256, 512],
}

print("Light GBM model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

### Test-Train Split, Response = 'DOM'

In [7]:
# Target column and hold-out configuration for this section.
response_col, test_size, random_state = 'DOM', 0.1, 9001

# Read the feature file through the project helper, keyed on the chosen response column.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train / validation split. The X_* results are indexed by feature source
# (e.g. 'zillow') in the cells that follow.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  36842
Number of validation samples:  4094


### Train models

In [8]:
# Pick the 'zillow' feature group from the current split for all models in this section.
train_features, val_features = X_train_dict['zillow'], X_val_dict['zillow']

In [10]:
# Ridge regression (RidgeCV)
filename = "models/dom/data_models/condo_dom_zillow_ridge.pkl"

print("Ridge model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recorded output of this cell reports "best alpha:  1.5", which is
# not in the alpha grid above -- the saved model was presumably fit with a different
# grid; confirm before re-training.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

Ridge model: 
best alpha:  1.5
----- Training scores -----
R2 on log scale:  0.03132535857505869
MAE on log scale:  0.20420263588251353
MAE on original $ scale:  0.7271013647650538
----- Validation scores -----
R2 on log scale:  0.02520524971485172
MAE on log scale:  0.20932643183401634
MAE on original $ scale:  0.7441632171191556


In [None]:
# XGBoost
filename = "models/dom/data_models/condo_dom_zillow_XGBoost.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "max_depth": range(21, 28, 2),
    "gamma": [0.0, 0.1, 0.2],
    "reg_alpha": [0.01, 0.1, 1, 10],
}

print("XGBoost model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

In [None]:
# Light GBM
filename = "models/dom/data_models/condo_dom_zillow_LGBM.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "num_leaves": [30, 100, 200],
    "max_depth": [-1, 16, 32, 64],
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [128, 256, 512],
}

print("Light GBM model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

# Multi Families

In [2]:
# Feature file for the multi-family section; read by utils.read_preprocess_df in the cells below.
data_filename = "data/features/MF_feats_remarks.pkl"

### Test-Train Split, Response = 'SOLDPRICE'

In [3]:
# Target column and hold-out configuration for this section.
response_col, test_size, random_state = 'SOLDPRICE', 0.1, 9001

# Read the feature file through the project helper, keyed on the chosen response column.
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# Train / validation split. The X_* results are indexed by feature source
# (e.g. 'zillow') in the cells that follow.
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [4]:
# Pick the 'zillow' feature group from the current split for all models in this section.
train_features, val_features = X_train_dict['zillow'], X_val_dict['zillow']

In [None]:
# Ridge regression (RidgeCV)
filename = "models/soldprice/data_models/mf_price_zillow_ridge.pkl"

print("Ridge model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

In [None]:
# XGBoost
filename = "models/soldprice/data_models/mf_price_zillow_XGBoost.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "max_depth": range(21, 28, 2),
    "gamma": [0.0, 0.1, 0.2],
    "reg_alpha": [0.01, 0.1, 1, 10],
}

print("XGBoost model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = XGBRegressor(random_seed=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

In [None]:
# Light GBM
filename = "models/soldprice/data_models/mf_price_zillow_LGBM.pkl"

# Hyper-parameter grid referenced by the commented-out GridSearchCV recipe below.
params = {
    "num_leaves": [30, 100, 200],
    "max_depth": [-1, 16, 32, 64],
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [128, 256, 512],
}

print("Light GBM model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = model = LGBMRegressor(random_state=9001)
# grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recipe above hands `model`, not `grid`, to train_save_model,
# so `params` would not be searched if it were re-enabled unchanged.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

### Test-Train Split, Response = 'DOM'

In [5]:
# Target column and hold-out configuration for the multi-family DOM models.
response_col = 'DOM'
test_size = 0.1
random_state = 9001

# read in data
df = utils.read_preprocess_df(data_filename, response_col=response_col)

# test train split
X_train_dict, X_val_dict, y_train, y_val = utils.split_normalize_df(
    df=df,
    response_col=response_col,
    test_size=test_size,
    random_state=random_state,
)

# Refresh the feature matrices from THIS split. Every other section does this in a
# dedicated cell right after splitting; this section had none, so the DOM models
# below were silently fed `train_features` / `val_features` left over from the
# SOLDPRICE split while `y_train` / `y_val` came from the DOM split.
train_features = X_train_dict['zillow']
val_features = X_val_dict['zillow']

Number of features from zillow:  10
Number of features from redfin:  20
Number of features from images:  2048
Number of features from remarks:  20
Number of training samples:  11965
Number of validation samples:  1330


### Train models

In [6]:
# Ridge regression (RidgeCV)
filename = "models/dom/data_models/mf_dom_zillow_ridge.pkl"

print("Ridge model: ")

# Training recipe, kept commented out; as written this cell only evaluates
# the model already saved at `filename`.
# t0 = time.time()
# model = RidgeCV(alphas=(3.5, 4, 4.5, 5, 5.5))
# utils.train_save_model(model, X=train_features, y=y_train, filename=filename)
# print("training time: ", time.time()-t0)
# NOTE(review): the recorded output of this cell reports "best alpha:  1.5", which is
# not in the alpha grid above -- the saved model was presumably fit with a different
# grid; confirm before re-training.

# Load the saved model and report its performance on the train and validation sets.
utils.load_eval_model(
    filename=filename,
    X_train=train_features,
    y_train=y_train,
    X_val=val_features,
    y_val=y_val,
)

Ridge model: 
best alpha:  1.5
----- Training scores -----
R2 on log log scale:  0.018471819071555595
MAE on log log scale:  0.20791159116947272
MAE on original scale:  0.7631119067575725
----- Validation scores -----
R2 on log log scale:  0.02921156058375185
MAE on log log scale:  0.20720777894194073
MAE on original scale:  0.7611628113193962


In [None]:
# XGBoost
filename = 'models/dom/data_models/mf_dom_zillow_XGBoost.pkl'

# Hyper-parameter grid explored by the GridSearchCV built below.
params = {
    'max_depth': range(21, 28, 2),
    'gamma': [i/10.0 for i in range(0, 3)],
    'reg_alpha': [1e-2, 0.1, 1, 10]
}

print("XGBoost model: ")
t0 = time.time()

# model
# `random_state` is the seed argument of XGBoost's scikit-learn wrapper; the
# previous `random_seed=` keyword is not one of its declared parameters, so the
# seed was not being applied as intended.
model = XGBRegressor(random_state=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# Hand the grid search (not the bare estimator) to the trainer; previously `model`
# was passed, so `params` was never searched and `grid` was dead code.
# NOTE(review): assumes utils.train_save_model only needs an object with .fit()
# that can be serialized -- confirm against src/utils.
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load saved model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)

In [None]:
# Light GBM
filename = 'models/dom/data_models/mf_dom_zillow_LGBM.pkl'

# Hyper-parameter grid explored by the GridSearchCV built below.
params = {'num_leaves': [30, 100, 200],
          'max_depth': [-1, 16, 32, 64],
          'learning_rate': [0.01, 0.1, 1],
          'n_estimators': [128, 256, 512]
         }

print("Light GBM model: ")
t0 = time.time()

# model (the redundant `model = model = ...` double assignment was dropped)
model = LGBMRegressor(random_state=9001)
grid = GridSearchCV(model, params, verbose=1, n_jobs=-1)

# train and save model
# Hand the grid search (not the bare estimator) to the trainer; previously `model`
# was passed, so `params` was never searched and `grid` was dead code.
# NOTE(review): assumes utils.train_save_model only needs an object with .fit()
# that can be serialized -- confirm against src/utils.
utils.train_save_model(grid, X=train_features, y=y_train, filename=filename)
print("training time: ", time.time()-t0)

# load saved model and evaluate model performance
utils.load_eval_model(filename=filename, X_train=train_features, X_val=val_features, y_train=y_train, y_val=y_val)