In [43]:
#import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression, mutual_info_regression, RFECV
from sklearn.linear_model import Lasso
import censusgeocode as cg
import pickle

# Widen pandas' display limits so wide/long frames are shown in full:
# both the column cap and the row cap are raised to 100.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, 100)

## Step 1: Read in holdout data, scaler, and best model

In [44]:
# Load the holdout feature set; the first CSV column is used as the row index.
holdout_path = '../provided_data/kc_house_data_test_features.csv'
holdout = pd.read_csv(holdout_path, index_col=0)

In [45]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that were produced by a trusted source.

# load scaler -- the context manager closes the file even if unpickling fails
with open('../pickle_jar/scaler.pkl', 'rb') as file:
    final_scaler = pickle.load(file)

# load model -- same pattern as the scaler above
with open('../pickle_jar/model.pkl', 'rb') as file:
    final_model = pickle.load(file)

## Step 2: Feature Engineering for holdout set

Transform the holdout data and generate the same features as we did in the modeling notebook.

In [46]:
# Build the engineered features on the holdout frame. New columns are appended
# in the order they are assigned below.
# NOTE(review): the column order presumably has to match what the pickled
# scaler/model were fitted on -- confirm against the modeling notebook.

# datetime conversion: strings are parsed as YYYYMMDD followed by a literal "T000000"
holdout['date'] = pd.to_datetime(holdout['date'], format="%Y%m%dT000000")
# fix incorrect data: bedroom counts of 11 and 33 are recoded to 4 and 3
# (presumably data-entry errors found during modeling -- verify)
holdout['bedrooms'] = holdout['bedrooms'].replace({11: 4, 33: 3})
# house to property ratio feature: 1 when living area is under 20% of the lot, else 0
holdout['shack_living'] = np.where(holdout['sqft_living'] / holdout['sqft_lot'] < .2, 1, 0)
# how old it is, measured from the fixed reference year 2020
holdout['yrs_old'] = 2020 - holdout.yr_built
# age feature creation: squared distance of the age from 57 years
holdout['age_feature'] = (holdout['yrs_old'] - 57) ** 2
# exponential grade feature (computed once; its column position precedes has_basement)
holdout['grade_exp'] = np.exp(holdout['grade'])
# basement feature: 0 only when sqft_basement equals 0; any other value (including NaN) gives 1
holdout['has_basement'] = np.where(holdout['sqft_basement'] == 0, 0, 1)

In [47]:
# season feature: label each sale by its calendar month.
# Months in neither list fall through to the 'normal' default.
sale_month = holdout.date.dt.month
slow_months = [1, 2, 9, 11, 12]
busy_months = [4, 5, 6]

conditions = [sale_month.isin(slow_months), sale_month.isin(busy_months)]
choices = ['slow', 'busy']

holdout['season'] = np.select(conditions, choices, default='normal')

In [48]:
# condition feature: collapse the numeric condition rating into three labels
# (<= 2 -> 'busted', 3 or 4 -> 'aight', 5 -> 'dope'); anything else is 'missing'.
cond_rating = holdout['condition']
conditions = [
    cond_rating.le(2),
    cond_rating.isin([3, 4]),
    cond_rating.eq(5),
]
choices = ['busted', 'aight', 'dope']

holdout['cond_cat'] = np.select(conditions, choices, default='missing')

In [49]:
# create dummy variables and concatenate them to the holdout dataframe
# NOTE(review): drop_first=True drops whichever category sorts first in THIS
# frame; if the holdout set is missing a category that the training data had,
# the resulting columns will not line up with the fitted scaler/model -- verify.
season_dum = pd.get_dummies(holdout['season'], prefix='sn', drop_first=True)
zipcode_dum = pd.get_dummies(holdout['zipcode'], prefix='zp', drop_first=True)
cond_dum = pd.get_dummies(holdout['cond_cat'], prefix='condtn', drop_first=True)
# axis must be passed by keyword: positional use is deprecated in pandas 1.3+
# and raises TypeError in pandas 2.0
holdout = pd.concat([holdout, cond_dum, zipcode_dum, season_dum], axis=1)

In [50]:
# Select the model's input columns: every holdout column except the ones listed
# in leave_out, kept in their existing column order.
leave_out = [
    'id', 'date', 'season', 'price', 'cond_cat',
    'sqft_lot', 'floors', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'yrs_old', 'lat', 'long', 'sqft_living15',
]

keep_mask = ~holdout.columns.isin(leave_out)
features = holdout.columns[keep_mask].tolist()

In [51]:
# create polynomial features: all degree-2 terms (squares and pairwise products)
# of the selected columns, without the constant bias column
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(holdout[features])
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out and fall back for older installations
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = poly_2.get_feature_names_out(holdout[features].columns)
else:
    poly2_columns = poly_2.get_feature_names(holdout[features].columns)
holdout_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)

## Scale holdout data with the imported scaler

In [52]:
# Apply the scaler loaded from the pickle file to the polynomial feature frame.
# Only transform() is called here -- the scaler is not refit on the holdout data.
# NOTE(review): presumably the columns of holdout_poly2 must match, in number and
# order, the columns the scaler was fitted on -- confirm against the modeling notebook.
transformed_holdout = final_scaler.transform(holdout_poly2)

## Create predictions using the model on the holdout dataset

In [53]:
# Generate predictions from the unpickled model using the scaled holdout features.
final_answers = final_model.predict(transformed_holdout)

## Export predictions

In [54]:
# Wrap the predictions in a DataFrame and write them to CSV in the working
# directory (default index and header are written as-is).
predictions_df = pd.DataFrame(final_answers)
predictions_df.to_csv('housing_preds.csv')