In [305]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso
import censusgeocode as cg
import sklearn
import pickle

# Plot theme used by seaborn for the rest of the notebook.
sns.set(style="darkgrid")

# Let pandas display up to 100 columns and 100 rows before truncating.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

## Step 1: Read in the holdout data, scaler, and best model

In [306]:
# Holdout feature file; its first CSV column becomes the DataFrame index.
holdout_csv_path = 'provided_data/kc_house_data_test_features.csv'
holdout = pd.read_csv(holdout_csv_path, index_col=0)

In [295]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising,
# so only load pickle files that come from a source you trust.

# Load the scaler. The `with` block guarantees the file handle is closed even
# if unpickling raises.
with open('pickle_jar/scaler.pkl', 'rb') as file:
    final_scaler = pickle.load(file)

# Load the model the same way.
with open('pickle_jar/model.pkl', 'rb') as file:
    final_model = pickle.load(file)

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [296]:
# Derived features for the holdout set. Columns are added in a fixed order
# because the feature list (and the polynomial expansion built from it) is
# later taken from holdout.columns.

# 1 when living area is less than 20% of the lot area, else 0.
living_to_lot_ratio = holdout['sqft_living'] / holdout['sqft_lot']
holdout['shack_living'] = np.where(living_to_lot_ratio < .2, 1, 0)

# Recode bedroom counts of 11 and 33 to 4 and 3 respectively.
holdout['bedrooms'] = holdout['bedrooms'].replace({11: 4, 33: 3})

# Age measured against the year 2020, plus a squared term centred on 57.
# NOTE(review): 2020 and 57 are hard-coded; presumably they match the values
# used when the model was trained -- confirm.
holdout['yrs_old'] = 2020 - holdout['yr_built']
holdout['age_feature'] = (holdout['yrs_old'] - 57) ** 2

# Exponentiated grade (computed once; the original cell repeated this line).
holdout['grade_exp'] = np.exp(holdout['grade'])

# Parse the sale date, then map its month to a season code:
# Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
holdout['date'] = pd.to_datetime(holdout['date'], format="%Y%m%dT000000")
sale_month = holdout['date'].dt.month
holdout['season'] = (sale_month % 12 + 3) // 3

# 0 when sqft_basement equals 0, otherwise 1.
holdout['has_basement'] = np.where(holdout['sqft_basement'] == 0, 0, 1)

In [297]:
# NOTE: everything in this cell is commented out, so `dist_amzn` and
# `dist_msft` are NOT added to the holdout frame. The disabled code computes a
# haversine distance (r = 6371, i.e. kilometres) from each row's lat/long to
# two fixed coordinate pairs.
# NOTE(review): remove this cell if the distance features are not needed.

# def distance_to(lat1, lon1, lat2, lon2):
#    r = 6371
#    phi1 = np.radians(lat1)
#    phi2 = np.radians(lat2)
#    delta_phi = np.radians(lat2 - lat1)
#    delta_lambda = np.radians(lon2 - lon1)
#    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
#    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
#    return np.round(res, 2)

# amzn_hdqt = (47.623583541669845, -122.33669143795257)
# msft_hdqt = 47.641489, -122.134503

# dist_msft, dist_amzn = [], []
# for row in holdout.itertuples(index=False):
#     dist_amzn.append(distance_to(row.lat, row.long, amzn_hdqt[0], amzn_hdqt[1]))
#     dist_msft.append(distance_to(row.lat, row.long, msft_hdqt[0], msft_hdqt[1]))
# holdout['dist_amzn'] = dist_amzn
# holdout['dist_msft'] = dist_msft

# holdout.head()

In [298]:
# Bucket the numeric condition score into three labels:
#   <= 2 -> 'busted', 3 or 4 -> 'aight', 5 -> 'dope'.
# Anything matching none of the rules is labelled 'missing'.
condition_score = holdout['condition']
conditions = [
    condition_score <= 2,
    condition_score.isin(range(3, 5)),
    condition_score == 5,
]
choices = ['busted', 'aight', 'dope']

holdout['cond_cat'] = np.select(conditions, choices, default='missing')

In [299]:
# One-hot encode season, zipcode and condition category. drop_first=True omits
# the first level of each variable.
# NOTE(review): the dummy columns are derived only from the categories present
# in the holdout data; confirm they line up with the columns the scaler and
# model were fitted on.
season_dum = pd.get_dummies(holdout['season'], prefix='sn', drop_first=True)
zipcode_dum = pd.get_dummies(holdout['zipcode'], prefix='zp', drop_first=True)
cond_dum = pd.get_dummies(holdout['cond_cat'], prefix='condtn', drop_first=True)

# `axis` must be passed by keyword: positional use was deprecated in pandas 1.3
# and raises a TypeError from pandas 2.0 onwards.
holdout = pd.concat([holdout, cond_dum, zipcode_dum, season_dum], axis=1)

In [300]:
# Columns excluded from the model inputs. Names that do not exist in the
# holdout frame are simply never matched.
leave_out = [
    'id', 'date', 'season', 'price', 'cond_cat', 'sqft_lot',
    'floors', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'last_const',
]

# Every remaining column, in the frame's existing column order.
features = [column for column in holdout.columns if column not in leave_out]

In [301]:
# Expand the selected features with all degree-2 terms (squares and pairwise
# products); include_bias=False leaves out the constant column.
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(holdout[features])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available, and fall back to the old method on
# older installs.
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = poly_2.get_feature_names_out(holdout[features].columns)
else:
    poly2_columns = poly_2.get_feature_names(holdout[features].columns)

holdout_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)

In [302]:
# Display (n_rows, n_columns) of the expanded holdout frame.
# NOTE(review): the column count presumably has to equal the number of
# features the loaded scaler was fitted on -- confirm before transforming.
holdout_poly2.shape

(4323, 3654)

In [303]:
# Scale the expanded holdout features with the scaler unpickled above.
# Only transform() is called here; the scaler is not re-fitted on the holdout data.
transformed_holdout = final_scaler.transform(holdout_poly2)

## Step 3: Predict the holdout set

In [304]:
# Predict on the scaled holdout features with the unpickled model.
final_answers = final_model.predict(transformed_holdout)
# final_model

## Step 4: Export your predictions

In [292]:
# Wrap the predictions in a DataFrame and write them, with the default index
# and header, to a CSV file.
predictions_frame = pd.DataFrame(final_answers)
predictions_frame.to_csv('housing_preds_joe_marx.csv')