In [1]:
import pandas as pd
import numpy as np
import pickle
# Let wide frames render up to 300 columns before pandas truncates the display.
pd.options.display.max_columns = 300

### Read in the holdout data and the best model

In [2]:
# Load the holdout feature set; the first CSV column is used as the row index.
holdout_path = 'data/kc_house_data_test_features.csv'
df = pd.read_csv(holdout_path, index_col=0)

# Quick sanity check: dimensions first, then a preview of the leading rows.
print(df.shape)
df.head()

(4322, 20)


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [3]:
# Load the previously saved model object from disk.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load pickle files that come from a trusted source.
# The context manager guarantees the file handle is closed even if unpickling raises.
with open("model.pickle",'rb') as infile:
    model = pickle.load(infile)

In [4]:
# Load the auxiliary object saved alongside the model; later it is used to
# select columns from the engineered frame before predicting.
# NOTE(review): presumably the list of feature column names the model was fit on — confirm.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load pickle files that come from a trusted source.
# The context manager guarantees the file handle is closed even if unpickling raises.
with open("other_info.pickle",'rb') as infile:
    other_info = pickle.load(infile)

### Feature Engineering for Holdout set

In [5]:
# Replace the 33-bedroom outlier (presumably a data-entry typo) with 3.
# All other rows must keep their own bedroom count: the fallback used to be
# df['bathrooms'], which silently overwrote every bedroom value with the
# bathroom count and corrupted the bedroom/bathroom ratio computed afterwards.
# NOTE(review): confirm the training notebook applies this same correction so the
# engineered features match what the model was fit on.
df['bedrooms'] = np.where(df['bedrooms'] == 33, 3, df['bedrooms'])

In [6]:
# Lot area left over after subtracting the living area.
df['yard_sqft'] = df['sqft_lot'].sub(df['sqft_living'])

In [7]:
# Rows with 0 bathrooms are bumped to 0.5 so the division below never divides by zero.
no_bathroom = df['bathrooms'] == 0
df['bathrooms'] = np.where(no_bathroom, 0.5, df['bathrooms'])

# Bedrooms per bathroom (a ratio, despite the "avg" in the column name).
df['bathroom_avg'] = df['bedrooms'].div(df['bathrooms'])

# Binary flag: 1 where the ratio is strictly positive, 0 otherwise (NaN maps to 0).
df['bed_bath_avg'] = df['bathroom_avg'].gt(0).astype('int64')

In [8]:
# Difference between this home's living area and the sqft_living15 column.
df['sqft_compared'] = df['sqft_living'].sub(df['sqft_living15'])

# Binary flag: 1 where the home's living area exceeds sqft_living15, 0 otherwise.
df['sqft_living_larger'] = df['sqft_compared'].gt(0).astype('int64')

In [9]:
# Binary flag: 1 where yr_renovated is greater than 0, otherwise 0.
df['was_renovated'] = df['yr_renovated'].gt(0).astype('int64')

In [10]:
# Parse the raw date values into pandas datetimes so the .dt accessor can be used on them.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [11]:
# Calendar month (1-12) taken from the parsed date column.
date_column = df['date']
df['month'] = date_column.dt.month

In [12]:
# Work on an independent (deep) copy so the engineered frame `df` stays untouched.
df_dummies = df.copy(deep=True)

In [13]:
# Remove identifier, raw date/location columns and the intermediate helper columns
# before one-hot encoding.
columns_to_drop = ['id','date','lat','long','bedrooms','bathrooms','bathroom_avg','sqft_compared']
df_dummies = df_dummies.drop(columns=columns_to_drop)

In [14]:
# One-hot encode each categorical column in turn, dropping the first level of each.
# The default prefix is the column name, so every dummy is named "<column>_<value>".
for categorical in ('zipcode', 'floors', 'view', 'month'):
    df_dummies = pd.get_dummies(df_dummies, columns=[categorical], drop_first=True)

### Predict holdout set

In [15]:
# Select the columns named by `other_info` and run the loaded model on them.
# NOTE(review): this raises a KeyError if the holdout frame lacks any of those columns — confirm coverage.
holdout_features = df_dummies[other_info]
final_answers = model.predict(holdout_features)

In [16]:
# Wrap the raw predictions in a DataFrame so they can be written out as CSV.
final_answer = pd.DataFrame(data=final_answers)

### Export holdout set

In [17]:
# Write the predictions (with the default index and header) to the submission file.
final_answer.to_csv(path_or_buf='housing_preds_jacob_ash.csv')