In [1]:
import csv

import pandas as pd
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, ridge_regression, BayesianRidge, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
# Load the training data and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV -- verify against the file).
# Chaining `.drop` keeps the cell free of in-place mutation.
df = pd.read_csv('kc_house_data_train.csv').drop(columns=['Unnamed: 0'])

# Reduce the sale date to its month number. The vectorized `.dt` accessor
# replaces the per-row Python lambda and yields the same month values.
df['date'] = pd.to_datetime(df['date']).dt.month

In [18]:
# Preview the first rows of the training frame as a sanity check on the load.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2591820310,10,365000.0,4,2.25,2070,8893,2.0,0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,7974200820,8,865000.0,5,3.0,2900,6730,1.0,0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,7701450110,8,1038000.0,4,2.5,3770,10893,2.0,0,2,...,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,9522300010,3,1490000.0,3,3.5,4560,14608,2.0,0,2,...,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,9510861140,7,711000.0,3,2.5,2550,5376,2.0,0,0,...,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050


In [19]:
# Predictors: every column except the target, the row id and the `date` column.
X = df.drop(columns=['price', 'id', 'date'])
# Target: the sale price.
y = df['price']

# Hold out part of the data for evaluation; the seed fixes which rows land where.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [23]:
# Base learners for the stack: three tree ensembles and three linear models.
# The tree ensembles are randomized, so each one gets a fixed `random_state`
# (the same seed used for the train/test split) to make the scores repeatable.
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('lr', LinearRegression()),
    ('br', BayesianRidge()),
    ('r', Ridge()),
    ('et', ExtraTreesRegressor(random_state=42)),
]

# The final estimator combines the base learners' cross-validated (cv=5)
# predictions; it is seeded for the same reason as the base learners.
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42),
    cv=5,
)

In [24]:
# Fit the stacked ensemble on the training split; the trailing `;` suppresses the estimator repr.
stack.fit(X_train, y_train);

In [25]:
# Score every fitted base learner on the hold-out split, then add the stacked
# model itself under the 'stack' key so all results sit in one dict.
# NOTE: `score_all_estimators` and `score_model` are defined in a cell further
# down this notebook; that cell has to be run before this one.
dct = {
    **score_all_estimators(stack, X_test, y_test),
    'stack': score_model(stack, X_test, y_test),
}
dct

{'RandomForestRegressor': {'r^2': 0.8790164042744264,
  'rmse': 131667.8945280816},
 'GradientBoostingRegressor': {'r^2': 0.8769816804705264,
  'rmse': 132770.48509459925},
 'LinearRegression': {'r^2': 0.6912723263768771, 'rmse': 210331.5523410592},
 'BayesianRidge': {'r^2': 0.6911267778176904, 'rmse': 210381.12652845687},
 'Ridge': {'r^2': 0.6911278795677309, 'rmse': 210380.75131363617},
 'ExtraTreesRegressor': {'r^2': 0.8687669075495335,
  'rmse': 137131.84792653547},
 'stack': {'r^2': 0.8897764482178844, 'rmse': 125676.43350790215}}

In [9]:
def score_all_estimators(stack, X, y):
    """Score each fitted base estimator of a stacking model on (X, y).

    Parameters
    ----------
    stack : fitted stacking model
        Must expose ``estimators_``, the iterable of fitted base estimators.
    X, y :
        Features and targets, passed unchanged to ``score_model``.

    Returns
    -------
    dict
        Maps each estimator's class name to the dict returned by
        ``score_model``. If a class occurs more than once, later occurrences
        are stored as ``'<ClassName>_2'``, ``'<ClassName>_3'``, ... instead of
        overwriting the first entry.
    """
    scores = {}
    for estimator in stack.estimators_:
        # Read the class name from the type itself rather than slicing the
        # repr at '(', which raises ValueError for any repr without one.
        base_name = type(estimator).__name__

        # Keep results for repeated classes instead of silently replacing them.
        key = base_name
        occurrence = 1
        while key in scores:
            occurrence += 1
            key = '{}_{}'.format(base_name, occurrence)

        scores[key] = score_model(estimator, X, y)
    return scores

def score_model(model, X, y):
    """Return the model's score and RMSE on (X, y).

    The result is a dict with two entries: 'r^2' holds ``model.score(X, y)``
    and 'rmse' holds the square root of the mean squared error between ``y``
    and ``model.predict(X)``.
    """
    r_squared = model.score(X, y)
    mse = mean_squared_error(y, model.predict(X))
    return {'r^2': r_squared, 'rmse': mse ** .5}

In [26]:
# Load the test-feature file and discard its 'Unnamed: 0' column,
# mirroring how the training file is loaded.
df_test = pd.read_csv('kc_house_data_test_features.csv').drop(columns=['Unnamed: 0'])

In [28]:
# Rebuild the training matrices from the full training frame (no hold-out here).
X = df.drop(columns=['price', 'id', 'date'])
y = df['price']

# NOTE: this rebinds `X_test`, which previously held the hold-out split, to the
# features from `df_test`; the same non-feature columns are removed.
X_test = df_test.drop(columns=['id', 'date'])


In [29]:
# Same stack as used for evaluation, now fitted on all of X and y.
# The tree ensembles are randomized, so each one gets a fixed `random_state`
# (the same seed used for the train/test split) to make the fit repeatable.
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('lr', LinearRegression()),
    ('br', BayesianRidge()),
    ('r', Ridge()),
    ('et', ExtraTreesRegressor(random_state=42)),
]

# The final estimator combines the base learners' cross-validated (cv=5)
# predictions; it is seeded for the same reason as the base learners.
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42),
    cv=5,
)
stack.fit(X, y)

StackingRegressor(cv=5,
                  estimators=[('rf',
                               RandomForestRegressor(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_e

In [100]:
# Predict with the fitted stack on the current `X_test`.
predictions = stack.predict(X_test)

In [71]:
# Wrap the predictions in a DataFrame (rebinding the same name) so they can be saved with pandas.
predictions = pd.DataFrame(predictions)

In [73]:
# Save column 0 of the predictions frame to CSV, with a header row and without the index.
predictions[0].to_csv('predictions.csv', index = False, header = True)

In [70]:
# Display the current predictions object.
predictions

array([514833.4927415 , 514833.4927415 , 385726.97025028, ...,
       336976.1115544 , 390997.84994381, 349503.95076737])

In [93]:
# Flatten the predictions into a plain Python list of values.
# `list()` on a DataFrame yields its column labels, not its rows, so go through
# column 0 explicitly; this works whether `predictions` is currently the raw
# array from `stack.predict` or the DataFrame built from it.
predictions = pd.DataFrame(predictions)[0].tolist()

In [102]:
# Write the predictions as a single-column CSV whose header cell is empty.
# `newline=''` is what the csv module requires of files it writes to; without
# it, platforms that translate line endings end up with an extra blank line
# after every row.
with open('housing_preds_jctc.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([None])  # None is written as an empty field
    writer.writerows([val] for val in predictions)

In [101]:
# Read the saved file back to check what was written.
pd.read_csv('predictions.csv')

Unnamed: 0.1,Unnamed: 0
0,514833.492742
1,514833.492742
2,385726.970250
3,352091.654943
4,516343.716791
...,...
4318,372160.558301
4319,403253.591513
4320,336976.111554
4321,390997.849944
