In [68]:
import numpy as np
import pandas as pd
import math
import re
import seaborn as sns
import matplotlib as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [69]:
df = pd.read_csv('data/train_cleanV2.csv')
df_X = df.loc[:,(df.columns != 'Id') & (df.columns != 'SalePrice')]

In [70]:
# LASSO REGRESSION - ALPHA=15.79 AND 38 FEATURES THAT DO NOT GO TO 0

features_lasso = ['LotArea', 'Neighborhood', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'ExterQual', 'BsmtQual', 'BsmtFinType1', 'TotalBsmtSF', 'GrLivArea',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'WoodDeckSF', '3SsnPorch', 'ScreenPorch', 'TotalBath',
       'LotConfig_CulDSac', 'LotConfig_FR2', 'Exterior1st_Plywood',
       'SaleCondition_Partial', 'BsmtExposure_Gd', 'BsmtExposure_NoBsmt']
lasso = Lasso()
lasso.set_params(alpha=15.790316,normalize=True)  

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_lasso], df['SalePrice'], test_size=0.3, random_state=42)
lasso.fit(X_train,Y_train)
print("train R^2: %f" %lasso.score(X_train,Y_train))
print("test R^2: %f"%lasso.score(X_test,Y_test))

lasso.fit(df_X.loc[:, features_lasso], df['SalePrice'])
lasso_predicted = lasso.predict(df_X.loc[:, features_lasso])

train R^2: 0.839223
test R^2: 0.832512


In [71]:
gbm = GradientBoostingRegressor()
gbm.set_params(learning_rate=0.01, n_estimators=500, verbose=0, 
               subsample=0.7, warm_start=False, loss='ls', max_depth=2, 
               max_features = 2, min_impurity_decrease=0.01)
features_gbm = ['Neighborhood', 'OverallQual', 'YearRemodAdd', 'TotalBsmtSF',
       'GrLivArea', 'Fireplaces', 'GarageCars', 'TotalBath']

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_gbm], df['SalePrice'], test_size=0.3, random_state=42)
gbm.fit(X_train,Y_train)
print("train R^2: %f" %gbm.score(X_train,Y_train))
print("test R^2: %f"%gbm.score(X_test,Y_test))

gbm.fit(df_X.loc[:, features_gbm], df['SalePrice'])
gbm_predicted = gbm.predict(df_X.loc[:, features_gbm])

train R^2: 0.897254
test R^2: 0.856385


In [72]:
forest = RandomForestRegressor()
forest.set_params(random_state=42, n_estimators=100, max_features=4)
features_forest = ['Neighborhood', 'OverallQual', 'TotalBsmtSF', 'GrLivArea', 'Bedroom/Bathroom']

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_forest], df['SalePrice'], test_size=0.3, random_state=42)
forest.fit(X_train,Y_train)
print("train R^2: %f" %forest.score(X_train,Y_train))
print("test R^2: %f"%forest.score(X_test,Y_test))

forest.fit(df_X.loc[:, features_forest], df['SalePrice'])
forest_predicted = forest.predict(df_X.loc[:, features_forest])

train R^2: 0.969072
test R^2: 0.838222


In [73]:
df_stack = pd.DataFrame({'Lasso_Predicted': lasso_predicted, 'GBM_Predicted': gbm_predicted, 'Forest_Predicted': forest_predicted})
df_stack

Unnamed: 0,Lasso_Predicted,GBM_Predicted,Forest_Predicted
0,222399.707129,201105.120905,205014.00000
1,215071.504844,180553.047183,183415.50000
2,225248.311590,210753.120268,214722.78000
3,204392.745208,196242.309953,155947.40000
4,277106.700716,299796.572690,252787.00000
...,...,...,...
1447,168897.552493,182849.599204,175652.00000
1448,213385.631232,216442.922334,210889.00000
1449,260866.475145,227028.185772,257778.59000
1450,136444.062257,131547.929071,146651.26875


In [74]:
lm = LinearRegression()

X_train, X_test, Y_train, Y_test = train_test_split(df_stack, df['SalePrice'], test_size=0.3, random_state=42)
lm.fit(X_train,Y_train)
print("train R^2: %f" %lm.score(X_train,Y_train))
print("test R^2: %f"%lm.score(X_test,Y_test))

train R^2: 0.984528
test R^2: 0.981803


In [81]:
df_stack2 = pd.DataFrame({'Lasso_Predicted': lasso_predicted, 'GBM_Predicted': gbm_predicted})
df_stack2
lm2 = LinearRegression()
X_train, X_test, Y_train, Y_test = train_test_split(df_stack2, df['SalePrice'], test_size=0.3, random_state=42)
lm2.fit(X_train,Y_train)
print("train R^2: %f" %lm2.score(X_train,Y_train))
print("test R^2: %f"%lm2.score(X_test,Y_test))

train R^2: 0.900370
test R^2: 0.906616


In [75]:
features_used = np.unique(np.array(features_lasso + features_gbm + features_forest))
remove_dummify = lambda x: x.split('_', 1)[0]
features_used = np.unique(np.array([remove_dummify(x) for x in features_used]))
features_used

array(['3SsnPorch', 'Bedroom/Bathroom', 'BedroomAbvGr', 'BsmtExposure',
       'BsmtFinType1', 'BsmtQual', 'ExterQual', 'Exterior1st',
       'Fireplaces', 'Functional', 'GarageCars', 'GarageFinish',
       'GarageYrBlt', 'GrLivArea', 'KitchenAbvGr', 'KitchenQual',
       'LotArea', 'LotConfig', 'MasVnrArea', 'Neighborhood',
       'OverallCond', 'OverallQual', 'SaleCondition', 'ScreenPorch',
       'TotRmsAbvGrd', 'TotalBath', 'TotalBsmtSF', 'WoodDeckSF',
       'YearRemodAdd'], dtype='<U16')

In [76]:
test_final = pd.read_csv('data/test_clean.csv', index_col=0)
test_stack = pd.DataFrame({'Lasso_Predicted': lasso.predict(test_final[features_lasso]), 
                           'GBM_Predicted': gbm.predict(test_final[features_gbm]), 
                           'Forest_Predicted': forest.predict(test_final[features_forest])})
test_predictions = pd.DataFrame({'SalePrice': lm.predict(test_stack)}).set_index(test_final.index)
test_final.index

Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name='Id', length=1459)

In [77]:
test_predictions.to_csv('data/predictions.csv')

In [78]:
predictions_lasso = pd.DataFrame({'SalePrice': test_stack['Lasso_Predicted']}).set_index(test_final.index)
predictions_gbm = pd.DataFrame({'SalePrice': test_stack['GBM_Predicted']}).set_index(test_final.index)
predictions_forest = pd.DataFrame({'SalePrice': test_stack['Forest_Predicted']}).set_index(test_final.index)
predictions_lasso.to_csv('data/predictions_lasso.csv')
predictions_gbm.to_csv('data/predictions_gbm.csv')
predictions_forest.to_csv('data/predictions_forest.csv')

In [83]:
test_stack2 = pd.DataFrame({'Lasso_Predicted': lasso.predict(test_final[features_lasso]), 
                           'GBM_Predicted': gbm.predict(test_final[features_gbm])})
test_predictions2 = pd.DataFrame({'SalePrice': lm2.predict(test_stack2)}).set_index(test_final.index)
test_predictions2.to_csv('data/predictions2.csv')