In [1]:
import numpy as np
import pandas as pd
import math
import re
import seaborn as sns
import matplotlib as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/train_cleanV2.csv')
df_X = df.loc[:,(df.columns != 'Id') & (df.columns != 'SalePrice')]

In [3]:
# LASSO REGRESSION - ALPHA=15.79 AND 38 FEATURES THAT DO NOT GO TO 0

features_lasso = ['LotFrontage', 'LotArea', 'Neighborhood', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
       'TotalBsmtSF', 'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'WoodDeckSF', '3SsnPorch', 'ScreenPorch', 'TotalBath',
       'LotConfig_CulDSac', 'LotConfig_FR2', 'Exterior1st_Other',
       'Exterior1st_Plywood', 'Exterior2nd_HdBoard', 'MasVnrType_BrkFace',
       'SaleCondition_Family', 'SaleCondition_Partial', 'BsmtExposure_Gd',
       'BsmtExposure_NoBsmt', 'BsmtExposure_NoExposure']
lasso = Lasso()
lasso.set_params(alpha=15.790316,normalize=True)  

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_lasso], df['SalePrice'], test_size=0.3, random_state=42)
lasso.fit(X_train,Y_train)
print("train R^2: %f" %lasso.score(X_train,Y_train))
print("test R^2: %f"%lasso.score(X_test,Y_test))

lasso.fit(df_X.loc[:, features_lasso], df['SalePrice'])
lasso_predicted = lasso.predict(df_X.loc[:, features_lasso])

train R^2: 0.842491
test R^: 0.832876


In [7]:
gbm = GradientBoostingRegressor()
gbm.set_params(learning_rate=0.01, n_estimators=500, verbose=0, 
               subsample=0.7, warm_start=False, loss='ls', max_depth=2, 
               max_features = 2, min_impurity_decrease=0.01)
features_gbm = ['Neighborhood', 'OverallQual', 'YearRemodAdd', 'TotalBsmtSF',
       'GrLivArea', 'Fireplaces', 'GarageCars', 'TotalBath']

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_gbm], df['SalePrice'], test_size=0.3, random_state=42)
gbm.fit(X_train,Y_train)
print("train R^2: %f" %gbm.score(X_train,Y_train))
print("test R^2: %f"%gbm.score(X_test,Y_test))

gbm.fit(df_X.loc[:, features_gbm], df['SalePrice'])
gbm_predicted = gbm.predict(df_X.loc[:, features_gbm])

train R^2: 0.895563
test R^: 0.850068


In [8]:
forest = RandomForestRegressor()
forest.set_params(random_state=42, n_estimators=100, max_features=4)
features_forest = ['Neighborhood', 'OverallQual', 'TotalBsmtSF', 'GrLivArea', 'Bedroom/Bathroom']

X_train, X_test, Y_train, Y_test = train_test_split(df_X.loc[:, features_forest], df['SalePrice'], test_size=0.3, random_state=42)
forest.fit(X_train,Y_train)
print("train R^2: %f" %forest.score(X_train,Y_train))
print("test R^2: %f"%forest.score(X_test,Y_test))

forest.fit(df_X.loc[:, features_forest], df['SalePrice'])
forest_predicted = forest.predict(df_X.loc[:, features_forest])

train R^2: 0.969072
test R^: 0.838222


In [10]:
df_stack = pd.DataFrame({'Lasso_Predicted': lasso_predicted, 'GBM_Predicted': gbm_predicted, 'Forest_Predicted': forest_predicted})
df_stack

Unnamed: 0,Lasso_Predicted,GBM_Predicted,Forest_Predicted
0,218594.316714,203820.928079,205014.00000
1,217906.860059,182367.298234,183415.50000
2,223529.670552,213428.484444,214722.78000
3,201667.406196,196316.097183,155947.40000
4,277865.278885,298230.182831,252787.00000
...,...,...,...
1447,169074.035671,182679.462604,175652.00000
1448,213991.083645,213248.059215,210889.00000
1449,260278.182984,227970.900267,257778.59000
1450,138225.274927,130791.246693,146651.26875


In [11]:
lm = LinearRegression()

X_train, X_test, Y_train, Y_test = train_test_split(df_stack, df['SalePrice'], test_size=0.3, random_state=42)
lm.fit(X_train,Y_train)
print("train R^2: %f" %lm.score(X_train,Y_train))
print("test R^2: %f"%lm.score(X_test,Y_test))

train R^2: 0.984761
test R^: 0.981950
