In [68]:
%store -r models
%store -r models_results
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [40]:
models

Unnamed: 0,linear_regression,ridge_regression,lasso_regression,elastic_net,decision_tree_regressor,random_forest_regressor,xgb_regressor
0,LinearRegression(),"Ridge(alpha=3.076923076923077, random_state=0)","Lasso(alpha=52, random_state=0)","ElasticNet(alpha=52, l1_ratio=1.0, random_stat...","DecisionTreeRegressor(max_depth=8, max_feature...","RandomForestRegressor(max_depth=31, max_featur...","XGBRegressor(base_score=None, booster=None, ca..."


In [4]:
models_results

linear_regression         -14231.924375
ridge_regression          -14039.208670
lasso_regression          -13943.969522
elastic_net               -13943.969522
decision_tree_regressor   -16727.753705
random_forest_regressor   -12596.608348
xgb_regressor             -12880.093265
dtype: float64

In [17]:
train_data = pd.read_csv('../../Data/train_data_processed.csv', index_col = 'Id')
X = train_data.copy()
y = X['SalePrice']
X = X.drop(['SalePrice'], axis = 1)

test_data = pd.read_csv('../../Data/test_data_processed.csv', index_col = 'Id')

In [19]:
columns_list = list(X.columns)

In [20]:
for i in range(0, len(columns_list)):
    print(i, ' - ', columns_list[i])

0  -  OverallQual
1  -  GrLivArea
2  -  YearBuilt
3  -  GarageArea
4  -  TotalBsmtSF
5  -  FullBath
6  -  YearRemodAdd
7  -  2ndFlrSF
8  -  LotArea
9  -  Fireplaces
10  -  OpenPorchSF
11  -  Neighborhood_Blmngtn
12  -  Neighborhood_Blueste
13  -  Neighborhood_BrDale
14  -  Neighborhood_BrkSide
15  -  Neighborhood_ClearCr
16  -  Neighborhood_CollgCr
17  -  Neighborhood_Crawfor
18  -  Neighborhood_Edwards
19  -  Neighborhood_Gilbert
20  -  Neighborhood_IDOTRR
21  -  Neighborhood_MeadowV
22  -  Neighborhood_Mitchel
23  -  Neighborhood_NAmes
24  -  Neighborhood_NPkVill
25  -  Neighborhood_NWAmes
26  -  Neighborhood_NoRidge
27  -  Neighborhood_NridgHt
28  -  Neighborhood_OldTown
29  -  Neighborhood_SWISU
30  -  Neighborhood_Sawyer
31  -  Neighborhood_SawyerW
32  -  Neighborhood_Somerst
33  -  Neighborhood_StoneBr
34  -  Neighborhood_Timber
35  -  Neighborhood_Veenker
36  -  Foundation_BrkTil
37  -  Foundation_CBlock
38  -  Foundation_PConc
39  -  Foundation_Stone
40  -  Foundation_Wood
41  

In [51]:
scores_diff_num_of_features = pd.DataFrame({'model': [], 'num_of_features': [], 'score': []})


In [52]:
lasso_regression = models['lasso_regression'][0]
random_forest = models['random_forest_regressor'][0]
xgb_regressor = models['xgb_regressor'][0]

try_models = [lasso_regression, random_forest, xgb_regressor]

In [59]:
for model in try_models:
    for number_of_columns in [5, 6, 7, 8, 9, 10, 11, 36, 41, 55, 65]:
        scores = cross_val_score(model, 
                                 X[columns_list[:number_of_columns]], 
                                 y, 
                                 cv = 5, 
                                 scoring = 'neg_mean_absolute_error')
        
        scores_diff_num_of_features.loc[len(scores_diff_num_of_features)] = model, number_of_columns - 1, scores.mean()
    
    

In [62]:
scores_diff_num_of_features

Unnamed: 0,model,num_of_features,score
0,"Lasso(alpha=52, random_state=0)",4.0,-15980.745945
1,"Lasso(alpha=52, random_state=0)",5.0,-16035.291394
2,"Lasso(alpha=52, random_state=0)",6.0,-15779.585323
3,"Lasso(alpha=52, random_state=0)",7.0,-15763.365303
4,"Lasso(alpha=52, random_state=0)",8.0,-15007.862816
5,"Lasso(alpha=52, random_state=0)",9.0,-14888.342752
6,"Lasso(alpha=52, random_state=0)",10.0,-14845.326052
7,"Lasso(alpha=52, random_state=0)",35.0,-13999.892551
8,"Lasso(alpha=52, random_state=0)",40.0,-14006.308512
9,"Lasso(alpha=52, random_state=0)",54.0,-13971.072366


In [61]:
scores_diff_num_of_features.loc[scores_diff_num_of_features['score'].idxmax()]

model              RandomForestRegressor(max_depth=31, max_featur...
num_of_features                                                 54.0
score                                                   -12528.80902
Name: 20, dtype: object

We found the optimal number of features to train the model. But it seems that not all the features used were really useful. If we look at the table above, we can see that neg_mean_absolute_error became worse after adding `2ndFlrSF` and `OpenPorchSF`. Let's exclude them and see the result.

In [63]:
X_shorted = X.copy()

# Exclude 'Exterior1st' features
X_shorted = X_shorted[columns_list[:55]]

# Exclude '2ndFlrSF' and 'OpenPorchSF'
X_shorted = X_shorted.drop(['2ndFlrSF', 'OpenPorchSF'], axis = 1)

In [65]:
scores_shorted = cross_val_score(random_forest,
                                X_shorted,
                                y,
                                cv = 5,
                                scoring = 'neg_mean_absolute_error')
scores_shorted.mean()

-12523.642546665924

We got practically no increase in the accuracy of the model. Let's try to use GridSearch again. Perhaps the model with a new number of features needs other parameters.

In [66]:
random_forest

In [75]:
random_forest_sample = RandomForestRegressor()
rf_hyper_params = {'max_depth': [31], # found on previous runs of GridSearch
                   'max_features': [15], # found on previous runs of GridSearch
                   'n_estimators': [1200],
                   'random_state': [0],
                   'n_jobs': [-1]
                  }
random_forest = GridSearchCV(random_forest_sample, rf_hyper_params, cv = 5, scoring = 'neg_mean_absolute_error')
random_forest.fit(X_shorted, y)

print('Best params: ', random_forest.best_params_)
print('Best score: ', random_forest.best_score_)

Best params:  {'max_depth': 31, 'max_features': 15, 'n_estimators': 1200, 'n_jobs': -1, 'random_state': 0}
Best score:  -12497.254251314629


In [80]:
test_data_shorted = test_data[X_shorted.columns]
rf_shorted_prediction = random_forest.predict(test_data_shorted)

rf_shorted_output = pd.DataFrame({'Id': test_data.index, 'SalePrice': rf_shorted_prediction})
rf_shorted_output.to_csv('../../Kaggle_Submissions/outputs/rf_shorted_output.csv', index = False)

In [77]:
data_stats = pd.read_csv('../../Data/data_processed_stats.csv')


X_stats = data_stats.copy()
y_stats = X_stats['SalePrice']
X_stats = X_stats.drop(['SalePrice'], axis = 1)
X_stats = X_stats[X_shorted.columns]

In [85]:
scores_stats = cross_val_score(random_forest,
                               X_stats,
                               y_stats,
                               cv = 5,
                               scoring = 'mean_absolute_error'
                              )



In [84]:
np.exp(scores_stats.mean())

0.9234082226837613

In [88]:
random_forest.fit(X_stats, y_stats)

test_data_stats = test_data_shorted.copy()
test_data_stats['GrLivArea'] = np.log(test_data_shorted['GrLivArea'])
test_data_stats.loc[test_data_stats['TotalBsmtSF'] > 0, 'TotalBsmtSF'] = np.log(test_data_stats['TotalBsmtSF'])

rf_stats_prediction = np.exp(random_forest.predict(test_data_stats))

rf_stats_output = pd.DataFrame({'Id': test_data_stats.index, 'SalePrice': rf_stats_prediction})
rf_stats_output.to_csv('../../Kaggle_Submissions/outputs/rf_stats_output.csv', index = False)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [87]:
X_stats.head(1)

Unnamed: 0,OverallQual,GrLivArea,YearBuilt,GarageArea,TotalBsmtSF,FullBath,YearRemodAdd,LotArea,Fireplaces,Neighborhood_Blmngtn,...,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng
0,7,7.444249,2003,548,6.75227,2,2003,8450,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
