# House Prices: Advanced Regression Techniques - Part V

In this notebook we work only with `GradientBoostingRegressor`.
The focus is on how the categorical features are encoded: one-hot encoding (the baseline) versus mean target encoding.

In [1]:
from load_modules_files_functions_clean import *

No. features: 79
No. numerical features: 33
No. ordinal features: 21
No. (possible) categorical features: 25 

num_cols: ['LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'LotFrontage'] 

ord_cols: ['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'Utilities', 'Functional', 'GarageFinish', 'PavedDrive', 'Alley', 'Fence', 'FireplaceQu', 'PoolQC'] 

cat_cols: ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condit

In [2]:
# Preview the first rows of the feature frame X loaded by the import cell.
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,3,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,3,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,3,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,3,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,3,FR2,...,0,0,,,,0,12,2008,WD,Normal


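Redefine the helper functions here: some of them do not work properly when imported from `load_modules_files_functions_clean` (they read notebook-level variables such as `X_train`), and others have to be changed for this notebook.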

In [3]:
def get_train_val_sets(X, y, cols, test_size = 0.20, random_state = 1):
    """Split X and y into train/validation parts and keep only `cols` of X.

    Parameters
    ----------
    X : frame holding (at least) the columns listed in `cols`.
    y : target values aligned with the rows of X.
    cols : list of column labels to keep, in the order they should appear.
    test_size : forwarded to train_test_split (default 0.20).
    random_state : seed forwarded to train_test_split. Defaults to 1, the
        value that used to be hard-coded, so existing calls split identically.

    Returns
    -------
    (X_train[cols], X_val[cols], y_train, y_val)
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )
    return X_train[cols], X_val[cols], y_train, y_val

def print_cv_val_score(my_s, print_best_est = True, adj_refit = False):
    """Refit the best estimator of a fitted search and print its scores.

    Defined in the notebook on purpose: this function does not work properly
    when it is imported from load_modules_files_functions_clean, because
    X[cols] is not updated correctly. It reads the notebook-level X_train,
    y_train, X_val and y_val.

    my_s : fitted search exposing best_estimator_, best_score_, best_index_
        and cv_results_.
    print_best_est : when true, the refitted best estimator is printed last.
    adj_refit : when true, print mean minus std of the best candidate's test
        score instead of best_score_.
    """
    model = my_s.best_estimator_
    model.fit(X_train, y_train)

    # Scores are negated before they are printed.
    if not adj_refit:
        print('Best CV score:', round(-my_s.best_score_, 5))
    else:
        idx = my_s.best_index_
        adjusted = my_s.cv_results_['mean_test_score'][idx] - my_s.cv_results_['std_test_score'][idx]
        print('Best adjusted CV score:', round(-adjusted, 5))

    # The validation score is only reported when the validation target has rows.
    has_val_rows = y_val.shape[0] > 0
    if has_val_rows:
        print('Validation score:', round(rmsle(y_val, model.predict(X_val)), 5))

    if print_best_est:
        print(model)
        
def get_sub_csv(my_s, cols, name_csv):
    """Refit the best estimator on X[cols], y and write test predictions to CSV.

    Like print_cv_val_score, this is defined in the notebook because there is
    a similar problem with the imported version. It reads the notebook-level
    X, y and test.

    my_s : fitted search exposing best_estimator_.
    cols : column labels used for both fitting (X) and predicting (test).
    name_csv : path of the CSV to write; it gets the columns Id and SalePrice.
    """
    print(name_csv)
    model = my_s.best_estimator_
    model.fit(X[cols], y)
    predictions = model.predict(test[cols])
    submission = pd.DataFrame({'Id':test['Id'], 'SalePrice':predictions})
    submission.to_csv(name_csv, index=False)

def adj_refit_fn(cv_results):
    """Return the index of the candidate with the largest (mean - std) test score.

    cv_results must provide the array-like entries 'mean_test_score' and
    'std_test_score'.
    """
    penalised_scores = cv_results['mean_test_score'] - cv_results['std_test_score']
    return np.argmax(penalised_scores)

def load_run_save_GSCV(key, param_grid, save_s = True, adj_refit = False, cv = 5, save_results = False):
    """Load the cached grid search for `key`, or run (and optionally cache) one.

    Reads the notebook-level names ttr, rmsle_scorer, rmsle, X_train, y_train,
    X_val and y_val and, when save_results is true, the global `results`.

    Parameters
    ----------
    key : str
        Experiment name. The search is cached in '<key>.joblib' and the key is
        used as the row label added to `results`.
    param_grid : dict
        Grid handed to GridSearchCV.
    save_s : bool
        Dump a freshly fitted search to '<key>.joblib'.
    adj_refit : bool
        Select (via adj_refit_fn) and report the candidate that maximises
        mean minus std of the test score instead of the plain mean.
    cv : int
        Handed to GridSearchCV.
    save_results : bool
        Add a row holding the reported CV score and the validation score to
        the global `results`. With adj_refit the stored 'Best CV score' is the
        adjusted score that is printed (previously NaN was stored).

    Returns
    -------
    The fitted (or loaded) search object.
    """
    # `global` applies to the whole function wherever it is written, so it is
    # declared once at the top instead of inside an `if`.
    global results

    val_score = np.nan
    filename = key + '.joblib'
    if os.path.isfile(filename):
        # A cached search is used as-is: param_grid, cv and adj_refit are not
        # re-applied to it. joblib.load unpickles the file, so only load
        # caches that were written by this project.
        my_s = joblib.load(filename)
    else:
        my_refit = adj_refit_fn if adj_refit else True
        my_s = GridSearchCV(ttr, param_grid = param_grid, cv = cv, scoring = rmsle_scorer, n_jobs = -1, verbose = 10, error_score = 'raise', refit = my_refit)
        my_s = my_s.fit(X_train, y_train)
        if save_s:
            joblib.dump(my_s, filename)

    # Refit on the current X_train / y_train (also covers a search loaded from disk).
    best_est = my_s.best_estimator_
    best_est.fit(X_train, y_train)

    if y_val.shape[0] > 0:
        y_pred = best_est.predict(X_val)
        val_score = rmsle(y_val, y_pred)

    if adj_refit:
        # Computed from cv_results_ rather than read from best_score_.
        best_idx = my_s.best_index_
        cv_score = my_s.cv_results_['mean_test_score'][best_idx] - my_s.cv_results_['std_test_score'][best_idx]
        print('Best adjusted CV score:', round(-cv_score, 5))
    else:
        cv_score = my_s.best_score_
        print('Best CV score:', round(-cv_score, 5))
    if y_val.shape[0] > 0:
        print('Validation score:', round(val_score, 5))

    if save_results:
        results_model = pd.Series({'Best CV score': -cv_score, 'Val score': val_score}, name = key)
        # DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame adds the same row labelled `key`.
        # NOTE(review): assumes `results` is a DataFrame with one row per experiment - confirm.
        results = pd.concat([results, results_model.to_frame().T])
    return my_s

def min_imp_filter(cols, feat_imps, min_imp):
    """Return the entries of `cols`, in their original order, that are index
    labels of `feat_imps` with a value strictly greater than `min_imp`.
    """
    above_threshold = feat_imps > min_imp
    kept_feats = list(feat_imps[above_threshold].index)
    return [col for col in cols if col in kept_feats]

Recreate the best performing model so far (with some slight FE).

In [4]:
# Binary flag: 1 where LowQualFinSF > 0, otherwise 0 (same for train and test).
X['LowQualFinSFBinary'] = 0
X.loc[X['LowQualFinSF'] > 0, 'LowQualFinSFBinary'] = 1

test['LowQualFinSFBinary'] = 0
test.loc[test['LowQualFinSF'] > 0, 'LowQualFinSFBinary'] = 1

# Use the flag in place of the raw column in num_cols (same position).
# Guarded so that re-running this cell does not raise ValueError once the
# name has already been swapped.
if 'LowQualFinSF' in num_cols:
    num_cols[num_cols.index('LowQualFinSF')] = 'LowQualFinSFBinary'

In [5]:
# Split first (this fixes the column order num_cols + ord_cols + cat_cols) ...
X_train, X_val, y_train, y_val = get_train_val_sets(X, y, num_cols + ord_cols + cat_cols)
# ... then put the validation rows back behind the training rows, so the
# search trains on every row and the validation objects are left empty.
X_train = pd.concat([X_train, X_val], axis = 0)
y_train = pd.concat([y_train, y_val])
X_val = pd.DataFrame()
y_val = pd.DataFrame()

In [None]:
# Baseline: impute each column group, one-hot encode cat_cols, fit a default GBR.
# X_train is laid out as num_cols + ord_cols + cat_cols, so every group is
# addressed by position. The 'passthrough' imputers are placeholders that the
# parameter grid below replaces with SimpleImputer instances.
imputer = ColumnTransformer([
    ('imputer_num_cols', 'passthrough', slice(0, len(num_cols))),
    ('imputer_ord_cols', 'passthrough', slice(len(num_cols), len(num_cols) + len(ord_cols))),
    ('imputer_cat_cols', 'passthrough', slice(len(num_cols) + len(ord_cols), len(num_cols) + len(ord_cols) + len(cat_cols))),
])

preprocessor = ColumnTransformer([
    ('scaler_num_cols', 'passthrough', slice(0, len(num_cols))),
    ('scaler_ord_cols', 'passthrough', slice(len(num_cols), len(num_cols) + len(ord_cols))),
    (
        'category_encoder_cat_cols',
        OneHotEncoder(handle_unknown = 'ignore'),
        slice(len(num_cols) + len(ord_cols), len(num_cols) + len(ord_cols) + len(cat_cols)),
    ),
])

# The 'model' step is filled in by the grid as well.
steps = [
    ('imputer', imputer),
    ('preprocessor', preprocessor),
    ('model', None),
]
pipeline = Pipeline(steps)

# The regressor is fitted on log1p(y); predictions are mapped back with expm1.
ttr = TransformedTargetRegressor(regressor = pipeline, func = np.log1p, inverse_func = np.expm1)

gbr = GradientBoostingRegressor(random_state = 1)

param_grid = dict(
    regressor__imputer__imputer_num_cols = [SimpleImputer(fill_value = -999)],
    regressor__imputer__imputer_num_cols__strategy = ['mean', 'median', 'most_frequent', 'constant'],
    regressor__imputer__imputer_ord_cols = [SimpleImputer(fill_value = -999)],
    regressor__imputer__imputer_ord_cols__strategy = ['mean', 'median', 'most_frequent', 'constant'],
    regressor__imputer__imputer_cat_cols = [SimpleImputer(fill_value = 'MISS')],
    regressor__imputer__imputer_cat_cols__strategy = ['most_frequent', 'constant'],
    regressor__model = [gbr],
)

key = 'gbr_default_cat_FE'
my_s = load_run_save_GSCV(key, param_grid, save_s = False, adj_refit = False, cv = 8)

Perform the mean target encoding.

In [6]:
# Look at the categorical columns of X_train before they are target encoded.
X_train.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,Exterior2nd,MasVnrType,Foundation,Heating,CentralAir,Electrical,GarageType,SaleType,SaleCondition,MiscFeature
921,8777,1900,2003,0.0,1084,0,188,1272,1272,928,...,MetalSd,,CBlock,GasA,Y,SBrkr,,WD,Normal,
520,10800,1900,2000,0.0,0,0,0,0,694,600,...,MetalSd,,BrkTil,GasA,N,FuseA,,WD,Normal,
401,8767,2005,2005,0.0,24,0,1286,1310,1310,0,...,VinylSd,,PConc,GasA,Y,SBrkr,Attchd,New,Partial,
280,11287,1989,1989,340.0,421,0,386,807,1175,807,...,Plywood,BrkFace,CBlock,GasA,Y,SBrkr,Attchd,WD,Normal,
1401,7415,2004,2004,0.0,759,0,80,839,864,729,...,VinylSd,,PConc,GasA,Y,SBrkr,Attchd,WD,Normal,


In [7]:
# Fit a TargetEncoder on the cat_cols of X_train with y_train as the target and
# overwrite those columns with the encoded values. The same fitted encoder is
# then applied to X and to test, whose cat_cols are overwritten as well.
# NOTE(review): the columns are replaced in place, so running this cell a second
# time would feed already-encoded values back into the encoder - restart and
# run all instead of re-running it.
# NOTE(review): the encoder is fitted once on all of X_train before the grid
# search, so the rows held out in each CV fold have already contributed to
# their own encodings; the CV score may be optimistic. Fitting the encoder
# inside the pipeline would avoid this - confirm.
te = TargetEncoder(cols = cat_cols)
X_train[cat_cols] = te.fit_transform(X_train[cat_cols], y_train)
X[cat_cols] = te.transform(X[cat_cols], y)
test[cat_cols] = te.transform(test[cat_cols])

In [8]:
# Check the result: the cat_cols of X_train should now hold encoded numbers.
X_train.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,Exterior2nd,MasVnrType,Foundation,Heating,CentralAir,Electrical,GarageType,SaleType,SaleCondition,MiscFeature
921,8777,1900,2003,0.0,1084,0,188,1272,1272,928,...,149803.172897,156221.891204,149805.714511,182021.195378,186186.70989,186825.113193,103317.283951,173401.836622,175202.219533,182046.410384
520,10800,1900,2000,0.0,0,0,0,0,694,600,...,149803.172897,156221.891204,132291.075342,182021.195378,105264.073684,122196.893617,103317.283951,173401.836622,175202.219533,182046.410384
401,8767,2005,2005,0.0,24,0,1286,1310,1310,0,...,214432.460317,156221.891204,225230.44204,182021.195378,186186.70989,186825.113193,202892.656322,274945.418033,272291.752,182046.410384
280,11287,1989,1989,340.0,421,0,386,807,1175,807,...,168112.387324,204691.87191,149805.714511,182021.195378,186186.70989,186825.113193,202892.656322,173401.836622,175202.219533,182046.410384
1401,7415,2004,2004,0.0,759,0,80,839,864,729,...,214432.460317,156221.891204,225230.44204,182021.195378,186186.70989,186825.113193,202892.656322,173401.836622,175202.219533,182046.410384


In [11]:
# Count the PoolQC values present in X_train.
X_train['PoolQC'].value_counts()

2.0    3
3.0    2
0.0    2
Name: PoolQC, dtype: int64

In [9]:
# Check that the cat_cols of the test frame were encoded as well.
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LowQualFinSFBinary
0,1461,185224.811567,131558.3901,80.0,11622,181130.538514,,164754.818378,180183.746758,3.0,...,0,,2.0,182046.410384,0,6,2010,173401.836622,175202.219533,0
1,1462,185224.811567,191004.994787,81.0,14267,181130.538514,,206101.665289,180183.746758,3.0,...,0,,,173485.45588,12500,6,2010,173401.836622,175202.219533,0
2,1463,239948.501672,191004.994787,74.0,13830,181130.538514,,206101.665289,180183.746758,3.0,...,0,,2.0,182046.410384,0,3,2010,173401.836622,175202.219533,0
3,1464,239948.501672,191004.994787,78.0,9978,181130.538514,,206101.665289,180183.746758,3.0,...,0,,,182046.410384,0,6,2010,173401.836622,175202.219533,0
4,1465,200779.08046,191004.994787,43.0,5005,181130.538514,,206101.665289,231533.94,3.0,...,0,,,182046.410384,0,1,2010,173401.836622,175202.219533,0


In [12]:
# Count the PoolQC values present in the test frame.
test['PoolQC'].value_counts()

3.0    2
2.0    1
Name: PoolQC, dtype: int64

Train a model with mean target encoding.

In [None]:
# Same pipeline as the one-hot baseline, except that cat_cols (which now hold
# target-encoded numbers) are passed through instead of being one-hot encoded
# and are imputed with the numeric strategies.
# X_train is laid out as num_cols + ord_cols + cat_cols, so every group is
# addressed by position. The 'passthrough' imputers are placeholders that the
# parameter grid below replaces with SimpleImputer instances.
imputer = ColumnTransformer([
    ('imputer_num_cols', 'passthrough', slice(0, len(num_cols))),
    ('imputer_ord_cols', 'passthrough', slice(len(num_cols), len(num_cols + ord_cols))),
    ('imputer_cat_cols', 'passthrough', slice(len(num_cols + ord_cols), len(num_cols + ord_cols + cat_cols)))
])

preprocessor = ColumnTransformer([
    ('scaler_num_cols', 'passthrough', slice(0, len(num_cols))),
    ('scaler_ord_cols', 'passthrough', slice(len(num_cols), len(num_cols + ord_cols))),
    ('category_encoder_cat_cols', 'passthrough', slice(len(num_cols + ord_cols), len(num_cols + ord_cols + cat_cols)))
])

# The 'model' step is filled in by the grid as well.
steps = [
    ('imputer', imputer),
    ('preprocessor', preprocessor),
    ('model', None)
]

pipeline = Pipeline(steps)

# The regressor is fitted on log1p(y); predictions are mapped back with expm1.
ttr = TransformedTargetRegressor(regressor = pipeline, func = np.log1p, inverse_func = np.expm1)

gbr = GradientBoostingRegressor(random_state = 1)

param_grid = {
    'regressor__imputer__imputer_num_cols': [SimpleImputer(fill_value = -999)],
    'regressor__imputer__imputer_num_cols__strategy': ['mean', 'median', 'most_frequent', 'constant'],
    'regressor__imputer__imputer_ord_cols': [SimpleImputer(fill_value = -999)],
    'regressor__imputer__imputer_ord_cols__strategy': ['mean', 'median', 'most_frequent', 'constant'],
    'regressor__imputer__imputer_cat_cols': [SimpleImputer(fill_value = -999)],
    'regressor__imputer__imputer_cat_cols__strategy': ['mean', 'median', 'most_frequent', 'constant'],
    'regressor__model': [gbr],
}

# Use a key of its own: load_run_save_GSCV loads '<key>.joblib' when that file
# exists, so reusing the baseline's key could silently return the one-hot
# search instead of running this one.
key = 'gbr_default_cat_FE_target_enc'
my_s = load_run_save_GSCV(key, param_grid, save_s = False, adj_refit = False, cv = 8)

In [None]:
# Print the CV score of the target-encoding search. No validation score is
# printed while y_val is empty, and the estimator itself is not printed.
print_cv_val_score(my_s, print_best_est = False, adj_refit = False)

In [None]:
# Refit the best estimator on X (all feature groups) and write the test-set
# predictions to the submission file named below.
name = 'gbr_sub_num_ord_cat_FE.csv'
get_sub_csv(my_s, num_cols + ord_cols + cat_cols, name)

## Log

Mean target encoding of cat_cols (default) ---> CV: 0.12351, Test: