In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# Homework description

1. Take data from house pricing (kaggle)
2. Try different GB libraries for regression:
- sklearn, 10 models ensemble
- lightgbm, 10 models ensemble
- xgboost, 10 models ensemble
- catboost, 10 models ensemble
- keras, 10 models ensemble
3. What library is the best in this problem?
4. Build ensemble (averaging)


In [3]:
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [4]:
numeric_columns = [i for i, j in zip(train_data.columns, train_data.dtypes) if j in [np.int64, np.float64] and i not in ['SalePrice', 'Id']]
x_train = train_data[numeric_columns].fillna(-1)
x_test = test_data[numeric_columns].fillna(-1)

In [5]:
y_train = np.log(train_data['SalePrice'])

In [6]:
def rmse(a, b):
    return ((a - b) ** 2).mean() ** 0.5

In [8]:
metrics = []

Gradient Boosting: sklearn

(from academy's presentation)

Pros:
- simple
- feature importances
- out of box

Cons:
- many parameters
- big and slow
- custom loss implementation is not simple

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
from sklearn.model_selection import train_test_split

K = 10

rmse_tr_sklearn = []
rmse_val_sklearn = []

preds_tr_sklearn = []
preds_val_sklearn = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = GradientBoostingRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_sklearn.append(preds_tr)
    preds_val_sklearn.append(preds_val)
    
    rmse_tr_sklearn.append(rmse(preds_tr, y_tr))
    rmse_val_sklearn.append(rmse(preds_val, y_val))
    
metrics = []
metrics.append({'RMSE for sklearn train' : np.mean(rmse_tr_sklearn)})
metrics.append({'RMSE for sklearn val' : np.mean(rmse_val_sklearn)})

metrics

100%|██████████| 10/10 [00:04<00:00,  2.44it/s]


[{'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for sklearn val': 0.13405301414243137}]

In [56]:
model_sklearn = GradientBoostingRegressor().fit(x_train, y_train)
y_pred_sklearn = model_sklearn.predict(x_test)
submit1 = pd.DataFrame()
submit1['Id'] = test_data['Id']
submit1['SalePrice'] = np.exp(y_pred_sklearn)

submit1.to_csv('/kaggle/working/gb_sklearn_default.csv', index=False)

public score of sklearn GB: 0.13672

Gradient Boosting: LightGBM

(from academy's presentation)

Pros:
- small fast
- feature importances
- out of box
- simple implementation for custom loss

Cons:
- too many parameters with synonyms
- gigantic API
- GPU support hard

In [10]:
import lightgbm as lgbm

rmse_tr_lgbm = []
rmse_val_lgbm = []

preds_tr_lgbm = []
preds_val_lgbm = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = lgbm.LGBMRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_lgbm.append(preds_tr)
    preds_val_lgbm.append(preds_val)
    
    rmse_tr_lgbm.append(rmse(preds_tr, y_tr))
    rmse_val_lgbm.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for lgbm train' : np.mean(rmse_tr_lgbm)})
metrics.append({'RMSE for lgbm val' : np.mean(rmse_val_lgbm)})

metrics

100%|██████████| 10/10 [00:01<00:00,  6.84it/s]


[{'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for sklearn val': 0.13405301414243137},
 {'RMSE for lgbm train': 0.047959156112600154},
 {'RMSE for lgbm val': 0.13735107256597223}]

In [57]:
model_lgb = lgbm.LGBMRegressor().fit(x_train, y_train)
y_pred_lgb = model_lgb.predict(x_test)
submit2 = pd.DataFrame()
submit2['Id'] = test_data['Id']
submit2['SalePrice'] = np.exp(y_pred_lgb)

submit2.to_csv('/kaggle/working/lgb_default.csv', index=False)

public score of sklearn LGB: 0.14099

Gradient Boosting: XGBoost

(from academy's presentation)

Pros:
- simple use
- small fast
- feature importances
- out of box GPU support
- simple implementation for custom loss

Cons:
- many parameters
- gigantic API

In [11]:
from xgboost import XGBRegressor

rmse_tr_xgb = []
rmse_val_xgb = []

preds_tr_xgb = []
preds_val_xgb = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = XGBRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_xgb.append(preds_tr)
    preds_val_xgb.append(preds_val)
    
    rmse_tr_xgb.append(rmse(preds_tr, y_tr))
    rmse_val_xgb.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for xgb train' : np.mean(rmse_tr_xgb)})
metrics.append({'RMSE for xgb val' : np.mean(rmse_val_xgb)})

metrics

100%|██████████| 10/10 [00:03<00:00,  2.81it/s]


[{'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for sklearn val': 0.13405301414243137},
 {'RMSE for lgbm train': 0.047959156112600154},
 {'RMSE for lgbm val': 0.13735107256597223},
 {'RMSE for xgb train': 0.008115376928497115},
 {'RMSE for xgb val': 0.14468284312571045}]

In [59]:
model_xgb = XGBRegressor().fit(x_train, y_train)
y_pred_xgb = model_xgb.predict(x_test)
submit3 = pd.DataFrame()
submit3['Id'] = test_data['Id']
submit3['SalePrice'] = np.exp(y_pred_xgb)

submit3.to_csv('/kaggle/working/xgb_default.csv', index=False)

public score of sklearn XGB: 0.15080

Gradient Boosting: CatBoost

(from academy's presentation)

Pros:
- optimized for categories
- accurate and fast
- small fast
- feature importances
- out of box

Cons:
- problems with custom loss

In [12]:
from catboost import CatBoostRegressor

rmse_tr_cat = []
rmse_val_cat = []

preds_tr_cat = []
preds_val_cat = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = CatBoostRegressor(verbose=False).fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_cat.append(preds_tr)
    preds_val_cat.append(preds_val)
    
    rmse_tr_cat.append(rmse(preds_tr, y_tr))
    rmse_val_cat.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for cat train' : np.mean(rmse_tr_cat)})
metrics.append({'RMSE for cat val' : np.mean(rmse_val_cat)})

metrics

100%|██████████| 10/10 [00:17<00:00,  1.71s/it]


[{'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for sklearn val': 0.13405301414243137},
 {'RMSE for lgbm train': 0.047959156112600154},
 {'RMSE for lgbm val': 0.13735107256597223},
 {'RMSE for xgb train': 0.008115376928497115},
 {'RMSE for xgb val': 0.14468284312571045},
 {'RMSE for cat train': 0.03835916398253965},
 {'RMSE for cat val': 0.13075140057730833}]

In [60]:
model_cat = CatBoostRegressor(verbose=False).fit(x_train, y_train)
y_pred_cat = model_lgb.predict(x_test)
submit4 = pd.DataFrame()
submit4['Id'] = test_data['Id']
submit4['SalePrice'] = np.exp(y_pred_cat)

submit4.to_csv('/kaggle/working/catboost_default.csv', index=False)

public score of sklearn XGB: 0.14099

Gradient Boosting: TensorFlow

(from academy's presentation)

Pros:
- simple use
- small fast
- feature importances
- out of box GPU
- simple implementation for custom loss
- good documentation

Cons:
- gigantic docs

In [62]:
# !pip install tensorflow_decision_forests
import tensorflow_decision_forests as tfdf

rmse_tr_keras = []
rmse_val_keras = []

preds_tr_keras = []
preds_val_keras = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)
    tr = pd.concat([x_tr, y_tr], axis=1)
    tf_tr = tfdf.keras.pd_dataframe_to_tf_dataset(tr, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
    
    model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
    model.fit(x=tf_tr)
    
    preds_tr = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_tr, task=tfdf.keras.Task.REGRESSION))
    preds_val = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_val, task=tfdf.keras.Task.REGRESSION))
    
    preds_tr = np.array([pred_tr_el for pred_tr in preds_tr for pred_tr_el in pred_tr])
    preds_val = np.array([pred_val_el for pred_val in preds_val for pred_val_el in pred_val])
        
    preds_tr_keras.append(preds_tr)
    preds_val_keras.append(preds_val)
    
    rmse_tr_keras.append(rmse(preds_tr, y_tr))
    rmse_val_keras.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for keras train' : np.mean(rmse_tr_keras)})
metrics.append({'RMSE for keras val' : np.mean(rmse_val_keras)})

metrics

  0%|          | 0/10 [00:00<?, ?it/s][INFO kernel.cc:1176] Loading model from path /tmp/tmp8i__us9a/model/ with prefix cef1cc9b2a324e37
[INFO kernel.cc:1022] Use fast generic engine




 10%|█         | 1/10 [00:02<00:25,  2.81s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmpx1csjflq/model/ with prefix 46cadea727fe430e
[INFO kernel.cc:1022] Use fast generic engine




 20%|██        | 2/10 [00:04<00:18,  2.32s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmp7xsu7v6q/model/ with prefix c198dec1426e4844
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine




 30%|███       | 3/10 [00:06<00:15,  2.26s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmptvewvlju/model/ with prefix f37d515e0b2946ef
[INFO kernel.cc:1022] Use fast generic engine




 40%|████      | 4/10 [00:09<00:13,  2.19s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmp_euk_ic1/model/ with prefix 90c359c2f3b64ebc
[INFO kernel.cc:1022] Use fast generic engine




 50%|█████     | 5/10 [00:11<00:11,  2.25s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmp6kfl80vg/model/ with prefix dd1b4e9fdf2c4f96
[INFO kernel.cc:1022] Use fast generic engine




 60%|██████    | 6/10 [00:13<00:09,  2.30s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmpdbi56sle/model/ with prefix 1d1e99b778a64398
[INFO kernel.cc:1022] Use fast generic engine




 70%|███████   | 7/10 [00:16<00:07,  2.39s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmpapd_i4ed/model/ with prefix 27d6499195804232
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine




 80%|████████  | 8/10 [00:18<00:04,  2.39s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmpqx29_z3k/model/ with prefix 7590de83ba4f4fdc
[INFO kernel.cc:1022] Use fast generic engine




 90%|█████████ | 9/10 [00:21<00:02,  2.54s/it][INFO kernel.cc:1176] Loading model from path /tmp/tmpydg8ugny/model/ with prefix 8e80739daaeb4dfb
[INFO kernel.cc:1022] Use fast generic engine




100%|██████████| 10/10 [00:25<00:00,  2.59s/it]


[{'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for sklearn val': 0.13405301414243137},
 {'RMSE for lgbm train': 0.047959156112600154},
 {'RMSE for lgbm val': 0.13735107256597223},
 {'RMSE for xgb train': 0.008115376928497115},
 {'RMSE for xgb val': 0.14468284312571045},
 {'RMSE for cat train': 0.03835916398253965},
 {'RMSE for cat val': 0.13075140057730833},
 {'RMSE for keras train': 0.05464199541941048},
 {'RMSE for keras val': 0.1364307020166531},
 {'RMSE for keras train': 0.06141507782338048},
 {'RMSE for keras val': 0.1402678440191205},
 {'RMSE for keras train': 0.06423386431418326},
 {'RMSE for keras val': 0.14036930237572198},
 {'RMSE for keras train': 0.06141507782338048},
 {'RMSE for keras val': 0.1402678440191205}]

In [63]:
train = pd.concat([x_train, y_train], axis=1)
tf_train = tfdf.keras.pd_dataframe_to_tf_dataset(train, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
model_keras = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
model_keras.fit(x=tf_train)
y_pred_keras = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_test, task=tfdf.keras.Task.REGRESSION))
y_pred_keras = np.array([preds_keras_el for pred_keras in y_pred_keras for preds_keras_el in pred_keras])   
submit5 = pd.DataFrame()
submit5['Id'] = test_data['Id']
submit5['SalePrice'] = np.exp(y_pred_keras)

submit5.to_csv('/kaggle/working/keras_gb_default.csv', index=False)

[INFO kernel.cc:1176] Loading model from path /tmp/tmp75_reg6j/model/ with prefix 0f9a97770b2c4322
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine




public score of tensorflow: 0.14689

In [31]:
sorted(metrics, key=lambda x: list(x.values())[0])

[{'RMSE for xgb train': 0.008115376928497115},
 {'RMSE for cat train': 0.03835916398253965},
 {'RMSE for lgbm train': 0.047959156112600154},
 {'RMSE for keras train': 0.05464199541941048},
 {'RMSE for keras train': 0.06141507782338048},
 {'RMSE for sklearn train': 0.08537249662003835},
 {'RMSE for cat val': 0.13075140057730833},
 {'RMSE for sklearn val': 0.13405301414243137},
 {'RMSE for keras val': 0.1364307020166531},
 {'RMSE for lgbm val': 0.13735107256597223},
 {'RMSE for keras val': 0.1402678440191205},
 {'RMSE for xgb val': 0.14468284312571045}]

Looking at the validation scores, the best result brought catboost
Looking at the public score on test data, the order from the best to worse is:
1. sklearn
2. lgb and catboost
3. keras
4. xgb

In [36]:
y_pred_sklearn = []
y_pred_lgb = []
y_pred_xgb = []
y_pred_cat = []
y_pred_keras = []
    
model_sklearn = GradientBoostingRegressor().fit(x_train, y_train)
y_pred_sklearn.append(model_sklearn.predict(x_test))

model_lgb = lgbm.LGBMRegressor().fit(x_train, y_train)
y_pred_lgb.append(model_lgb.predict(x_test))

model_xgb = XGBRegressor().fit(x_train, y_train)
y_pred_xgb.append(model_xgb.predict(x_test))

model_cat = CatBoostRegressor(verbose=False).fit(x_train, y_train)
y_pred_cat.append(model_cat.predict(x_test))

train = pd.concat([x_train, y_train], axis=1)
tf_train = tfdf.keras.pd_dataframe_to_tf_dataset(train, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
model_keras = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
model_keras.fit(x=tf_train)
preds_keras = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_test, task=tfdf.keras.Task.REGRESSION))
y_pred_keras.append(np.array([preds_keras_el for pred_keras in preds_keras for preds_keras_el in pred_keras]))    

  features_dataframe = dataframe.drop(label, 1)
[INFO kernel.cc:1176] Loading model from path /tmp/tmp8ol8c7sb/model/ with prefix 267f2cd593fe4cfd
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine




In [65]:
y_pred_ensemble = np.mean([y_pred_sklearn, y_pred_lgb, y_pred_xgb, y_pred_cat, y_pred_keras], axis=0)
y_pred_ensemble

array([11.72336781, 11.9356032 , 12.12047656, ..., 12.13145495,
       11.63907827, 12.40513155])

In [43]:
submit = pd.DataFrame()
submit['Id'] = test_data['Id']
submit['SalePrice'] = np.exp(y_pred_ensemble)

submit

Unnamed: 0,Id,SalePrice
0,1461,124778.960908
1,1462,155251.167104
2,1463,183322.835945
3,1464,186602.690812
4,1465,191040.778132
...,...,...
1454,2915,73193.241636
1455,2916,84264.141659
1456,2917,184300.331674
1457,2918,114572.344673


In [44]:
submit.to_csv('/kaggle/working/diff_gb_models_ensemble.csv', index=False)

public score of different GB implementations ensemble: 0.13505

which is better then any other implementation on its own