In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Homework description

1. Take data from house pricing (kaggle)
2. Try different GB libraries for regression:
- sklearn, 10 models ensemble
- lightgbm, 10 models ensemble
- xgboost, 10 models ensemble
- catboost, 10 models ensemble
- keras, 10 models ensemble
3. What library is the best in this problem?
4. Build ensemble (averaging)


In [3]:
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [4]:
numeric_columns = [i for i, j in zip(train_data.columns, train_data.dtypes) if j in [np.int64, np.float64] and i not in ['SalePrice', 'Id']]
x_train = train_data[numeric_columns].fillna(-1)
x_test = test_data[numeric_columns].fillna(-1)

In [5]:
y_train = np.log(train_data['SalePrice'])

In [6]:
def rmse(a, b):
    return ((a - b) ** 2).mean() ** 0.5

In [8]:
metrics = []

Gradient Boosting: sklearn

(from academy's presentation)

Pros:
- simple
- feature importances
- out of box

Cons:
- many parameters
- big and slow
- custom loss implementation is not simple

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
from sklearn.model_selection import train_test_split

K = 10

rmse_tr_sklearn = []
rmse_val_sklearn = []

preds_tr_sklearn = []
preds_val_sklearn = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = GradientBoostingRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_sklearn.append(preds_tr)
    preds_val_sklearn.append(preds_val)
    
    rmse_tr_sklearn.append(rmse(preds_tr, y_tr))
    rmse_val_sklearn.append(rmse(preds_val, y_val))
    
metrics = []
metrics.append({'RMSE for sklearn train' : np.mean(rmse_tr_sklearn)})
metrics.append({'RMSE for sklearn val' : np.mean(rmse_val_sklearn)})

metrics

In [56]:
model_sklearn = GradientBoostingRegressor().fit(x_train, y_train)
y_pred_sklearn = model_sklearn.predict(x_test)
submit1 = pd.DataFrame()
submit1['Id'] = test_data['Id']
submit1['SalePrice'] = np.exp(y_pred_sklearn)

submit1.to_csv('/kaggle/working/gb_sklearn_default.csv', index=False)

public score of sklearn GB: 0.13672

Gradient Boosting: LightGBM

(from academy's presentation)

Pros:
- small fast
- feature importances
- out of box
- simple implementation for custom loss

Cons:
- too many parameters with synonyms
- gigantic API
- GPU support hard

In [10]:
import lightgbm as lgbm

rmse_tr_lgbm = []
rmse_val_lgbm = []

preds_tr_lgbm = []
preds_val_lgbm = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = lgbm.LGBMRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_lgbm.append(preds_tr)
    preds_val_lgbm.append(preds_val)
    
    rmse_tr_lgbm.append(rmse(preds_tr, y_tr))
    rmse_val_lgbm.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for lgbm train' : np.mean(rmse_tr_lgbm)})
metrics.append({'RMSE for lgbm val' : np.mean(rmse_val_lgbm)})

metrics

In [57]:
model_lgb = lgbm.LGBMRegressor().fit(x_train, y_train)
y_pred_lgb = model_lgb.predict(x_test)
submit2 = pd.DataFrame()
submit2['Id'] = test_data['Id']
submit2['SalePrice'] = np.exp(y_pred_lgb)

submit2.to_csv('/kaggle/working/lgb_default.csv', index=False)

public score of sklearn LGB: 0.14099

Gradient Boosting: XGBoost

(from academy's presentation)

Pros:
- simple use
- small fast
- feature importances
- out of box GPU support
- simple implementation for custom loss

Cons:
- many parameters
- gigantic API

In [11]:
from xgboost import XGBRegressor

rmse_tr_xgb = []
rmse_val_xgb = []

preds_tr_xgb = []
preds_val_xgb = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = XGBRegressor().fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_xgb.append(preds_tr)
    preds_val_xgb.append(preds_val)
    
    rmse_tr_xgb.append(rmse(preds_tr, y_tr))
    rmse_val_xgb.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for xgb train' : np.mean(rmse_tr_xgb)})
metrics.append({'RMSE for xgb val' : np.mean(rmse_val_xgb)})

metrics

In [59]:
model_xgb = XGBRegressor().fit(x_train, y_train)
y_pred_xgb = model_xgb.predict(x_test)
submit3 = pd.DataFrame()
submit3['Id'] = test_data['Id']
submit3['SalePrice'] = np.exp(y_pred_xgb)

submit3.to_csv('/kaggle/working/xgb_default.csv', index=False)

public score of sklearn XGB: 0.15080

Gradient Boosting: CatBoost

(from academy's presentation)

Pros:
- optimized for categories
- accurate and fast
- small fast
- feature importances
- out of box

Cons:
- problems with custom loss

In [12]:
from catboost import CatBoostRegressor

rmse_tr_cat = []
rmse_val_cat = []

preds_tr_cat = []
preds_val_cat = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

    model = CatBoostRegressor(verbose=False).fit(x_tr, y_tr)
    
    preds_tr = model.predict(x_tr)
    preds_val = model.predict(x_val)

    preds_tr_cat.append(preds_tr)
    preds_val_cat.append(preds_val)
    
    rmse_tr_cat.append(rmse(preds_tr, y_tr))
    rmse_val_cat.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for cat train' : np.mean(rmse_tr_cat)})
metrics.append({'RMSE for cat val' : np.mean(rmse_val_cat)})

metrics

In [60]:
model_cat = CatBoostRegressor(verbose=False).fit(x_train, y_train)
y_pred_cat = model_lgb.predict(x_test)
submit4 = pd.DataFrame()
submit4['Id'] = test_data['Id']
submit4['SalePrice'] = np.exp(y_pred_cat)

submit4.to_csv('/kaggle/working/catboost_default.csv', index=False)

public score of sklearn XGB: 0.14099

Gradient Boosting: TensorFlow

(from academy's presentation)

Pros:
- simple use
- small fast
- feature importances
- out of box GPU
- simple implementation for custom loss
- good documentation

Cons:
- gigantic docs

In [62]:
# !pip install tensorflow_decision_forests
import tensorflow_decision_forests as tfdf

rmse_tr_keras = []
rmse_val_keras = []

preds_tr_keras = []
preds_val_keras = []

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)
    tr = pd.concat([x_tr, y_tr], axis=1)
    tf_tr = tfdf.keras.pd_dataframe_to_tf_dataset(tr, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
    
    model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
    model.fit(x=tf_tr)
    
    preds_tr = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_tr, task=tfdf.keras.Task.REGRESSION))
    preds_val = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_val, task=tfdf.keras.Task.REGRESSION))
    
    preds_tr = np.array([pred_tr_el for pred_tr in preds_tr for pred_tr_el in pred_tr])
    preds_val = np.array([pred_val_el for pred_val in preds_val for pred_val_el in pred_val])
        
    preds_tr_keras.append(preds_tr)
    preds_val_keras.append(preds_val)
    
    rmse_tr_keras.append(rmse(preds_tr, y_tr))
    rmse_val_keras.append(rmse(preds_val, y_val))
    
metrics.append({'RMSE for keras train' : np.mean(rmse_tr_keras)})
metrics.append({'RMSE for keras val' : np.mean(rmse_val_keras)})

metrics

In [63]:
train = pd.concat([x_train, y_train], axis=1)
tf_train = tfdf.keras.pd_dataframe_to_tf_dataset(train, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
model_keras = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
model_keras.fit(x=tf_train)
y_pred_keras = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_test, task=tfdf.keras.Task.REGRESSION))
y_pred_keras = np.array([preds_keras_el for pred_keras in y_pred_keras for preds_keras_el in pred_keras])   
submit5 = pd.DataFrame()
submit5['Id'] = test_data['Id']
submit5['SalePrice'] = np.exp(y_pred_keras)

submit5.to_csv('/kaggle/working/keras_gb_default.csv', index=False)

public score of tensorflow: 0.14689

In [31]:
sorted(metrics, key=lambda x: list(x.values())[0])

Looking at the validation scores, the best result brought catboost
Looking at the public score on test data, the order from the best to worse is:
1. sklearn
2. lgb and catboost
3. keras
4. xgb

In [36]:
y_pred_sklearn = []
y_pred_lgb = []
y_pred_xgb = []
y_pred_cat = []
y_pred_keras = []
    
model_sklearn = GradientBoostingRegressor().fit(x_train, y_train)
y_pred_sklearn.append(model_sklearn.predict(x_test))

model_lgb = lgbm.LGBMRegressor().fit(x_train, y_train)
y_pred_lgb.append(model_lgb.predict(x_test))

model_xgb = XGBRegressor().fit(x_train, y_train)
y_pred_xgb.append(model_xgb.predict(x_test))

model_cat = CatBoostRegressor(verbose=False).fit(x_train, y_train)
y_pred_cat.append(model_cat.predict(x_test))

train = pd.concat([x_train, y_train], axis=1)
tf_train = tfdf.keras.pd_dataframe_to_tf_dataset(train, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
model_keras = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
model_keras.fit(x=tf_train)
preds_keras = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_test, task=tfdf.keras.Task.REGRESSION))
y_pred_keras.append(np.array([preds_keras_el for pred_keras in preds_keras for preds_keras_el in pred_keras]))    

In [65]:
y_pred_ensemble = np.mean([y_pred_sklearn, y_pred_lgb, y_pred_xgb, y_pred_cat, y_pred_keras], axis=0)
y_pred_ensemble

In [43]:
submit = pd.DataFrame()
submit['Id'] = test_data['Id']
submit['SalePrice'] = np.exp(y_pred_ensemble)

submit

In [44]:
submit.to_csv('/kaggle/working/diff_gb_models_ensemble.csv', index=False)

public score of different GB implementations ensemble: 0.13505

which is better then any other implementation on its own