In [15]:
from typing import List, Optional
from tqdm import tqdm

import numpy as np
import pandas as pd


import scipy.stats as st
from scipy.stats import probplot, ks_2samp
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor

from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import missingno as msno

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train / test parts of the assignment data and report their sizes.
train = pd.read_csv("./datasets/assignment_2_train.csv")
test = pd.read_csv("./datasets/assignment_2_test.csv")

print("train.shape = {} rows, {} cols".format(train.shape[0], train.shape[1]))
print("test.shape = {} rows, {} cols".format(test.shape[0], test.shape[1]))


train.shape = 180000 rows, 394 cols
test.shape = 100001 rows, 394 cols


In [3]:
# Preview the first rows of the training frame to sanity-check the loaded columns.
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Name of the label column used by every model below.
target = 'isFraud'

# Candidate feature columns: everything after the first two columns of `train`.
# NOTE(review): the positional slice assumes those two columns are the
# transaction id and the label (as the preview above suggests) - confirm if the
# column order of the CSV ever changes.
var_names = list(train.columns[2:])

In [5]:
# Split the candidate features by dtype: numeric columns vs. object (string) columns.
# The builtin `object` is used instead of `np.object`: that alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
num_features = train[var_names].select_dtypes(include=[np.number]).columns.to_list()
cat_features = train[var_names].select_dtypes(include=[object]).columns.to_list()

print(f'count of numerical features {len(num_features)}')
print(f'count of categorical features {len(cat_features)}')

count of numerical features 378
count of categorical features 14


In [8]:
class FeatureGenerator:
    """Target- and ordinal-encode categorical columns of a DataFrame.

    For every column ``col`` in ``cat_features`` two derived columns are built:

    * ``col + '_'``   - mean of the target over the fitted rows of that level
      (target encoding);
    * ``col + 'lgb'`` - integer code of the level, stored as float64 (the
      notebook passes these columns to LightGBM as categorical features).

    Missing values and levels that were not present during ``fit`` are encoded
    as NaN in both derived columns.

    Parameters
    ----------
    cat_features : list of str
        Names of the categorical columns to encode.
    target : str, default "isFraud"
        Name of the label column used to compute the target encoding.
        Previously this was read from a notebook-level global.

    Attributes
    ----------
    new_cat_features : list of str
        Names of the target-encoded columns (filled by ``fit``).
    lgb_cat_features : list of str
        Names of the ordinal-encoded columns (filled by ``fit``).
    target_encodings : dict
        ``{feature: {level: mean target}}``.
    ordinal_encoding : dict
        ``{feature: {level: integer code}}``.
    """

    def __init__(self, cat_features, target="isFraud"):
        self.cat_features = cat_features
        self.target = target
        self.new_cat_features = []
        self.lgb_cat_features = []
        self.target_encodings = dict()
        self.ordinal_encoding = dict()

    def fit(self, train):
        """Learn the per-level encodings from ``train``.

        Parameters
        ----------
        train : pandas.DataFrame
            Must contain every column of ``cat_features`` and the target
            column. The frame is only read, never modified.

        Returns
        -------
        FeatureGenerator
            ``self``, so that calls can be chained.
        """
        # Reset state so that a repeated fit() neither duplicates the derived
        # column names nor keeps levels from a previous frame.
        self.new_cat_features = []
        self.lgb_cat_features = []
        self.target_encodings = dict()
        self.ordinal_encoding = dict()

        for feature in self.cat_features:
            self.new_cat_features.append(feature + '_')
            self.lgb_cat_features.append(feature + 'lgb')

            # One pass per feature instead of one boolean mask per level.
            # groupby drops missing keys, so NaN never becomes a level.
            self.target_encodings[feature] = (
                train.groupby(feature, sort=False)[self.target].mean().to_dict()
            )
            # Codes follow the order of first appearance; a missing value still
            # consumes its position so the codes of real levels stay the same
            # as with a plain enumerate over unique().
            self.ordinal_encoding[feature] = {
                level: code
                for code, level in enumerate(train[feature].unique())
                if not pd.isna(level)
            }
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the encoded columns added.

        The original categorical columns are additionally cast to ``str``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing every column of ``cat_features``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is left untouched.

        Raises
        ------
        KeyError
            If called before ``fit`` (no encodings are stored yet).
        """
        # Work on a copy: the input may be a slice of another frame, and
        # writing into it in place would silently modify the caller's data.
        df = df.copy()
        for feature in self.cat_features:
            df[feature + '_'] = (
                df[feature].map(self.target_encodings[feature]).astype("float64")
            )
            df[feature + 'lgb'] = (
                df[feature].map(self.ordinal_encoding[feature]).astype("float64")
            )

        # Use the generator's own column list rather than a notebook global.
        if len(self.cat_features) > 0:
            df[self.cat_features] = df[self.cat_features].astype(str)

        return df

In [13]:
# Hold out 20% of the labelled data for validation; the split is shuffled,
# stratified on the label and reproducible via the fixed random_state.
df_train, df_valid = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=train[target],
)

In [16]:
# Fit the encodings on the training part only, then apply them to both parts
# (training part first, validation part second).
features = FeatureGenerator(cat_features)
features.fit(df_train)
df_train, df_valid = (features.transform(part) for part in (df_train, df_valid))

In [17]:
# Re-fit the encodings on the full labelled set and apply them to the full
# train and test frames.
# NOTE(review): this rebinds `features`; the cells below only read the derived
# column names from it.
features = FeatureGenerator(cat_features)
features.fit(train)
train, test = (features.transform(frame) for frame in (train, test))

Задание 1: отобрать только числовые признаки и обучить модель XGBoost с параметром booster = gbtree. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [19]:
# Booster parameters for the native `xgb.train` API.
# The number of boosting rounds is intentionally not listed here: the training
# cells pass it through the `num_boost_round` argument, and an `n_estimators`
# key in this dict only produced the
# "Parameters: { n_estimators } might not be used" warning.
params_xgb = {"booster": "gbtree",
              "objective": "binary:logistic",
              "eval_metric": "auc",
              "learning_rate": 0.1,
              "reg_lambda": 100,
              "max_depth": 4,
              "gamma": 10,
              "nthread": -1,
              "seed": 27}

In [20]:
# Task 1: XGBoost on the numeric features only.
dtrain = xgb.DMatrix(df_train[num_features], label=df_train[target])
dvalid = xgb.DMatrix(df_valid[num_features], label=df_valid[target])

# Up to 1000 rounds with early_stopping_rounds=50; progress is logged every
# 50 rounds for both the train and the validation set.
model_xgb_num = xgb.train(
    params_xgb,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    maximize=True,
    early_stopping_rounds=50,
    verbose_eval=50,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64850	valid-auc:0.65575
[50]	train-auc:0.88045	valid-auc:0.88507
[100]	train-auc:0.89836	valid-auc:0.89863
[150]	train-auc:0.90670	valid-auc:0.90349
[200]	train-auc:0.90770	valid-auc:0.90416
[208]	train-auc:0.90770	valid-auc:0.90416


Задание 2: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 1.

In [21]:
# Numeric features followed by the target-encoded categorical columns.
new_feature = [*num_features, *features.new_cat_features]

In [22]:
# Task 2: XGBoost on numeric + target-encoded categorical features.
dtrain = xgb.DMatrix(df_train[new_feature], label=df_train[target])
dvalid = xgb.DMatrix(df_valid[new_feature], label=df_valid[target])

# NOTE(review): this rebinds `model_xgb_num`, so the numeric-only model trained
# earlier under the same name is no longer reachable after this cell.
model_xgb_num = xgb.train(
    params_xgb,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    maximize=True,
    early_stopping_rounds=50,
    verbose_eval=50,
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.64850	valid-auc:0.65575
[50]	train-auc:0.88842	valid-auc:0.89343
[100]	train-auc:0.90786	valid-auc:0.90642
[150]	train-auc:0.91562	valid-auc:0.91143
[200]	train-auc:0.91648	valid-auc:0.91182
[205]	train-auc:0.91648	valid-auc:0.91182


Задание 3: для числовых признаков обучить модель LightGBM. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [40]:
# Parameters for the native `lgb.train` API.
# Only one iteration-count key is kept: `num_boost_round` and `n_estimators`
# are aliases of the same LightGBM parameter, so listing both with different
# values (10000 vs 1000) made the effective limit ambiguous. The recorded
# training logs run well past 1000 iterations, i.e. 10000 was the value in
# effect, so that one is preserved.
params_lgb = {"boosting_type": "gbdt",
              "objective": "binary",
              "metric": "auc",
              "num_boost_round": 10000,
              "learning_rate": 0.01,
              "n_jobs": -1,
              "seed": 27}

In [41]:
# Task 3: LightGBM on the numeric features only.
dtrain = lgb.Dataset(df_train[num_features], label=df_train[target])
dvalid = lgb.Dataset(df_valid[num_features], label=df_valid[target])

# early_stopping_rounds=50; metrics are logged every 500 iterations for the
# training and the validation set.
model_lgb_num = lgb.train(
    params_lgb,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=500,
    categorical_feature="auto",
)

[LightGBM] [Info] Number of positive: 4113, number of negative: 139887
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31662
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 376
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028563 -> initscore=-3.526682
[LightGBM] [Info] Start training from score -3.526682
Training until validation scores don't improve for 50 rounds
[500]	training's auc: 0.944346	valid_1's auc: 0.925079
[1000]	training's auc: 0.967873	valid_1's auc: 0.936432
[1500]	training's auc: 0.978256	valid_1's auc: 0.941325
[2000]	training's auc: 0.98426	valid_1's auc: 0.944706
[2500]	training's auc: 0.988283	valid_1's auc: 0.947384
[3000]	training's auc: 0.991717	valid_1's auc: 0.949577
[3500]	training's auc: 0.993659	valid_1's auc: 0.951199
Early stopping, best iteration is:
[3888]	training's auc: 0.994793	valid_1's auc: 0.95254


Задание 4: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 3.

In [42]:
# Task 4: LightGBM on numeric + target-encoded categorical features.
dtrain = lgb.Dataset(df_train[new_feature], label=df_train[target])
dvalid = lgb.Dataset(df_valid[new_feature], label=df_valid[target])

# NOTE(review): this rebinds `model_lgb_num`, so the numeric-only model trained
# earlier under the same name is no longer reachable after this cell.
model_lgb_num = lgb.train(
    params_lgb,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=500,
    categorical_feature="auto",
)

[LightGBM] [Info] Number of positive: 4113, number of negative: 139887
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31768
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 390
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028563 -> initscore=-3.526682
[LightGBM] [Info] Start training from score -3.526682
Training until validation scores don't improve for 50 rounds
[500]	training's auc: 0.954435	valid_1's auc: 0.935091
[1000]	training's auc: 0.973527	valid_1's auc: 0.944858
[1500]	training's auc: 0.983757	valid_1's auc: 0.949055
[2000]	training's auc: 0.989681	valid_1's auc: 0.952462
[2500]	training's auc: 0.993322	valid_1's auc: 0.95553
[3000]	training's auc: 0.995438	valid_1's auc: 0.956912
[3500]	training's auc: 0.996797	valid_1's auc: 0.958178
[4000]	training's auc: 0.997796	valid_1's auc: 0.959084
Early stopping, best iteration is:
[4330]	training's auc: 0.998209	valid_1's auc: 0.959647


Задание 5: обработать категориальные признаки встроенным методом в LightGBM. Выполнить задание 3. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 3.

In [43]:
# Numeric features followed by the ordinal-encoded categorical columns.
lgb_features = [*num_features, *features.lgb_cat_features]

In [44]:
# Task 5: LightGBM with its built-in handling of categorical features.
# The feature list is `lgb_features` (defined in the previous cell); the cell
# previously referenced an undefined `lgbm_features` and only ran thanks to a
# leftover kernel variable.
dtrain = lgb.Dataset(data=df_train[lgb_features], 
                     label=df_train[target], 
                     categorical_feature=features.lgb_cat_features)

dvalid = lgb.Dataset(data=df_valid[lgb_features], 
                     label=df_valid[target],
                     categorical_feature=features.lgb_cat_features)

# early_stopping_rounds=50; metrics are logged every 500 iterations.
model_lgb_all_cat = lgb.train(params=params_lgb,
                              train_set=dtrain,  
                              valid_sets=[dtrain, dvalid],
                              categorical_feature=features.lgb_cat_features,
                              verbose_eval=500,
                              early_stopping_rounds=50)

[LightGBM] [Info] Number of positive: 4113, number of negative: 139887
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31835
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 390
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028563 -> initscore=-3.526682
[LightGBM] [Info] Start training from score -3.526682
Training until validation scores don't improve for 50 rounds
[500]	training's auc: 0.955084	valid_1's auc: 0.933298
[1000]	training's auc: 0.974084	valid_1's auc: 0.944636
[1500]	training's auc: 0.983933	valid_1's auc: 0.948529
[2000]	training's auc: 0.989812	valid_1's auc: 0.951704
[2500]	training's auc: 0.993409	valid_1's auc: 0.953856
[3000]	training's auc: 0.995338	valid_1's auc: 0.955657
[3500]	training's auc: 0.996915	valid_1's auc: 0.957256
Early stopping, best iteration is:
[3794]	training's auc: 0.997497	valid_1's auc: 0.957705


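Вывод: встроенная в LightGBM обработка категориальных признаков улучшает качество по сравнению с моделью только на числовых признаках из пункта 3 (valid AUC 0.9577 против 0.9525)
и даёт сопоставимый, лишь немного более низкий результат, чем target encoding из пункта 4 (valid AUC 0.9596).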

Задание 6: для числовых признаков обучить модель CatBoost. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

In [63]:
# Shared CatBoost settings for the three CatBoost experiments below:
# Logloss objective, AUC as the evaluation metric, early stopping after
# 50 rounds and a progress line every 500 iterations.
params_cb = dict(
    n_estimators=5000,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=30,
    early_stopping_rounds=50,
    verbose=500,
    max_depth=5,
    l2_leaf_reg=100,
    thread_count=6,
    random_seed=42,
)

In [64]:
# Task 6: CatBoost on the numeric features only.
dtrain = cb.Pool(data=df_train[num_features], label=df_train[target])
dvalid = cb.Pool(data=df_valid[num_features], label=df_valid[target])

model_cb_num = cb.CatBoostClassifier(**params_cb)
# The validation pool is passed as eval_set; early stopping is configured in params_cb.
model_cb_num.fit(dtrain, eval_set=dvalid)

0:	test: 0.6778214	best: 0.6778214 (0)	total: 51.4ms	remaining: 4m 16s
500:	test: 0.8895358	best: 0.8895495 (472)	total: 27.1s	remaining: 4m 3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8895494801
bestIteration = 472

Shrink model to first 473 iterations.


<catboost.core.CatBoostClassifier at 0x150acb6b9a0>

Задание 7: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 6.

In [65]:
# Task 7: CatBoost on numeric + target-encoded categorical features.
dtrain = cb.Pool(data=df_train[new_feature], label=df_train[target])
dvalid = cb.Pool(data=df_valid[new_feature], label=df_valid[target])

model_cb_all = cb.CatBoostClassifier(**params_cb)
# The validation pool is passed as eval_set; early stopping is configured in params_cb.
model_cb_all.fit(dtrain, eval_set=dvalid)

0:	test: 0.6806710	best: 0.6806710 (0)	total: 55.6ms	remaining: 4m 37s
500:	test: 0.8996157	best: 0.8996157 (500)	total: 27.1s	remaining: 4m 3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9026456852
bestIteration = 730

Shrink model to first 731 iterations.


<catboost.core.CatBoostClassifier at 0x15098678f10>

Задание 8: обработать категориальные признаки встроенным методом в CatBoost. Выполнить задание 6. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 7.

In [66]:
# Task 8: CatBoost with its built-in handling of categorical features.
# The raw categorical columns are declared through `cat_features` on both pools.
dtrain = cb.Pool(
    data=df_train[var_names],
    label=df_train[target],
    cat_features=cat_features,
)
dvalid = cb.Pool(
    data=df_valid[var_names],
    label=df_valid[target],
    cat_features=cat_features,
)

model_cb_all_cat = cb.CatBoostClassifier(**params_cb)
# The validation pool is passed as eval_set; early stopping is configured in params_cb.
model_cb_all_cat.fit(dtrain, eval_set=dvalid)

0:	test: 0.7309429	best: 0.7309429 (0)	total: 207ms	remaining: 17m 16s
500:	test: 0.8978831	best: 0.8978831 (500)	total: 1m 37s	remaining: 14m 38s
1000:	test: 0.9015727	best: 0.9015733 (985)	total: 3m 22s	remaining: 13m 29s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9015732736
bestIteration = 985

Shrink model to first 986 iterations.


<catboost.core.CatBoostClassifier at 0x15099518d90>

Вывод: использование встроенного в CatBoost метода обработки категориальных признаков даёт результат, сопоставимый с target encoding (valid AUC 0.9016 против 0.9026), но заметно увеличивает время обучения.