# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings


# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Read the prepared training features, test features and training labels.
X, X_test, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/prepared-data/x_train.csv',
        '/kaggle/input/prepared-data/x_test.csv',
        '/kaggle/input/prepared-data/y_train.csv',
    )
)

# Report the dimensions of each frame, then preview the first training rows.
for caption, frame in (('X shape: ', X), ('X_test shape: ', X_test), ('y shape: ', y)):
    print(caption, frame.shape)
print(X.head())

# Hold out a validation split (scikit-learn's default test size) with a fixed seed.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

X shape:  (300000, 68)
X_test shape:  (200000, 68)
y shape:  (300000, 1)
      cont0     cont1     cont2     cont3     cont4     cont5     cont6  \
0  0.771882  1.831469  0.757355  1.831778  1.734982  3.604858  1.614927   
1  0.839769  1.263666  0.868692  1.580342  1.315982  2.338179  1.623011   
2  0.803225  1.257000  0.884926  1.654517  1.894852  3.244596  1.367991   
3  0.709523  1.672173  0.740416  1.843903  1.903929  3.591256  1.233940   
4  0.876465  1.204973  0.886647  1.319137  1.632942  1.481867  1.601123   

      cont7     cont8     cont9  ...  cat11cat12M  cat11cat13M  cat11cat14M  \
0  0.417356  0.158744  0.256644  ...            0            0            0   
1  0.633500  0.362071  0.269906  ...            0            0            0   
2  0.649517  0.351679  0.825885  ...            0            0            0   
3  0.378396  0.221788  0.343216  ...            0            0            0   
4  0.751206  0.404528  0.713877  ...            0            0            0   

 

# Model Creation and Evaluation

In [2]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

def scorer(y, y_pred):
    """Return the ROC AUC of the scores ``y_pred`` against the true labels ``y``."""
    auc = roc_auc_score(y, y_pred)
    return auc

In [3]:
# XGBClassifier baseline, scored with 5-fold cross-validated ROC AUC
xgbc_model = XGBClassifier(
    min_child_weight=0.1,
    reg_lambda=100,
    booster='gbtree',
    objective='binary:logitraw',
    random_state=42,
)
xgbc_score = cross_val_score(xgbc_model, train_X, train_y, scoring='roc_auc', cv=5)
print('xgbc_score: ', np.mean(xgbc_score))

# LGBMClassifier baseline, same scoring protocol
ligthgbmc_model = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    random_state=42,
)
ligthgbmc_score = cross_val_score(ligthgbmc_model, train_X, train_y, scoring='roc_auc', cv=5)
print('ligthgbmc_score: ', np.mean(ligthgbmc_score))

# CatBoostClassifier baseline, same scoring protocol (training log silenced)
cbc_model = CatBoostClassifier(
    loss_function='Logloss',
    random_state=42,
    verbose=False,
)
cbc_score = cross_val_score(cbc_model, train_X, train_y, scoring='roc_auc', cv=5)
print('cbc_score: ', np.mean(cbc_score))

xgbc_score:  0.8898202612356174
ligthgbmc_score:  0.8879385374274603
cbc_score:  0.8909648517647316


# XGB Optuna

In [4]:
def objective(trial, data=X, target=y):
    """Optuna objective for XGBoost.

    Splits ``data``/``target`` 80/20 with a fixed seed, fits an
    ``XGBClassifier`` using hyperparameters sampled from ``trial`` (early
    stopping is monitored on the hold-out part), and returns the ROC AUC of
    the predicted positive-class probabilities on that hold-out part.
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # Hyperparameters sampled by the trial (the sampling order is kept as is).
    searched = {
        'max_depth': trial.suggest_int('max_depth', 3, 32),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 8000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
    }
    # Settings that are identical for every trial.
    fixed = {
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc',
    }

    booster = XGBClassifier(**searched, **fixed)
    booster.fit(
        fit_X,
        fit_y,
        eval_set=[(holdout_X, holdout_y)],
        early_stopping_rounds=333,
        verbose=False,
    )

    holdout_scores = booster.predict_proba(holdout_X)[:, 1]
    return roc_auc_score(holdout_y, holdout_scores)

In [5]:
#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=50)
#print('Number of finished trials: ', len(study.trials))
#print('Best trial: ', study.best_trial.params)
#print('Best value: ', study.best_value)

Number of finished trials: 1 Best trial: {'max_depth': 4, 'learning_rate': 0.1, 'n_estimators': 2616, 'min_child_weight': 36, 'gamma': 0.0001231342905079067, 'alpha': 5.138826788428377, 'lambda': 0.006952601632723477, 'colsample_bytree': 0.3019243613187322, 'subsample': 0.7474126793277557} Best value: 0.8941200673933261

Number of finished trials: 50 Best trial: {'max_depth': 6, 'learning_rate': 0.02, 'n_estimators': 2941, 'min_child_weight': 10, 'gamma': 0.027689264382343946, 'alpha': 2.239319562015662, 'lambda': 0.005116156806904708, 'colsample_bytree': 0.2018103901998171, 'subsample': 0.7452030806282816} Best value: 0.8951492161710065

# CatBoost Optuna

In [6]:
def objective(trial, data=X, target=y):
    """Optuna objective for CatBoost.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    data : feature table to split; defaults to the notebook-level ``X``.
    target : labels aligned with ``data``; defaults to the notebook-level ``y``.

    Returns
    -------
    ROC AUC of the predicted positive-class probabilities on the 20% hold-out
    split (the same split is used as the early-stopping evaluation set).
    """
    # Split the arguments that were passed in rather than the notebook globals,
    # so calling the objective with other data actually uses that data.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size=0.2, random_state=42)

    params = {
        # CatBoost does not accept tree depths above 16, so the search stops there.
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 8000),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        # Settings below are the same for every trial.
        'random_seed': 42,
        'task_type': 'GPU',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'bootstrap_type': 'Poisson'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val,y_val)], early_stopping_rounds = 222, verbose = False)

    # Score the positive-class probabilities on the hold-out split.
    y_pred = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, y_pred)

    return roc_auc

In [7]:
#study = optuna.create_study(direction = 'maximize')
#study.optimize(objective, n_trials = 50)
#print('Number of finished trials:', len(study.trials))
#print('Best trial:', study.best_trial.params)
#print('Best value:', study.best_value)

Number of finished trials: 50 Best trial: {'max_depth': 4, 'learning_rate': 0.1, 'n_estimators': 2877, 'max_bin': 200, 'min_data_in_leaf': 10, 'l2_leaf_reg': 0.09385107162927438, 'subsample': 0.7990428819543426} Best value: 0.8925910141177894

# LGBM Optuna

In [8]:
def objective(trial,data=X,target=y):
    """Optuna objective for LightGBM.

    Holds out 15% of ``data``/``target`` with a fixed seed, fits an
    ``LGBMClassifier`` with hyperparameters sampled from ``trial`` (early
    stopping is monitored on the hold-out part), and returns the ROC AUC of
    the predicted positive-class probabilities on that hold-out part.
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )

    # Hyperparameters sampled by the trial (the sampling order is kept as is).
    searched = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 64),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.005, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 8000),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
    }
    # Settings that are identical for every trial.
    fixed = {
        # Column positions 11 through 67 are passed as categorical features.
        'cat_feature': list(range(11, 68)),
        'n_jobs': -1,
        'random_state': 42,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        'device': 'gpu',
    }

    classifier = LGBMClassifier(**{**searched, **fixed})
    classifier.fit(
        fit_X,
        fit_y,
        eval_set=[(holdout_X, holdout_y)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=False,
    )

    holdout_scores = classifier.predict_proba(holdout_X)[:, 1]
    return roc_auc_score(holdout_y, holdout_scores)

In [9]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs (model training itself may still vary, e.g. on GPU).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Summarise the search: trial count, best hyperparameters and best score.
print('Number of finished trials: ', len(study.trials))
print('Best trial: ', study.best_trial.params)
print('Best value: ', study.best_value)

[32m[I 2022-01-14 09:28:46,832][0m A new study created in memory with name: no-name-0cdfba62-7b73-433b-8e88-6e2a5bcb39a4[0m
[32m[I 2022-01-14 09:32:02,412][0m Trial 0 finished with value: 0.8955073188832696 and parameters: {'reg_alpha': 5.1443519018289585, 'reg_lambda': 4.415244949244017, 'num_leaves': 26, 'min_child_samples': 61, 'max_depth': 26, 'learning_rate': 0.01, 'colsample_bytree': 0.37058327649790235, 'n_estimators': 5108, 'cat_smooth': 58, 'cat_l2': 7, 'min_data_per_group': 55}. Best is trial 0 with value: 0.8955073188832696.[0m
[32m[I 2022-01-14 09:35:21,640][0m Trial 1 finished with value: 0.8954177148782939 and parameters: {'reg_alpha': 5.358676472636174, 'reg_lambda': 7.636187731515871, 'num_leaves': 165, 'min_child_samples': 93, 'max_depth': 30, 'learning_rate': 0.005, 'colsample_bytree': 0.4892020956172842, 'n_estimators': 2465, 'cat_smooth': 63, 'cat_l2': 16, 'min_data_per_group': 189}. Best is trial 0 with value: 0.8955073188832696.[0m
[32m[I 2022-01-14 09:3

Number of finished trials:  50
Best trial:  {'reg_alpha': 3.444027580169452, 'reg_lambda': 2.6823672979117004, 'num_leaves': 211, 'min_child_samples': 54, 'max_depth': 50, 'learning_rate': 0.005, 'colsample_bytree': 0.26986900577926765, 'n_estimators': 6749, 'cat_smooth': 34, 'cat_l2': 14, 'min_data_per_group': 136}
Best value:  0.8966645758299353


Trial 32 finished with value: 0.8960118885713237 and parameters: {'reg_alpha': 5.028382776465415, 'reg_lambda': 7.969115943661513, 'num_leaves': 196, 'min_child_samples': 39, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.22772406492167746, 'n_estimators': 7028, 'cat_smooth': 38, 'cat_l2': 20, 'min_data_per_group': 199}. Best is trial 32 with value: 0.8960118885713237.

Number of finished trials: 50 Best trial: {'reg_alpha': 5.028382776465415, 'reg_lambda': 7.969115943661513, 'num_leaves': 196, 'min_child_samples': 39, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.22772406492167746, 'n_estimators': 7028, 'cat_smooth': 38, 'cat_l2': 20, 'min_data_per_group': 199}. Best is trial 32 with value: 0.8960118885713237.

In [10]:
# Plot the optimization history of the study.
plot_optimization_history(study)

In [11]:
# Plot the hyperparameter importances of the study.
plot_param_importances(study)

In [12]:
# Hard-coded LightGBM configuration: tuned values followed by fixed settings.
lgb_params = {
    # Tuned hyperparameters
    'reg_alpha': 5.028382776465415,
    'reg_lambda': 7.969115943661513,
    'num_leaves': 196,
    'min_child_samples': 39,
    'max_depth': 20,
    'learning_rate': 0.01,
    'colsample_bytree': 0.22772406492167746,
    'n_estimators': 7028,
    'cat_smooth': 38,
    'cat_l2': 20,
    'min_data_per_group': 199,
    # Fixed settings; column positions 11 through 67 are the categorical features.
    'cat_feature': list(range(11, 68)),
    'n_jobs': -1,
    'random_state': 42,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
    'device': 'gpu',
}

In [13]:
# Build the final LightGBM configuration from a copy of the best trial's tuned
# values, so the dict returned by the study is not modified in place, and then
# append the fixed settings that were not part of the search.
lgb_params = {
    **study.best_trial.params,
    'device': "gpu",
    'random_state': 42,
    # Column positions 11 through 67 are passed as categorical features.
    'cat_feature': list(range(11, 68)),
    'n_jobs': -1,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
}

In [14]:
NFOLDS = 20
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# Accumulates the average of the per-fold test-set probabilities.
predictions = np.zeros(X_test.shape[0])

for fold, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("--> Fold {}".format(fold + 1))

    # Rows used for fitting versus rows held out for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[test_index], y.iloc[test_index]

    lgb_model = LGBMClassifier(**lgb_params)
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=300,
        verbose=0,
    )

    # Positive-class probabilities on the held-out fold and on the test set;
    # each fold contributes an equal share to the final test prediction.
    y_preds = lgb_model.predict_proba(X_valid)[:, 1]
    predictions += lgb_model.predict_proba(X_test)[:, 1] / NFOLDS

    fold_auc = roc_auc_score(y_valid, y_preds, average="micro")
    print(": LGB - ROC AUC Score = {}".format(fold_auc))

--> Fold 1
: LGB - ROC AUC Score = 0.9021636273375255
--> Fold 2
: LGB - ROC AUC Score = 0.8917105762710301
--> Fold 3
: LGB - ROC AUC Score = 0.895656797009998
--> Fold 4
: LGB - ROC AUC Score = 0.8962422533291337
--> Fold 5
: LGB - ROC AUC Score = 0.8990828200994236
--> Fold 6
: LGB - ROC AUC Score = 0.8940285715192222
--> Fold 7
: LGB - ROC AUC Score = 0.8990148908232043
--> Fold 8
: LGB - ROC AUC Score = 0.8965821279672065
--> Fold 9
: LGB - ROC AUC Score = 0.8987658852874936
--> Fold 10
: LGB - ROC AUC Score = 0.900633301263989
--> Fold 11
: LGB - ROC AUC Score = 0.8973259033252727
--> Fold 12
: LGB - ROC AUC Score = 0.8953067192850737
--> Fold 13
: LGB - ROC AUC Score = 0.8961045230694875
--> Fold 14
: LGB - ROC AUC Score = 0.9011333666481998
--> Fold 15
: LGB - ROC AUC Score = 0.8940033719490117
--> Fold 16
: LGB - ROC AUC Score = 0.8964601702646395
--> Fold 17
: LGB - ROC AUC Score = 0.9013588388896292
--> Fold 18
: LGB - ROC AUC Score = 0.8971573811994907
--> Fold 19
: LGB - R

In [15]:
# Fill the sample submission's target column with the fold-averaged
# probabilities and write it out without the index column.
sub = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
sub['target'] = predictions
# to_csv returns None when given a path, so its result is not kept.
sub.to_csv("Sub_lgb_v2.csv", index = False)