In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb

import optuna
import ray
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.utils import compute_class_weight, class_weight
from sklearn.manifold import Isomap
from typing import Tuple
from scipy.special import expit
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.manifold import Isomap

In [2]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = train['Class']

In [3]:
scaler = StandardScaler()

x_numerical_columns = train.drop(columns=["Id", "Class", "EJ"]).columns.tolist()
x_categorical_columns = ["EJ"]
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [4]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(X)

X = knn.fit_transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [5]:
outlier_df = X[X>10].dropna(how='all').dropna(how='all', axis=1)

outlier_index = outlier_df.loc[(y==0)].index.tolist()

X = X.drop(index=outlier_index).reset_index(drop=True)
y = y.drop(index=outlier_index).reset_index(drop=True)

X['EJ'] = X['EJ'].astype('category')

In [6]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess

def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
        True
    )

def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
    )

def score(p, y):

    p[p == 0] = 1e-15

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [7]:
def get_trials_df(trials_dataframe):
    col_index = [1] + [i for i in range(5, trials_dataframe.shape[1]-1)]

    trials_dataframe = trials_dataframe.iloc[:, col_index]
    trials_dataframe = trials_dataframe.groupby(trials_dataframe.columns.tolist()[1:]).mean()

    trials_dataframe = trials_dataframe.sort_values(by=['value'], ascending=True)

    return trials_dataframe

In [8]:
def objective(trial):

    xgb_params = {
        'learning_rate': 0.1,
        'min_child_weight': trial.suggest_categorical('min_child_weight', [i for i in range(8, 15)]),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.3, 1, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 3.5, 4.5, step=0.1),
        'max_depth': trial.suggest_categorical('max_depth', [8, 10, 12]),
        'max_delta_step': 4,
        'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.08, 0.1, 0.12, 0.18, 0.2]),
        'disable_default_eval_metric': True, 
        'seed': 5,
    }

    kf = StratifiedKFold(10, shuffle=True, random_state=30)
    cols = X.columns.tolist()

    xgb_scores = []
    
    for train_index, test_index in kf.split(X, y):
        X_train_val, X_test = X.loc[train_index], X.loc[test_index]
        y_train_val, y_test = y.loc[train_index], y.loc[test_index]

        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.05, 
                                                          stratify=y_train_val, random_state=32)

        sampler = RandomOverSampler()
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        n_components = 3
        isomap = Isomap(n_components=n_components)
        isomap.fit(X_train)

        x_isomap_train = isomap.transform(X_train)
        x_isomap_test = isomap.transform(X_test)
        x_isomap_val = isomap.transform(X_val)

        x_isomap_train = pd.DataFrame(x_isomap_train, columns=['isomap_' + str(i) for i in range(n_components)], index=X_train.index)
        x_isomap_test = pd.DataFrame(x_isomap_test, columns=['isomap_' + str(i) for i in range(n_components)], index=X_test.index)
        x_isomap_val = pd.DataFrame(x_isomap_val, columns=['isomap_' + str(i) for i in range(n_components)], index=X_val.index)

        X_train = pd.concat([X_train, x_isomap_train], axis=1)
        X_test = pd.concat([X_test, x_isomap_test], axis=1)
        X_val = pd.concat([X_val, x_isomap_val], axis=1)
        cols = X_train.columns.tolist()

        dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
        dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
        dval_xgb = xgb.DMatrix(X_val, y_val, feature_names=cols, enable_categorical=True)

        xgb_model = xgb.train(params=xgb_params,
                            dtrain=dtrain_xgb,
                            verbose_eval=False,
                            obj=balancedlogloss_xgb,
                            evals=[(dtrain_xgb, 'train'), (dval_xgb, 'validation')],
                            feval=balancedlogloss_eval_xgb,
                            num_boost_round=300,
                            early_stopping_rounds=10,
                            )

        xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

        xgb_score = score(xgb_test_preds, y_test)
        xgb_scores = xgb_scores + [xgb_score]

    if np.isnan(np.mean(xgb_scores)):
        raise optuna.exceptions.TrialPruned()
    
    return np.mean(xgb_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(direction='minimize', pruner=pruner)
study.optimize(objective, n_trials=50)

trials_dataframe = study.trials_dataframe()
get_trials_df(trials_dataframe)


[I 2023-06-27 18:46:39,907] A new study created in memory with name: no-name-3f6572b1-73d0-4da1-9db7-c7cf059c44e8


[I 2023-06-27 18:47:10,384] Trial 0 finished with value: 0.3087522021265533 and parameters: {'min_child_weight': 12, 'reg_lambda': 0.5, 'reg_alpha': 3.5, 'max_depth': 12, 'subsample': 0.5, 'colsample_bytree': 0.18}. Best is trial 0 with value: 0.3087522021265533.
[I 2023-06-27 18:47:47,458] Trial 1 finished with value: 0.29147330770715824 and parameters: {'min_child_weight': 13, 'reg_lambda': 0.95, 'reg_alpha': 3.8, 'max_depth': 8, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.1}. Best is trial 1 with value: 0.29147330770715824.
[I 2023-06-27 18:48:23,893] Trial 2 finished with value: 0.33138281434556804 and parameters: {'min_child_weight': 14, 'reg_lambda': 0.8500000000000001, 'reg_alpha': 4.3, 'max_depth': 8, 'subsample': 0.2, 'colsample_bytree': 0.08}. Best is trial 1 with value: 0.29147330770715824.
[I 2023-06-27 18:48:58,600] Trial 3 finished with value: 0.3494696876679245 and parameters: {'min_child_weight': 13, 'reg_lambda': 0.9000000000000001, 'reg_alpha': 4.1, 'max_d

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,value
params_colsample_bytree,params_max_depth,params_min_child_weight,params_reg_alpha,params_reg_lambda,params_subsample,Unnamed: 6_level_1
0.1,8,13,3.8,0.95,0.3,0.291473
0.12,10,12,4.3,0.65,0.2,0.29201
0.12,10,13,4.2,0.8,0.3,0.294662
0.1,8,13,4.0,0.7,0.4,0.297268
0.2,12,14,4.1,0.65,0.7,0.30111
0.12,10,13,4.5,0.95,0.3,0.303032
0.2,10,8,4.4,0.85,0.2,0.304179
0.12,10,13,4.3,0.7,0.2,0.304889
0.1,10,11,4.0,0.8,0.3,0.306882
0.18,12,12,3.5,0.5,0.5,0.308752


In [20]:
# tuned but not completed
xgb_param = study.best_params
xgb_param['learning_rate'] = 0.1
xgb_param['max_delta_step'] = 4
xgb_param['seed'] = 5
xgb_param['disable_default_eval_metric'] = True

lgb_param = {'learning_rate': 0.2,
             'lambda_l1': 40,
             'lambda_l2': 10,
             'subsample': 0.4,
             'colsample_bytree': 0.5,
             'verbosity': -1,
             'boosting_type': 'goss',
             }

kf = StratifiedKFold(10, shuffle=True, random_state=30)
cols = X.columns.tolist()

df_xgb_train, df_xgb_test = pd.DataFrame(), pd.DataFrame()
df_lgb_train, df_lgb_test = pd.DataFrame(), pd.DataFrame()

xgb_scores = []
lgb_scores = []
scores = []

for train_index, test_index in kf.split(X, y):

    X_train_val, X_test = X.loc[train_index], X.loc[test_index]
    y_train_val, y_test = y.loc[train_index], y.loc[test_index]

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val, test_size=0.05, random_state=32)

    sampler = RandomOverSampler()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    n_components = 3
    isomap = Isomap(n_components=n_components)
    isomap.fit(X_train)

    x_isomap_train = isomap.transform(X_train)
    x_isomap_test = isomap.transform(X_test)
    x_isomap_val = isomap.transform(X_val)

    x_isomap_train = pd.DataFrame(x_isomap_train, columns=['isomap_' + str(i) for i in range(n_components)], index=X_train.index)
    x_isomap_test = pd.DataFrame(x_isomap_test, columns=['isomap_' + str(i) for i in range(n_components)], index=X_test.index)
    x_isomap_val = pd.DataFrame(x_isomap_val, columns=['isomap_' + str(i) for i in range(n_components)], index=X_val.index)

    X_train = pd.concat([X_train, x_isomap_train], axis=1)
    X_test = pd.concat([X_test, x_isomap_test], axis=1)
    X_val = pd.concat([X_val, x_isomap_val], axis=1)
    cols = X_train.columns.tolist()

    evals_xgb = {}
    dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
    dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
    dval_xgb = xgb.DMatrix(X_val, y_val, feature_names=cols, enable_categorical=True)

    xgb_model = xgb.train(params=xgb_param,
                          dtrain=dtrain_xgb,
                          obj=balancedlogloss_xgb,
                          verbose_eval=False,
                          evals=[(dtrain_xgb, 'train'), (dval_xgb, 'val')],
                          feval=balancedlogloss_eval_xgb,
                          evals_result=evals_xgb,
                          early_stopping_rounds=20,
                          num_boost_round=300,
                          )
    
    df_xgb_train = pd.concat([df_xgb_train, pd.Series(evals_xgb['train']['balanced_logloss'])], axis=1)
    df_xgb_test = pd.concat([df_xgb_test, pd.Series(evals_xgb['val']['balanced_logloss'])], axis=1)

    xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
    xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

    xgb_score = score(xgb_test_preds, y_test)
    xgb_scores = xgb_scores + [xgb_score]
    print(xgb_score)

    evals_lgb = {}
    dtrain_lgb = lgb.Dataset(X_train, y_train)
    dtest_lgb = lgb.Dataset(X_test, y_test)
    dval_lgb = lgb.Dataset(X_val, y_val)

    lgb_model = lgb.train(params=lgb_param,
                          train_set=dtrain_lgb,
                          valid_sets=[dtrain_lgb, dval_lgb],
                          fobj=balancedlogloss_lgb,
                          feval=balancedlogloss_eval_lgb,
                          evals_result=evals_lgb,
                          valid_names=['train', 'val'],
                          num_boost_round=500,
                          early_stopping_rounds=2,
                          verbose_eval=False)

    df_lgb_train = pd.concat([df_lgb_train, pd.Series(evals_lgb['train']['balanced_logloss'])], axis=1)
    df_lgb_test = pd.concat([df_lgb_test, pd.Series(evals_lgb['val']['balanced_logloss'])], axis=1)

    lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
    lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

    lgb_score = score(lgb_test_preds, y_test)
    lgb_scores = lgb_scores + [lgb_score]
    print(lgb_score)

    stacked_preds_train = np.column_stack(((expit(xgb_train_preds)), (expit(lgb_train_preds))))
    stacked_preds_test = np.column_stack(((expit(xgb_test_preds)), (expit(lgb_test_preds))))

    meta_model = LogisticRegression(C=10, random_state=20)
    # meta_model = xgb.XGBClassifier()
    meta_model.fit(stacked_preds_train, y_train)
    ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

    ensemble_score = score(ensemble_preds, np.array(y_test))
    scores = scores + [ensemble_score]
    print('ensemble: ' + str(ensemble_score))

df_xgb = pd.DataFrame()
df_xgb['train'] = df_xgb_train.mean(axis=1)
df_xgb['val'] = df_xgb_test.mean(axis=1)

df_lgb = pd.DataFrame()
df_lgb['train'] = df_lgb_train.mean(axis=1)
df_lgb['val'] = df_lgb_test.mean(axis=1)

print('\n')
print('xgb: ' + str(np.mean(xgb_scores)))
print('lgb: ' + str(np.mean(lgb_scores)))
print('ensemble:' + str(np.mean(scores)))

0.3480299219515556
0.6758073702576946
ensemble: 0.29074432262241634
0.5432995333904612
0.6806512668785927
ensemble: 0.6201091008919584
0.3705253818828417
0.6764264054637226
ensemble: 0.356120582796209
0.4594165965285191
0.6738122082665903
ensemble: 0.47939363752571307
0.21552427367561244
0.6699694882385698
ensemble: 0.16801681811198582
0.2905431592883542
0.6694373584885963
ensemble: 0.3337217320382609
0.28767757759593016
0.6776906750851969
ensemble: 0.2890359506125468
0.23438752349016567
0.6661445430226921
ensemble: 0.23197171735571206
0.24765286135516362
0.6717067475850311
ensemble: 0.3040891599995785
0.11926825555710821
0.6677202140269065
ensemble: 0.11109402752703484


xgb: 0.3116325084715712
lgb: 0.6729366277313592
ensemble:0.3184297049481416
