In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb

import optuna
import ray
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.utils import compute_class_weight, class_weight
from sklearn.manifold import Isomap
from typing import Tuple
from scipy.special import expit
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = train['Class']

In [None]:
scaler = StandardScaler()

x_numerical_columns = train.drop(columns=["Id", "Class", "EJ"]).columns.tolist()
x_categorical_columns = ["EJ"]
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [None]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(X)

X = knn.fit_transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [None]:
outlier_df = X[X>10].dropna(how='all').dropna(how='all', axis=1)

outlier_index = outlier_df.loc[(y==0)].index.tolist()

X = X.drop(index=outlier_index).reset_index(drop=True)
y = y.drop(index=outlier_index).reset_index(drop=True)

X['EJ'] = X['EJ'].astype('category')

In [None]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess

def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
        True
    )

def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
    )

def score(p, y):

    p[p == 0] = 1e-15

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [296]:
xgb_param = {'learning_rate': 0.225,
             'min_child_weight': 2,
             'reg_lambda': 60,
             'reg_alpha': 10,
             'max_depth': 3,
             'max_delta_step': 4,
             'subsample': 0.2,
             'colsample_bytree': 0.3,
             'disable_default_eval_metric': True}

lgb_param = {'learning_rate': 0.05,
             'lambda_l1': 40,
             'lambda_l2': 40,
             'subsample': 0.4,
             'colsample_bytree': 0.5,
             'verbosity': -1,
             'boosting_type': 'goss',
             }

kf = KFold(10, shuffle=True, random_state=30)
cols = X.columns.tolist()

df_xgb_train, df_xgb_test = pd.DataFrame(), pd.DataFrame()
df_lgb_train, df_lgb_test = pd.DataFrame(), pd.DataFrame()

xgb_scores = []
lgb_scores = []
scores = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # sampler = RandomOverSampler()
    sampler = SMOTE()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    evals_xgb = {}
    dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
    dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
    xgb_model = xgb.train(params=xgb_param,
                          dtrain=dtrain_xgb,
                          obj=balancedlogloss_xgb,
                          verbose_eval=False,
                          evals=[(dtrain_xgb, 'train'), (dtest_xgb, 'test')],
                          feval=balancedlogloss_eval_xgb,
                          evals_result=evals_xgb,
                          num_boost_round=200,
                          )
    
    df_xgb_train = pd.concat([df_xgb_train, pd.Series(evals_xgb['train']['balanced_logloss'])], axis=1)
    df_xgb_test = pd.concat([df_xgb_test, pd.Series(evals_xgb['test']['balanced_logloss'])], axis=1)

    xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
    xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

    xgb_score = score(xgb_test_preds, y_test)
    xgb_scores = xgb_scores + [xgb_score]
    print(xgb_score)

    evals_lgb = {}
    dtrain_lgb = lgb.Dataset(X_train, y_train)
    dtest_lgb = lgb.Dataset(X_test, y_test)
    lgb_model = lgb.train(params=lgb_param,
                          train_set=dtrain_lgb,
                          valid_sets=[dtrain_lgb, dtest_lgb],
                          fobj=balancedlogloss_lgb,
                          feval=balancedlogloss_eval_lgb,
                          evals_result=evals_lgb,
                          valid_names=['train', 'test'],
                          num_boost_round=400,
                          verbose_eval=False)

    df_lgb_train = pd.concat([df_lgb_train, pd.Series(evals_lgb['train']['balanced_logloss'])], axis=1)
    df_lgb_test = pd.concat([df_lgb_test, pd.Series(evals_lgb['test']['balanced_logloss'])], axis=1)

    lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
    lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

    lgb_score = score(lgb_test_preds, y_test)
    lgb_scores = lgb_scores + [lgb_score]
    print(lgb_score)

    stacked_preds_train = np.column_stack(((expit(xgb_train_preds)), (expit(lgb_train_preds))))
    stacked_preds_test = np.column_stack(((expit(xgb_test_preds)), (expit(lgb_test_preds))))

    meta_model = LogisticRegression(C=10, random_state=20)
    # meta_model = xgb.XGBClassifier()
    meta_model.fit(stacked_preds_train, y_train)
    ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

    ensemble_score = score(ensemble_preds, np.array(y_test))
    scores = scores + [ensemble_score]
    print('ensemble: ' + str(ensemble_score))

df_xgb = pd.DataFrame()
df_xgb['train'] = df_xgb_train.mean(axis=1)
df_xgb['test'] = df_xgb_test.mean(axis=1)

df_lgb = pd.DataFrame()
df_lgb['train'] = df_lgb_train.mean(axis=1)
df_lgb['test'] = df_lgb_test.mean(axis=1)

print('\n')
print('xgb: ' + str(np.mean(xgb_scores)))
print('lgb: ' + str(np.mean(lgb_scores)))
print('ensemble:' + str(np.mean(scores)))

0.4496867876659467
0.45530969676446625
ensemble: 0.49859208738254096
0.24512471394878554
0.23070873802732084
ensemble: 0.2611372567294537
0.3554959807267247
0.29839365958354896
ensemble: 0.3509887382403441
0.4987750252997095
0.41997078940290633
ensemble: 0.5188434742950659
0.16977463685932298
0.17640480809000314
ensemble: 0.13543239606482232
0.3499967204955986
0.32532655277445305
ensemble: 0.4015455149902005
0.43650465045576237
0.411180128240631
ensemble: 0.3966788144002997
0.4937955294978261
0.5707474613739473
ensemble: 0.4498458057214003
0.35321475136421476
0.3170019673461753
ensemble: 0.341418090027324
0.1547741310694255
0.16172657008074004
ensemble: 0.1534329505448303


xgb: 0.35071429273833166
lgb: 0.33667703716841924
ensemble:0.3507915128396282


In [297]:
df_lgb

Unnamed: 0,train,test
0,0.687276,0.68851
1,0.683043,0.68575
2,0.677095,0.681134
3,0.671393,0.677005
4,0.665554,0.672413
5,0.660855,0.668884
6,0.655129,0.664259
7,0.649747,0.660236
8,0.644683,0.65659
9,0.638989,0.651917
