In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb

import optuna
import ray
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.utils import compute_class_weight, class_weight
from sklearn.manifold import Isomap
from typing import Tuple
from scipy.special import expit
from sklearn.linear_model import LogisticRegression

In [None]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = train['Class']

In [None]:
scaler = StandardScaler()

x_numerical_columns = train.drop(columns=["Id", "Class", "EJ"]).columns.tolist()
x_categorical_columns = ["EJ"]
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [None]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(X)

X = knn.fit_transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [None]:
outlier_df = X[X>10].dropna(how='all').dropna(how='all', axis=1)

outlier_index = outlier_df.loc[(y==0)].index.tolist()

X = X.drop(index=outlier_index).reset_index(drop=True)
y = y.drop(index=outlier_index).reset_index(drop=True)

X['EJ'] = X['EJ'].astype('category')

In [None]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess

def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
        True
    )

def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
    )

def score(p, y):

    p[p == 0] = 1e-15

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1/ n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [145]:
xgb_param = {'learning_rate': 0.05,
             'subsample': 0.7,
             'lambda': 40,
             'gamma': 20,
             'disable_default_eval_metric': True}

lgb_param = {'learning_rate': 0.05,
             'lambda_l1': 40,
             'lambda_l2': 10,
             'subsample': 0.7,
             'verbosity': -1,
             'colsample_bytree': 0.8,
             }

kf = KFold(10)
cols = X.columns.tolist()

df_xgb_train, df_xgb_test = pd.DataFrame(), pd.DataFrame()
df_lgb_train, df_lgb_test = pd.DataFrame(), pd.DataFrame()

xgb_scores = []
lgb_scores = []
scores = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # sampler = RandomOverSampler()
    sampler = SMOTE()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    evals_xgb = {}
    dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
    dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
    xgb_model = xgb.train(params=xgb_param,
                          dtrain=dtrain_xgb,
                          obj=balancedlogloss_xgb,
                          verbose_eval=False,
                          evals=[(dtrain_xgb, 'train'), (dtest_xgb, 'test')],
                          feval=balancedlogloss_eval_xgb,
                          evals_result=evals_xgb,
                          num_boost_round=500,
                        #   early_stopping_rounds=20,
                          )
    
    df_xgb_train = pd.concat([df_xgb_train, pd.Series(evals_xgb['train']['balanced_logloss'])], axis=1)
    df_xgb_test = pd.concat([df_xgb_test, pd.Series(evals_xgb['test']['balanced_logloss'])], axis=1)

    xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
    xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

    xgb_score = score(xgb_test_preds, y_test)
    xgb_scores = xgb_scores + [xgb_score]
    print(xgb_score)

    evals_lgb = {}
    dtrain_lgb = lgb.Dataset(X_train, y_train)
    dtest_lgb = lgb.Dataset(X_test, y_test)
    lgb_model = lgb.train(params=lgb_param,
                          train_set=dtrain_lgb,
                          valid_sets=[dtrain_lgb, dtest_lgb],
                          fobj=balancedlogloss_lgb,
                          feval=balancedlogloss_eval_lgb,
                          evals_result=evals_lgb,
                          valid_names=['train', 'test'],
                          num_boost_round=150,
                          verbose_eval=False)

    df_lgb_train = pd.concat([df_lgb_train, pd.Series(evals_lgb['train']['balanced_logloss'])], axis=1)
    df_lgb_test = pd.concat([df_lgb_test, pd.Series(evals_lgb['test']['balanced_logloss'])], axis=1)

    lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
    lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

    lgb_score = score(lgb_test_preds, y_test)
    lgb_scores = lgb_scores + [lgb_score]
    print(lgb_score)

    stacked_preds_train = np.column_stack((xgb_train_preds, lgb_train_preds))
    stacked_preds_test = np.column_stack((xgb_test_preds, lgb_test_preds))

    meta_model = LogisticRegression(C=2.0)
    meta_model.fit(stacked_preds_train, y_train)

    coefs = [i/np.sum(meta_model.coef_[0]) for i in meta_model.coef_[0]]
    ensemble_preds = coefs[0]*xgb_test_preds + coefs[1]*lgb_test_preds

    test_score = score(ensemble_preds, np.array(y_test))
    scores = scores + [test_score]
    print('ensemble: ' + str(test_score))

df_xgb = pd.DataFrame()
df_xgb['train'] = df_xgb_train.mean(axis=1)
df_xgb['test'] = df_xgb_test.mean(axis=1)

df_lgb = pd.DataFrame()
df_lgb['train'] = df_lgb_train.mean(axis=1)
df_lgb['test'] = df_lgb_test.mean(axis=1)

print('\n')
print('xgb: ' + str(np.mean(xgb_scores)))
print('lgb: ' + str(np.mean(lgb_scores)))
print('ensemble:' + str(np.mean(scores)))

0.3096564452571329
0.31057634226937636
ensemble: 0.30338087580102013
0.20383397251020607
0.2912837465944597
ensemble: 0.26670834535559107
0.6507522183350449
0.5591894967631519
ensemble: 0.5673677473887737
0.3727705369303955
0.3926642610816981
ensemble: 0.3743143587696942
0.24973399436065502
0.2968230423919521
ensemble: 0.27860853720592443
0.32579044541968405
0.363046008196092
ensemble: 0.33704108449138326
0.36896261013422677
0.42879258318588936
ensemble: 0.4051230795805713
0.2725259485264542
0.34308169547875805
ensemble: 0.31128519731407334
0.6179906908684624
0.4350784191365407
ensemble: 0.41010973831498543
0.432512374738088
0.3649735542230159
ensemble: 0.361292164309839


xgb: 0.38045292370803496
lgb: 0.3785509149320935
ensemble:0.3615231128531856
