In [1]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from scipy.special import expit
from typing import Tuple
from sklearn.manifold import Isomap
from sklearn.utils import compute_class_weight, class_weight
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import ray
import optuna
import xgboost as xgb
import lightgbm as lgb
import warnings
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = train['Class']

In [3]:
scaler = StandardScaler()

x_numerical_columns = train.drop(
    columns=['Id', 'Class', 'EJ']).columns.tolist()
x_categorical_columns = ['EJ']
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [4]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(X)

X = knn.fit_transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [5]:
outlier_df = X[X > 10].dropna(how='all').dropna(how='all', axis=1)

outlier_index = outlier_df.loc[(y == 0)].index.tolist()

X = X.drop(index=outlier_index).reset_index(drop=True)
y = y.drop(index=outlier_index).reset_index(drop=True)

X['EJ'] = X['EJ'].astype('category')

In [6]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
        False
    )


def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
    )


def score(p, y):

    p[p == 0] = 1e-15

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [7]:
def get_trials_df(trials_dataframe):
    col_index = [1] + [i for i in range(5, trials_dataframe.shape[1]-1)]

    trials_dataframe = trials_dataframe.iloc[:, col_index]
    trials_dataframe = trials_dataframe.groupby(
        trials_dataframe.columns.tolist()[1:]).mean()

    trials_dataframe = trials_dataframe.sort_values(
        by=['value'], ascending=True)

    return trials_dataframe

In [8]:
""" def xgb_objective(trial):

    xgb_params = {
        'learning_rate': 0.1,
        'min_child_weight': trial.suggest_categorical('min_child_weight', [i for i in range(8, 15)]),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.3, 1, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 3.5, 4.5, step=0.1),
        'max_depth': trial.suggest_categorical('max_depth', [8, 10, 12]),
        'max_delta_step': 4,
        'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.08, 0.1, 0.12, 0.18, 0.2]),
        'disable_default_eval_metric': True, 
        'seed': 5,
    }

    kf = StratifiedKFold(10, shuffle=True, random_state=30)
    cols = X.columns.tolist()

    xgb_scores = []
    
    for train_index, test_index in kf.split(X, y):
        try:
            X_train_val, X_test = X.loc[train_index], X.loc[test_index]
            y_train_val, y_test = y.loc[train_index], y.loc[test_index]

            X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.05, 
                                                            stratify=y_train_val, random_state=32)

            sampler = RandomOverSampler()
            X_train, y_train = sampler.fit_resample(X_train, y_train)

            n_components = 3
            isomap = Isomap(n_components=n_components)
            isomap.fit(X_train)

            x_isomap_train = isomap.transform(X_train)
            x_isomap_test = isomap.transform(X_test)
            x_isomap_val = isomap.transform(X_val)

            x_isomap_train = pd.DataFrame(x_isomap_train, columns=['isomap_' + str(i) for i in range(n_components)], index=X_train.index)
            x_isomap_test = pd.DataFrame(x_isomap_test, columns=['isomap_' + str(i) for i in range(n_components)], index=X_test.index)
            x_isomap_val = pd.DataFrame(x_isomap_val, columns=['isomap_' + str(i) for i in range(n_components)], index=X_val.index)

            X_train = pd.concat([X_train, x_isomap_train], axis=1)
            X_test = pd.concat([X_test, x_isomap_test], axis=1)
            X_val = pd.concat([X_val, x_isomap_val], axis=1)
            cols = X_train.columns.tolist()

            dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
            dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
            dval_xgb = xgb.DMatrix(X_val, y_val, feature_names=cols, enable_categorical=True)

            xgb_model = xgb.train(params=xgb_params,
                                dtrain=dtrain_xgb,
                                verbose_eval=False,
                                obj=balancedlogloss_xgb,
                                evals=[(dtrain_xgb, 'train'), (dval_xgb, 'validation')],
                                feval=balancedlogloss_eval_xgb,
                                num_boost_round=300,
                                early_stopping_rounds=10,
                                )

            xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

            xgb_score = score(xgb_test_preds, y_test)
            xgb_scores = xgb_scores + [xgb_score]
        
        except ValueError:
            print("An error occurred during Isomap fitting or transforming, skipping this part")
            optuna.exceptions.TrialPruned()

    if np.isnan(np.mean(xgb_scores)):
        raise optuna.exceptions.TrialPruned()
    
    return np.mean(xgb_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
xgb_study = optuna.create_study(direction='minimize', pruner=pruner)
xgb_study.optimize(xgb_objective, n_trials=20)

xgb_trials_dataframe = xgb_study.trials_dataframe()
get_trials_df(xgb_trials_dataframe)
 """

' def xgb_objective(trial):\n\n    xgb_params = {\n        \'learning_rate\': 0.1,\n        \'min_child_weight\': trial.suggest_categorical(\'min_child_weight\', [i for i in range(8, 15)]),\n        \'reg_lambda\': trial.suggest_float(\'reg_lambda\', 0.3, 1, step=0.05),\n        \'reg_alpha\': trial.suggest_float(\'reg_alpha\', 3.5, 4.5, step=0.1),\n        \'max_depth\': trial.suggest_categorical(\'max_depth\', [8, 10, 12]),\n        \'max_delta_step\': 4,\n        \'subsample\': trial.suggest_float(\'subsample\', 0.2, 1, step=0.1),\n        \'colsample_bytree\': trial.suggest_categorical(\'colsample_bytree\', [0.08, 0.1, 0.12, 0.18, 0.2]),\n        \'disable_default_eval_metric\': True, \n        \'seed\': 5,\n    }\n\n    kf = StratifiedKFold(10, shuffle=True, random_state=30)\n    cols = X.columns.tolist()\n\n    xgb_scores = []\n    \n    for train_index, test_index in kf.split(X, y):\n        try:\n            X_train_val, X_test = X.loc[train_index], X.loc[test_index]\n         

In [9]:
""" def lgb_objective(trial):

    lgb_params = {
        'learning_rate': 0.1,
        'lambda_l2': trial.suggest_int('lambda_l2', 5, 20, step=3),
        'lambda_l1': trial.suggest_categorical('lambda_l1', [0, 0.5]),
        'subsample': trial.suggest_float('subsample', 0.1, 0.6, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.2]),
        'max_bins': trial.suggest_int('max_bins', 70, 100, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 20, step=2),
        'random_seed': 5,
        'first_metric_only': True,
        'verbosity': -1,
    }

    n_components = trial.suggest_categorical('n_components', [1, 2, 3])
    num_boost_round = trial.suggest_categorical('num_boost_round', [90, 100, 150])

    lgb_test_scores = []
    lgb_train_scores = []

    kf = StratifiedKFold(10, shuffle=True)

    for train_index, test_index in kf.split(X, y):
        
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        sampler = RandomOverSampler(random_state=3)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        
        try:
            isomap = Isomap(n_components=n_components, metric='manhattan')
            isomap.fit(X_train)
            x_isomap_train = isomap.transform(X_train)
            x_isomap_test = isomap.transform(X_test)

            x_isomap_train = pd.DataFrame(x_isomap_train, columns=[
                                        'isomap_' + str(i) for i in range(n_components)], index=X_train.index)
            x_isomap_test = pd.DataFrame(x_isomap_test, columns=[
                                        'isomap_' + str(i) for i in range(n_components)], index=X_test.index)

            X_train = pd.concat([X_train, x_isomap_train], axis=1)
            X_test = pd.concat([X_test, x_isomap_test], axis=1)

            dtrain_lgb = lgb.Dataset(X_train, y_train)
            dtest_lgb = lgb.Dataset(X_test, y_test)

            lgb_evals = {}
            lgb_model = lgb.train(params=lgb_params,
                                train_set=dtrain_lgb,
                                fobj=balancedlogloss_lgb,
                                feval=balancedlogloss_eval_lgb,
                                num_boost_round=num_boost_round,
                                )

            lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))
            lgb_test_score = score(lgb_test_preds, y_test)
            lgb_test_scores = lgb_test_scores + [lgb_test_score]

            lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
            lgb_train_score = score(lgb_train_preds, y_train)
            lgb_train_scores = lgb_train_scores + [lgb_train_score]

        except ValueError:
            print("An error occurred during Isomap fitting or transforming, skipping this part")
            optuna.exceptions.TrialPruned()

    print(('train', np.mean(lgb_train_scores)), ('test', np.mean(lgb_test_scores)))
    return np.mean(lgb_test_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
lgb_study = optuna.create_study(direction='minimize', pruner=pruner)
lgb_study.optimize(lgb_objective, n_trials=20)

lgb_trials_dataframe = lgb_study.trials_dataframe()
get_trials_df(lgb_trials_dataframe)
 """

' def lgb_objective(trial):\n\n    lgb_params = {\n        \'learning_rate\': 0.1,\n        \'lambda_l2\': trial.suggest_int(\'lambda_l2\', 5, 20, step=3),\n        \'lambda_l1\': trial.suggest_categorical(\'lambda_l1\', [0, 0.5]),\n        \'subsample\': trial.suggest_float(\'subsample\', 0.1, 0.6, step=0.1),\n        \'colsample_bytree\': trial.suggest_categorical(\'colsample_bytree\', [0.2]),\n        \'max_bins\': trial.suggest_int(\'max_bins\', 70, 100, step=10),\n        \'num_leaves\': trial.suggest_int(\'num_leaves\', 10, 20, step=2),\n        \'random_seed\': 5,\n        \'first_metric_only\': True,\n        \'verbosity\': -1,\n    }\n\n    n_components = trial.suggest_categorical(\'n_components\', [1, 2, 3])\n    num_boost_round = trial.suggest_categorical(\'num_boost_round\', [90, 100, 150])\n\n    lgb_test_scores = []\n    lgb_train_scores = []\n\n    kf = StratifiedKFold(10, shuffle=True)\n\n    for train_index, test_index in kf.split(X, y):\n        \n        X_train, X_t

In [13]:
""" xgb_params = {
    'learning_rate': 0.1,
    'min_child_weight': xgb_study.best_params['min_child_weight'],
    'reg_lambda': xgb_study.best_params['reg_lambda'],
    'reg_alpha': xgb_study.best_params['reg_alpha'],
    'max_depth': xgb_study.best_params['max_depth'],
    'max_delta_step': 4,
    'subsample': xgb_study.best_params['subsample'],
    'colsample_bytree': xgb_study.best_params['colsample_bytree'],
    'disable_default_eval_metric': True, 
    'seed': 5,
}

lgb_params = {
    'learning_rate': 0.1,
    'lambda_l2': lgb_study.best_params['lambda_l2'],
    'lambda_l1': lgb_study.best_params['lambda_l1'],
    'subsample': lgb_study.best_params['subsample'],
    'colsample_bytree': lgb_study.best_params['colsample_bytree'],
    'max_bins': lgb_study.best_params['max_bins'],
    'num_leaves': lgb_study.best_params['num_leaves'],
    'random_seed': 5,
    'first_metric_only': True,
    'verbosity': -1,
} """

xgb_params = {
 'learning_rate': 0.1,
 'min_child_weight': 13,
 'reg_lambda': 0.75,
 'reg_alpha': 4.5,
 'max_depth': 8,
 'max_delta_step': 4,
 'subsample': 0.4,
 'colsample_bytree': 0.18,
 'disable_default_eval_metric': True,
 'seed': 5}

lgb_params = {'learning_rate': 0.1,
 'lambda_l2': 5,
 'lambda_l1': 0.5,
 'subsample': 0.4,
 'colsample_bytree': 0.2,
 'max_bins': 80,
 'num_leaves': 12,
 'random_seed': 5,
 'first_metric_only': True,
 'verbosity': -1}

kf = StratifiedKFold(10, shuffle=True, random_state=30)
cols = X.columns.tolist()

df_xgb_train, df_xgb_test = pd.DataFrame(), pd.DataFrame()
df_lgb_train, df_lgb_test = pd.DataFrame(), pd.DataFrame()

xgb_test_scores = []
lgb_test_scores = []
ensemble_scores = []

for train_index, test_index in kf.split(X, y):

    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    sampler = RandomOverSampler()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    n_components = 3
    isomap = Isomap(n_components=n_components)
    isomap.fit(X_train)

    x_isomap_train = isomap.transform(X_train)
    x_isomap_test = isomap.transform(X_test)

    x_isomap_train = pd.DataFrame(x_isomap_train, columns=['isomap_' + str(i) for i in range(n_components)], index=X_train.index)
    x_isomap_test = pd.DataFrame(x_isomap_test, columns=['isomap_' + str(i) for i in range(n_components)], index=X_test.index)

    X_train = pd.concat([X_train, x_isomap_train], axis=1)
    X_test = pd.concat([X_test, x_isomap_test], axis=1)
    cols = X_train.columns.tolist()

    evals_xgb = {}
    dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
    dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)

    xgb_model = xgb.train(params=xgb_params,
                          dtrain=dtrain_xgb,
                          obj=balancedlogloss_xgb,
                          verbose_eval=False,
                          feval=balancedlogloss_eval_xgb,
                          evals_result=evals_xgb,
                          num_boost_round=100,
                          )
    
    xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
    xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

    xgb_test_score = score(xgb_test_preds, y_test)
    xgb_test_scores = xgb_test_scores + [xgb_test_score]
    print(f'xgb: {xgb_test_score}')

    evals_lgb = {}
    dtrain_lgb = lgb.Dataset(X_train, y_train)
    dtest_lgb = lgb.Dataset(X_test, y_test)

    lgb_model = lgb.train(params=lgb_params,
                          train_set=dtrain_lgb,
                          fobj=balancedlogloss_lgb,
                          feval=balancedlogloss_eval_lgb,
                          evals_result=evals_lgb,
                          num_boost_round=100,
                          verbose_eval=False)

    lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
    lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

    lgb_test_score = score(lgb_test_preds, y_test)
    lgb_test_scores = lgb_test_scores + [lgb_test_score]
    print(f'lgb: {lgb_test_score}')

    stacked_preds_train = np.column_stack(((expit(xgb_train_preds)), (expit(lgb_train_preds))))
    stacked_preds_test = np.column_stack(((expit(xgb_test_preds)), (expit(lgb_test_preds))))

    meta_model = LogisticRegression(C=5)
    # meta_model = xgb.XGBClassifier()
    meta_model.fit(stacked_preds_train, y_train)
    ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

    ensemble_score = score(ensemble_preds, np.array(y_test))
    ensemble_scores = ensemble_scores + [ensemble_score]
    print(f'ensemble: {ensemble_score}')


print('\n')
print(f'xgb: {np.mean(xgb_test_scores)}')
print(f'lgb: {np.mean(lgb_test_scores)}')
print(f'ensemble: {np.mean(ensemble_scores)}')

xgb: 0.28280554972081023
lgb: 0.2911328917019051
ensemble: 0.3113368065707118
xgb: 0.4547847927768122
lgb: 0.6609721867080084
ensemble: 0.6182513794652376
xgb: 0.31423385756598277
lgb: 0.37113181006799434
ensemble: 0.3809068012353133
xgb: 0.3942536670502953
lgb: 0.40883087578970784
ensemble: 0.4300709206205331
xgb: 0.24189116617156703
lgb: 0.19297906698568193
ensemble: 0.16462273866119823
xgb: 0.32290211665274743
lgb: 0.23507824562940272
ensemble: 0.2750940167375478
xgb: 0.29484458106219186
lgb: 0.3329104837048046
ensemble: 0.34190271713819503
xgb: 0.24837211495851114
lgb: 0.20443313305696392
ensemble: 0.21678901555792562
xgb: 0.2917008557356894
lgb: 0.30645423707940506
ensemble: 0.284575197499307
xgb: 0.22682511887513102
lgb: 0.15484390234350615
ensemble: 0.1798776344623449


xgb: 0.30726138205697384
lgb: 0.315876683306738
ensemble: 0.3203427227948314
