In [1]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from scipy.special import expit
from typing import Tuple
from sklearn.manifold import Isomap
from sklearn.neural_network import MLPClassifier
from sklearn.utils import compute_class_weight, class_weight
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import ray
import optuna
import xgboost as xgb
import lightgbm as lgb
import warnings
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = train['Class']

In [3]:
scaler = RobustScaler()

x_numerical_columns = train.drop(
    columns=['Id', 'Class', 'EJ']).columns.tolist()
x_categorical_columns = ['EJ']
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [4]:
from sklearn.impute import KNNImputer

knn = KNNImputer(n_neighbors=5)
knn.fit(X)

X = knn.fit_transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [5]:
outlier_df = X[X > 10].dropna(how='all').dropna(how='all', axis=1)

outlier_index = outlier_df.loc[(y == 0)].index.tolist()

X = X.drop(index=outlier_index).reset_index(drop=True)
y = y.drop(index=outlier_index).reset_index(drop=True)

X['EJ'] = X['EJ'].astype('category')

In [6]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
        False
    )


def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
    )


def score(p, y):

    p[p == 0] = 1e-15
    p[p == 1] = 1-(1e-15)

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [7]:
def get_trials_df(trials_dataframe):
    col_index = [1] + [i for i in range(5, trials_dataframe.shape[1]-1)]

    trials_dataframe = trials_dataframe.iloc[:, col_index]
    trials_dataframe = trials_dataframe.groupby(
        trials_dataframe.columns.tolist()[1:]).mean()

    trials_dataframe = trials_dataframe.sort_values(
        by=['value'], ascending=True)

    return trials_dataframe

In [8]:
def xgb_objective(trial):

    xgb_params = {
        'learning_rate': 0.1,
        'min_child_weight': trial.suggest_categorical('min_child_weight', [i for i in range(8, 15)]),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 2, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 2, step=0.1),
        'max_depth': trial.suggest_categorical('max_depth', [3, 8, 10, 20]),
        'max_delta_step': 4,
        'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.15, 0.2, 0.3, 0.4]),
        'disable_default_eval_metric': True, 
        'seed': 5,
    }

    num_boost_round = trial.suggest_categorical('num_boost_round', [100, 150, 200])

    n_components = trial.suggest_categorical('n_components', [1, 3, 10])

    kf = StratifiedKFold(10, shuffle=True, random_state=30)
    cols = X.columns.tolist()

    xgb_scores = []
    
    for train_index, test_index in kf.split(X, y):
        try:
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]

            sampler = RandomOverSampler()
            X_train, y_train = sampler.fit_resample(X_train, y_train)

            isomap = Isomap(n_components=n_components)
            isomap.fit(np.array(X_train).astype('float64'))

            x_isomap_train = isomap.transform(X_train)
            x_isomap_test = isomap.transform(X_test)

            x_isomap_train = pd.DataFrame(x_isomap_train, 
                                          columns=['isomap_' + str(i) for i in range(n_components)], 
                                          index=X_train.index)
            x_isomap_test = pd.DataFrame(x_isomap_test, 
                                         columns=['isomap_' + str(i) for i in range(n_components)], 
                                         index=X_test.index)

            X_train = pd.concat([X_train, x_isomap_train], axis=1)
            X_test = pd.concat([X_test, x_isomap_test], axis=1)

            cols = X_train.columns.tolist()

            dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
            dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)

            xgb_model = xgb.train(params=xgb_params,
                                  dtrain=dtrain_xgb,
                                  verbose_eval=False,
                                  obj=balancedlogloss_xgb,
                                  feval=balancedlogloss_eval_xgb,
                                  num_boost_round=100,
                                )

            xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))
            xgb_score = score(xgb_test_preds, y_test)
            xgb_scores = xgb_scores + [xgb_score]
        
        except ValueError as e:
            print("An error occurred skipping this part")
            print(e)
            raise optuna.exceptions.TrialPruned()

    if np.isnan(np.mean(xgb_scores)):
        raise optuna.exceptions.TrialPruned()
    
    return np.mean(xgb_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
xgb_study = optuna.create_study(direction='minimize', pruner=pruner)
xgb_study.optimize(xgb_objective, n_trials=100)

xgb_trials_dataframe = xgb_study.trials_dataframe()
get_trials_df(xgb_trials_dataframe).to_excel('xgb_trials.xlsx')
get_trials_df(xgb_trials_dataframe)


[I 2023-07-01 17:49:32,691] A new study created in memory with name: no-name-f4e92176-8015-41db-8c57-54ff6cda4ec8
[I 2023-07-01 17:49:46,143] Trial 0 finished with value: 0.23531573945472223 and parameters: {'min_child_weight': 14, 'reg_lambda': 2.0, 'reg_alpha': 0.4, 'max_depth': 3, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.2, 'num_boost_round': 200, 'n_components': 10}. Best is trial 0 with value: 0.23531573945472223.
[I 2023-07-01 17:50:00,334] Trial 1 finished with value: 0.2537906785149122 and parameters: {'min_child_weight': 8, 'reg_lambda': 1.8000000000000003, 'reg_alpha': 1.4000000000000001, 'max_depth': 3, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.2, 'num_boost_round': 200, 'n_components': 10}. Best is trial 0 with value: 0.23531573945472223.
[I 2023-07-01 17:50:17,428] Trial 2 finished with value: 0.243061913548709 and parameters: {'min_child_weight': 11, 'reg_lambda': 1.1, 'reg_alpha': 0.6, 'max_depth': 3, 'subsample': 0.2, 'colsample_bytree': 0.3,

An error occurred skipping this part
Input X contains infinity or a value too large for dtype('float64').


[I 2023-07-01 17:51:48,274] Trial 9 finished with value: 0.25195508658592397 and parameters: {'min_child_weight': 8, 'reg_lambda': 1.9000000000000001, 'reg_alpha': 1.7000000000000002, 'max_depth': 3, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'num_boost_round': 100, 'n_components': 3}. Best is trial 4 with value: 0.22987130040469098.
[I 2023-07-01 17:52:02,605] Trial 10 finished with value: 0.23724493653744433 and parameters: {'min_child_weight': 14, 'reg_lambda': 0.30000000000000004, 'reg_alpha': 1.0, 'max_depth': 8, 'subsample': 1.0, 'colsample_bytree': 0.4, 'num_boost_round': 150, 'n_components': 1}. Best is trial 4 with value: 0.22987130040469098.
[I 2023-07-01 17:52:17,731] Trial 11 finished with value: 0.27366601202190427 and parameters: {'min_child_weight': 10, 'reg_lambda': 1.3000000000000003, 'reg_alpha': 0.2, 'max_depth': 10, 'subsample': 0.7, 'colsample_bytree': 0.4, 'num_boost_round': 100, 'n_components': 3}. Best is trial 4 with value: 0.22987130040469098.


In [None]:
""" def lgb_objective(trial):

    lgb_params = {
        'learning_rate': 0.1,
        'lambda_l2': trial.suggest_int('lambda_l2', 5, 20, step=3),
        'lambda_l1': trial.suggest_categorical('lambda_l1', [0, 0.5]),
        'subsample': trial.suggest_float('subsample', 0.1, 0.6, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.2]),
        'max_bins': trial.suggest_int('max_bins', 70, 100, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 20, step=2),
        'random_seed': 5,
        'first_metric_only': True,
        'verbosity': -1,
    }

    n_components = trial.suggest_categorical('n_components', [1, 2, 3])
    num_boost_round = trial.suggest_categorical('num_boost_round', [90, 100, 150])

    lgb_test_scores = []
    lgb_train_scores = []

    kf = StratifiedKFold(10, shuffle=True)

    for train_index, test_index in kf.split(X, y):
        
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        sampler = RandomOverSampler(random_state=3)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        
        try:
            isomap = Isomap(n_components=n_components, metric='manhattan')
            isomap.fit(X_train)
            x_isomap_train = isomap.transform(X_train)
            x_isomap_test = isomap.transform(X_test)

            x_isomap_train = pd.DataFrame(x_isomap_train, columns=[
                                        'isomap_' + str(i) for i in range(n_components)], index=X_train.index)
            x_isomap_test = pd.DataFrame(x_isomap_test, columns=[
                                        'isomap_' + str(i) for i in range(n_components)], index=X_test.index)

            X_train = pd.concat([X_train, x_isomap_train], axis=1)
            X_test = pd.concat([X_test, x_isomap_test], axis=1)

            dtrain_lgb = lgb.Dataset(X_train, y_train)
            dtest_lgb = lgb.Dataset(X_test, y_test)

            lgb_evals = {}
            lgb_model = lgb.train(params=lgb_params,
                                train_set=dtrain_lgb,
                                fobj=balancedlogloss_lgb,
                                feval=balancedlogloss_eval_lgb,
                                num_boost_round=num_boost_round,
                                )

            lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))
            lgb_test_score = score(lgb_test_preds, y_test)
            lgb_test_scores = lgb_test_scores + [lgb_test_score]

            lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
            lgb_train_score = score(lgb_train_preds, y_train)
            lgb_train_scores = lgb_train_scores + [lgb_train_score]

        except ValueError:
            print("An error occurred during Isomap fitting or transforming, skipping this part")
            optuna.exceptions.TrialPruned()

    print(('train', np.mean(lgb_train_scores)), ('test', np.mean(lgb_test_scores)))
    return np.mean(lgb_test_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
lgb_study = optuna.create_study(direction='minimize', pruner=pruner)
lgb_study.optimize(lgb_objective, n_trials=20)

lgb_trials_dataframe = lgb_study.trials_dataframe()
get_trials_df(lgb_trials_dataframe)
 """

In [None]:
xgb_params = {
    'learning_rate': 0.1,
    'min_child_weight': xgb_study.best_params['min_child_weight'],
    'reg_lambda': xgb_study.best_params['reg_lambda'],
    'reg_alpha': xgb_study.best_params['reg_alpha'],
    'max_depth': xgb_study.best_params['max_depth'],
    'max_delta_step': 4,
    'subsample': xgb_study.best_params['subsample'],
    'colsample_bytree': xgb_study.best_params['colsample_bytree'],
    'disable_default_eval_metric': True, 
    'seed': 5,
}

""" lgb_params = {
    'learning_rate': 0.1,
    'lambda_l2': lgb_study.best_params['lambda_l2'],
    'lambda_l1': lgb_study.best_params['lambda_l1'],
    'subsample': lgb_study.best_params['subsample'],
    'colsample_bytree': lgb_study.best_params['colsample_bytree'],
    'max_bins': lgb_study.best_params['max_bins'],
    'num_leaves': lgb_study.best_params['num_leaves'],
    'random_seed': 5,
    'first_metric_only': True,
    'verbosity': -1,
} """

""" xgb_params = {
 'learning_rate': 0.1,
 'min_child_weight': 13,
 'reg_lambda': 0.75,
 'reg_alpha': 4.5,
 'max_depth': 8,
 'max_delta_step': 4,
 'subsample': 0.4,
 'colsample_bytree': 0.18,
 'disable_default_eval_metric': True,
 'seed': 5} """

lgb_params = {'learning_rate': 0.1,
 'lambda_l2': 5,
 'lambda_l1': 0.5,
 'subsample': 0.4,
 'colsample_bytree': 0.2,
 'max_bins': 80,
 'num_leaves': 12,
 'random_seed': 5,
 'first_metric_only': True,
 'verbosity': -1}

kf = StratifiedKFold(10, shuffle=True, random_state=30)
cols = X.columns.tolist()


xgb_test_scores = []
lgb_test_scores = []
logistic_test_scores = []
ensemble_scores = []
mlp_test_scores = []

k = 0

for train_index, test_index in kf.split(X, y):

    print(f'Fold {k}')
    k = k + 1

    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    sampler = RandomOverSampler()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    n_components = 3
    isomap = Isomap(n_components=n_components)
    isomap.fit(X_train)

    x_isomap_train = isomap.transform(X_train)
    x_isomap_test = isomap.transform(X_test)

    x_isomap_train = pd.DataFrame(x_isomap_train, columns=['isomap_' + str(i) for i in range(n_components)], index=X_train.index)
    x_isomap_test = pd.DataFrame(x_isomap_test, columns=['isomap_' + str(i) for i in range(n_components)], index=X_test.index)

    X_train = pd.concat([X_train, x_isomap_train], axis=1)
    X_test = pd.concat([X_test, x_isomap_test], axis=1)
    cols = X_train.columns.tolist()

    evals_xgb = {}
    dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
    dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)

    xgb_model = xgb.train(params=xgb_params,
                          dtrain=dtrain_xgb,
                          obj=balancedlogloss_xgb,
                          verbose_eval=10,
                          feval=balancedlogloss_eval_xgb,
                          evals_result=evals_xgb,
                          num_boost_round=100,
                          )
    
    xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
    xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

    xgb_test_score = score(xgb_test_preds, y_test)
    xgb_test_scores = xgb_test_scores + [xgb_test_score]
    print(f'xgb: {xgb_test_score}')
   
    evals_lgb = {}
    dtrain_lgb = lgb.Dataset(X_train, y_train)
    dtest_lgb = lgb.Dataset(X_test, y_test)

    lgb_model = lgb.train(params=lgb_params,
                          train_set=dtrain_lgb,
                          fobj=balancedlogloss_lgb,
                          feval=balancedlogloss_eval_lgb,
                          evals_result=evals_lgb,
                          num_boost_round=100,
                          verbose_eval=False)

    lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
    lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

    lgb_test_score = score(lgb_test_preds, y_test)
    lgb_test_scores = lgb_test_scores + [lgb_test_score]
    print(f'lgb: {lgb_test_score}')

    """ mlp = MLPClassifier(hidden_layer_sizes=(100, 100), 
                        max_iter=1000,
                        # alpha=0.5,
                        early_stopping=True,
                        )

    mlp.fit(x_isomap_train, y_train)
    mlp_train_preds = mlp.predict_proba(x_isomap_train)[:, 1]
    mlp_test_preds = mlp.predict_proba(x_isomap_test)[:, 1]

    mlp_train_score = score(mlp_train_preds, y_train)
    mlp_test_score = score(mlp_test_preds, y_test)
    mlp_test_scores = mlp_test_scores + [mlp_test_score]
    print(f'mlp_train: {mlp_train_score} \t mlp_test: {mlp_test_score}') """

    preds_train_mean = np.mean([xgb_train_preds, lgb_train_preds], axis=0)
    preds_test_mean = np.mean([xgb_test_preds, lgb_test_preds], axis=0)
    stacked_preds_train = np.column_stack((xgb_train_preds, lgb_train_preds, preds_train_mean))
    stacked_preds_test = np.column_stack((xgb_test_preds, lgb_test_preds, preds_test_mean))

    meta_model = LogisticRegression(C=0.2)
    meta_model.fit(stacked_preds_train, y_train)
    ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

    ensemble_score = score(ensemble_preds, np.array(y_test))
    ensemble_scores = ensemble_scores + [ensemble_score]
    print(f'ensemble: {ensemble_score}')

print(f'xgb: {np.mean(xgb_test_scores)}')
print(f'lgb: {np.mean(lgb_test_scores)}')
# print(f'logistic: {np.mean(logistic_test_scores)}')
# print(f'mlp: {np.mean(mlp_test_scores)}')
print(f'ensemble: {np.mean(ensemble_scores)}')