In [1]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from scipy.special import expit
from typing import Tuple
from sklearn.neural_network import MLPClassifier
from sklearn.utils import compute_class_weight, class_weight
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import ray
import optuna
import xgboost as xgb
import lightgbm as lgb
import warnings
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')

y = train['Class']

In [3]:
scaler = RobustScaler()

x_numerical_columns = train.drop(
    columns=['Id', 'Class', 'EJ']).columns.tolist()
x_categorical_columns = ['EJ']
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train[x_numerical_columns])

X = scaler.transform(train[x_numerical_columns])
X = np.concatenate((X, ej), axis=1)

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

imputer = KNNImputer(n_neighbors=5)
# imputer = IterativeImputer(max_iter=10)
imputer.fit(X)

X = imputer.transform(X)

X = pd.DataFrame(X, columns=x_cols)
X['EJ'] = X['EJ'].astype('int')

In [5]:
X['EJ'] = X['EJ'].astype('category')

In [6]:
def balancedlogloss_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_xgb(
    predt: np.ndarray, dtrain: xgb.DMatrix
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval_lgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
        False
    )


def balancedlogloss_eval_xgb(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        'balanced_logloss',
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) -
         1 / n1 * (sum(y * np.log(p)))) / 2,
    )


def score(p, y):

    p[p == 0] = 1e-15
    p[p == 1] = 1-(1e-15)

    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    return ((-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2)

In [7]:
def get_trials_df(trials_dataframe):
    col_index = [1] + [i for i in range(5, trials_dataframe.shape[1]-1)]

    trials_dataframe = trials_dataframe.iloc[:, col_index]
    trials_dataframe = trials_dataframe.groupby(
        trials_dataframe.columns.tolist()[1:]).mean()

    trials_dataframe = trials_dataframe.sort_values(
        by=['value'], ascending=True)

    return trials_dataframe

In [8]:
def xgb_objective(trial):

    xgb_params = {
        'learning_rate': 0.1,
        'min_child_weight': trial.suggest_categorical('min_child_weight', [i for i in range(8, 15)]),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 2, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 2, step=0.1),
        'max_depth': trial.suggest_categorical('max_depth', [3, 8, 10, 20]),
        'max_delta_step': 4,
        'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.15, 0.2, 0.3, 0.4]),
        'disable_default_eval_metric': True, 
        'seed': 5,
    }

    n_components = 3

    kf = StratifiedKFold(10, shuffle=True, random_state=30)
    cols = X.columns.tolist()

    xgb_scores = []
    
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        sampler = RandomOverSampler(random_state=3)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        cols = X_train.columns.tolist()

        dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
        dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)

        xgb_model = xgb.train(params=xgb_params,
                              dtrain=dtrain_xgb,
                              verbose_eval=False,
                              obj=balancedlogloss_xgb,
                              feval=balancedlogloss_eval_xgb,
                              num_boost_round=100,
                            )

        xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))
        xgb_score = score(xgb_test_preds, y_test)
        xgb_scores = xgb_scores + [xgb_score]
            
    return np.mean(xgb_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
xgb_study = optuna.create_study(direction='minimize', pruner=pruner)
xgb_study.optimize(xgb_objective, n_trials=50)

xgb_trials_dataframe = xgb_study.trials_dataframe()
get_trials_df(xgb_trials_dataframe)

[I 2024-01-08 10:19:32,640] A new study created in memory with name: no-name-4b793c0e-a918-407a-9213-f98d8f55dab9
[I 2024-01-08 10:19:35,696] Trial 0 finished with value: 0.2802409293784319 and parameters: {'min_child_weight': 9, 'reg_lambda': 1.3000000000000003, 'reg_alpha': 1.9000000000000001, 'max_depth': 20, 'subsample': 0.4, 'colsample_bytree': 0.4}. Best is trial 0 with value: 0.2802409293784319.
[I 2024-01-08 10:19:37,571] Trial 1 finished with value: 0.3203231556237826 and parameters: {'min_child_weight': 13, 'reg_lambda': 0.30000000000000004, 'reg_alpha': 1.7000000000000002, 'max_depth': 3, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.15}. Best is trial 0 with value: 0.2802409293784319.
[I 2024-01-08 10:19:39,925] Trial 2 finished with value: 0.31662396211543975 and parameters: {'min_child_weight': 10, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'max_depth': 3, 'subsample': 0.8, 'colsample_bytree': 0.4}. Best is trial 0 with value: 0.2802409293784319.
[I 2024-01-08 10:19:4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,value
params_colsample_bytree,params_max_depth,params_min_child_weight,params_reg_alpha,params_reg_lambda,params_subsample,Unnamed: 6_level_1
0.3,10,13,1.6,0.1,0.4,0.25549
0.3,8,13,0.1,0.9,0.4,0.255637
0.3,20,9,1.9,1.0,0.4,0.257266
0.3,10,13,0.8,0.4,0.4,0.257864
0.3,8,13,0.1,0.5,0.4,0.25835
0.3,10,11,1.1,0.4,0.4,0.259039
0.3,10,13,0.7,0.4,0.4,0.259106
0.3,8,13,0.1,0.5,0.5,0.259518
0.3,8,13,0.3,0.5,0.4,0.260306
0.3,20,9,1.6,1.5,0.4,0.262621


In [9]:
def lgb_objective(trial):

    lgb_params = {
        'learning_rate': 0.1,
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10, step=0.1),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10, step=0.1),
        'subsample': trial.suggest_float('subsample', 0.1, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1, 0.2, 0.3]),
        'max_bins': trial.suggest_int('max_bins', 10, 100, step=10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 20, step=2),
        'seed': 5,
        'first_metric_only': True,
        'verbosity': -1,
    }


    lgb_test_scores = []
    lgb_train_scores = []

    kf = StratifiedKFold(10, shuffle=True, random_state=30)

    for train_index, test_index in kf.split(X, y):
        
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        sampler = RandomOverSampler(random_state=3)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        
        dtrain_lgb = lgb.Dataset(X_train, y_train)
        dtest_lgb = lgb.Dataset(X_test, y_test)

        lgb_evals = {}
        lgb_model = lgb.train(
            params=lgb_params,
                            train_set=dtrain_lgb,
                            fobj=balancedlogloss_lgb,
                            feval=balancedlogloss_eval_lgb,
                            num_boost_round=100,
                            )

        lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))
        lgb_test_score = score(lgb_test_preds, y_test)
        lgb_test_scores = lgb_test_scores + [lgb_test_score]

        lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
        lgb_train_score = score(lgb_train_preds, y_train)
        lgb_train_scores = lgb_train_scores + [lgb_train_score]

    print(('train', np.mean(lgb_train_scores)), ('test', np.mean(lgb_test_scores)))
    return np.mean(lgb_test_scores)

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
lgb_study = optuna.create_study(direction='minimize', pruner=pruner)
lgb_study.optimize(lgb_objective, n_trials=30)

lgb_trials_dataframe = lgb_study.trials_dataframe()
get_trials_df(lgb_trials_dataframe)

[I 2024-01-08 10:21:57,931] A new study created in memory with name: no-name-4635aac2-1441-43cb-9234-180346a382ce
[I 2024-01-08 10:22:02,860] Trial 0 finished with value: 0.33596770854586955 and parameters: {'lambda_l2': 7.4, 'lambda_l1': 4.7, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.2, 'max_bins': 50, 'num_leaves': 16}. Best is trial 0 with value: 0.33596770854586955.


('train', 0.08374159895966009) ('test', 0.33596770854586955)


[I 2024-01-08 10:22:07,858] Trial 1 finished with value: 0.33625591259205795 and parameters: {'lambda_l2': 0.9, 'lambda_l1': 3.2, 'subsample': 0.8, 'colsample_bytree': 0.3, 'max_bins': 80, 'num_leaves': 20}. Best is trial 0 with value: 0.33596770854586955.


('train', 0.03803575769917051) ('test', 0.33625591259205795)


[I 2024-01-08 10:22:12,602] Trial 2 finished with value: 0.31708144172266234 and parameters: {'lambda_l2': 0.1, 'lambda_l1': 5.0, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09021913474478202) ('test', 0.31708144172266234)


[I 2024-01-08 10:22:17,444] Trial 3 finished with value: 0.41098882404143344 and parameters: {'lambda_l2': 3.9000000000000004, 'lambda_l1': 3.6, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.1, 'max_bins': 90, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09207650363538297) ('test', 0.41098882404143344)


[I 2024-01-08 10:22:21,399] Trial 4 finished with value: 0.31980597359588436 and parameters: {'lambda_l2': 3.2, 'lambda_l1': 3.3000000000000003, 'subsample': 0.6, 'colsample_bytree': 0.3, 'max_bins': 100, 'num_leaves': 12}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.060674307502419624) ('test', 0.31980597359588436)


[I 2024-01-08 10:22:25,880] Trial 5 finished with value: 0.39706678998526557 and parameters: {'lambda_l2': 3.9000000000000004, 'lambda_l1': 1.7000000000000002, 'subsample': 0.8, 'colsample_bytree': 0.1, 'max_bins': 50, 'num_leaves': 16}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09161323336615677) ('test', 0.39706678998526557)


[I 2024-01-08 10:22:30,359] Trial 6 finished with value: 0.33491054153356237 and parameters: {'lambda_l2': 7.300000000000001, 'lambda_l1': 5.0, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.3, 'max_bins': 70, 'num_leaves': 16}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.061347486636528746) ('test', 0.33491054153356237)


[I 2024-01-08 10:22:34,556] Trial 7 finished with value: 0.39196140214424496 and parameters: {'lambda_l2': 8.200000000000001, 'lambda_l1': 6.1000000000000005, 'subsample': 0.4, 'colsample_bytree': 0.1, 'max_bins': 80, 'num_leaves': 12}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.16110705252266613) ('test', 0.39196140214424496)


[I 2024-01-08 10:22:38,382] Trial 8 finished with value: 0.37932653726218746 and parameters: {'lambda_l2': 7.6000000000000005, 'lambda_l1': 0.30000000000000004, 'subsample': 0.6, 'colsample_bytree': 0.1, 'max_bins': 30, 'num_leaves': 14}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.11169449252144495) ('test', 0.37932653726218746)


[I 2024-01-08 10:22:42,965] Trial 9 finished with value: 0.3204876174981384 and parameters: {'lambda_l2': 7.7, 'lambda_l1': 3.5, 'subsample': 0.5, 'colsample_bytree': 0.3, 'max_bins': 100, 'num_leaves': 16}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.053616190029270226) ('test', 0.3204876174981384)


[I 2024-01-08 10:22:47,475] Trial 10 finished with value: 0.31938595250783164 and parameters: {'lambda_l2': 0.1, 'lambda_l1': 8.700000000000001, 'subsample': 0.2, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.12558712578277897) ('test', 0.31938595250783164)


[I 2024-01-08 10:22:51,900] Trial 11 finished with value: 0.32135148894330745 and parameters: {'lambda_l2': 0.1, 'lambda_l1': 9.4, 'subsample': 0.1, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.13169113991672926) ('test', 0.32135148894330745)


[I 2024-01-08 10:22:56,399] Trial 12 finished with value: 0.3183058500457777 and parameters: {'lambda_l2': 1.6, 'lambda_l1': 7.7, 'subsample': 0.1, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.12098637648433978) ('test', 0.3183058500457777)


[I 2024-01-08 10:23:00,833] Trial 13 finished with value: 0.3268456102951449 and parameters: {'lambda_l2': 1.9000000000000001, 'lambda_l1': 7.4, 'subsample': 0.9, 'colsample_bytree': 0.2, 'max_bins': 30, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09210782855307031) ('test', 0.3268456102951449)


[I 2024-01-08 10:23:06,043] Trial 14 finished with value: 0.32592319657613145 and parameters: {'lambda_l2': 2.2, 'lambda_l1': 7.2, 'subsample': 0.1, 'colsample_bytree': 0.2, 'max_bins': 30, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09119206885474676) ('test', 0.32592319657613145)


[I 2024-01-08 10:23:10,448] Trial 15 finished with value: 0.34401325716577813 and parameters: {'lambda_l2': 5.5, 'lambda_l1': 6.7, 'subsample': 0.4, 'colsample_bytree': 0.2, 'max_bins': 20, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.10269205896945706) ('test', 0.34401325716577813)


[I 2024-01-08 10:23:15,223] Trial 16 finished with value: 0.3393186158367976 and parameters: {'lambda_l2': 1.7000000000000002, 'lambda_l1': 8.1, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.2, 'max_bins': 40, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09322860202566945) ('test', 0.3393186158367976)


[I 2024-01-08 10:23:18,466] Trial 17 finished with value: 0.32648712627140475 and parameters: {'lambda_l2': 6.1000000000000005, 'lambda_l1': 10.0, 'subsample': 0.5, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 10}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.17243519920428674) ('test', 0.32648712627140475)


[I 2024-01-08 10:23:22,968] Trial 18 finished with value: 0.34867929985327617 and parameters: {'lambda_l2': 2.8000000000000003, 'lambda_l1': 6.1000000000000005, 'subsample': 0.9, 'colsample_bytree': 0.2, 'max_bins': 20, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09114427200690149) ('test', 0.34867929985327617)


[I 2024-01-08 10:23:27,097] Trial 19 finished with value: 0.33586439125219125 and parameters: {'lambda_l2': 9.200000000000001, 'lambda_l1': 5.800000000000001, 'subsample': 0.6, 'colsample_bytree': 0.2, 'max_bins': 60, 'num_leaves': 14}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.09752748540614706) ('test', 0.33586439125219125)


[I 2024-01-08 10:23:31,725] Trial 20 finished with value: 0.3506096965311317 and parameters: {'lambda_l2': 1.0, 'lambda_l1': 5.0, 'subsample': 0.2, 'colsample_bytree': 0.2, 'max_bins': 20, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.07629396360591845) ('test', 0.3506096965311317)


[I 2024-01-08 10:23:36,488] Trial 21 finished with value: 0.32040957995912217 and parameters: {'lambda_l2': 0.1, 'lambda_l1': 8.5, 'subsample': 0.2, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.12375125756413782) ('test', 0.32040957995912217)


[I 2024-01-08 10:23:41,252] Trial 22 finished with value: 0.32029630333636266 and parameters: {'lambda_l2': 1.1, 'lambda_l1': 8.4, 'subsample': 0.1, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.12548287962267377) ('test', 0.32029630333636266)


[I 2024-01-08 10:23:45,763] Trial 23 finished with value: 0.34656400294433615 and parameters: {'lambda_l2': 0.1, 'lambda_l1': 9.3, 'subsample': 0.2, 'colsample_bytree': 0.2, 'max_bins': 20, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.10965525921797259) ('test', 0.34656400294433615)


[I 2024-01-08 10:23:50,309] Trial 24 finished with value: 0.33896106045185165 and parameters: {'lambda_l2': 1.0, 'lambda_l1': 7.5, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.2, 'max_bins': 40, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.08658439627465853) ('test', 0.33896106045185165)


[I 2024-01-08 10:23:54,752] Trial 25 finished with value: 0.3208698033383522 and parameters: {'lambda_l2': 2.7, 'lambda_l1': 8.9, 'subsample': 0.4, 'colsample_bytree': 0.2, 'max_bins': 10, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.13384531255796867) ('test', 0.3208698033383522)


[I 2024-01-08 10:23:59,136] Trial 26 finished with value: 0.34001376988609283 and parameters: {'lambda_l2': 4.1000000000000005, 'lambda_l1': 10.0, 'subsample': 0.2, 'colsample_bytree': 0.2, 'max_bins': 40, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.11065699671497106) ('test', 0.34001376988609283)


[I 2024-01-08 10:24:03,856] Trial 27 finished with value: 0.32974585759421526 and parameters: {'lambda_l2': 1.5, 'lambda_l1': 4.1000000000000005, 'subsample': 0.1, 'colsample_bytree': 0.2, 'max_bins': 30, 'num_leaves': 18}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.06634025738799199) ('test', 0.32974585759421526)


[I 2024-01-08 10:24:09,569] Trial 28 finished with value: 0.35098896172516375 and parameters: {'lambda_l2': 0.0, 'lambda_l1': 1.9000000000000001, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.3, 'max_bins': 20, 'num_leaves': 20}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.034701405997721695) ('test', 0.35098896172516375)


[I 2024-01-08 10:24:13,804] Trial 29 finished with value: 0.3925913856799272 and parameters: {'lambda_l2': 0.7000000000000001, 'lambda_l1': 5.4, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.1, 'max_bins': 10, 'num_leaves': 16}. Best is trial 2 with value: 0.31708144172266234.


('train', 0.17222810188610146) ('test', 0.3925913856799272)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,value
params_colsample_bytree,params_lambda_l1,params_lambda_l2,params_max_bins,params_num_leaves,params_subsample,Unnamed: 6_level_1
0.2,5.0,0.1,10,20,0.7,0.317081
0.2,7.7,1.6,10,18,0.1,0.318306
0.2,8.7,0.1,10,20,0.2,0.319386
0.3,3.3,3.2,100,12,0.6,0.319806
0.2,8.4,1.1,10,20,0.1,0.320296
0.2,8.5,0.1,10,20,0.2,0.32041
0.3,3.5,7.7,100,16,0.5,0.320488
0.2,8.9,2.7,10,18,0.4,0.32087
0.2,9.4,0.1,10,20,0.1,0.321351
0.2,7.2,2.2,30,18,0.1,0.325923


In [10]:
xgb_params = {
    'learning_rate': 0.1,
    'min_child_weight': xgb_study.best_params['min_child_weight'],
    'reg_lambda': xgb_study.best_params['reg_lambda'],
    'reg_alpha': xgb_study.best_params['reg_alpha'],
    'max_depth': xgb_study.best_params['max_depth'],
    'max_delta_step': 4,
    'subsample': xgb_study.best_params['subsample'],
    'colsample_bytree': xgb_study.best_params['colsample_bytree'],
    'disable_default_eval_metric': True, 
    'seed': 5,
}

lgb_params = {
    'learning_rate': 0.1,
    'lambda_l2': lgb_study.best_params['lambda_l2'],
    'lambda_l1': lgb_study.best_params['lambda_l1'],
    'subsample': lgb_study.best_params['subsample'],
    'colsample_bytree': lgb_study.best_params['colsample_bytree'],
    'max_bins': lgb_study.best_params['max_bins'],
    'num_leaves': lgb_study.best_params['num_leaves'],
    'random_seed': 5,
    'first_metric_only': True,
    'verbosity': -1,
} 

cols = X.columns.tolist()

def ensemble_objective(trial):
    C_meta = trial.suggest_loguniform('C_meta', 1e-2, 1)

    kf = StratifiedKFold(10, shuffle=True, random_state=30)
    ensemble_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        sampler = RandomOverSampler(random_state=3)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        # Train XGBoost Model
        dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
        dtest_xgb = xgb.DMatrix(X_test, y_test, feature_names=cols, enable_categorical=True)
        xgb_model = xgb.train(params=xgb_params,
                                  dtrain=dtrain_xgb,
                                  verbose_eval=False,
                                  obj=balancedlogloss_xgb,
                                  feval=balancedlogloss_eval_xgb,
                                  num_boost_round=100,)
        xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
        xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

        # Train LightGBM Model
        dtrain_lgb = lgb.Dataset(X_train, y_train)
        lgb_model = lgb.train(params=lgb_params,
                                train_set=dtrain_lgb,
                                fobj=balancedlogloss_lgb,
                                feval=balancedlogloss_eval_lgb,
                                num_boost_round=100,)
        lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
        lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

        # Combine predictions for Meta Model
        stacked_preds_train = np.column_stack((xgb_train_preds, lgb_train_preds))
        stacked_preds_test = np.column_stack((xgb_test_preds, lgb_test_preds,))

        # Train Meta Model
        meta_model = LogisticRegression(C=C_meta)
        meta_model.fit(stacked_preds_train, y_train)
        ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

        ensemble_score = score(ensemble_preds, y_test)
        ensemble_scores.append(ensemble_score)

    return np.mean(ensemble_scores)

# Run Optuna study
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
ensemble_study = optuna.create_study(direction='minimize', pruner=pruner)
ensemble_study.optimize(ensemble_objective, n_trials=20)

ensemble_trials_dataframe = ensemble_study.trials_dataframe()
get_trials_df(ensemble_trials_dataframe)

[I 2024-01-08 10:24:13,881] A new study created in memory with name: no-name-1d69bd9a-3780-402d-997c-a1e9dc34d8c1
[I 2024-01-08 10:24:22,326] Trial 0 finished with value: 0.3170989704554614 and parameters: {'C_meta': 0.5973155503529188}. Best is trial 0 with value: 0.3170989704554614.
[I 2024-01-08 10:24:30,717] Trial 1 finished with value: 0.36378137603987376 and parameters: {'C_meta': 0.01996887627507986}. Best is trial 0 with value: 0.3170989704554614.
[I 2024-01-08 10:24:39,236] Trial 2 finished with value: 0.31540000753338154 and parameters: {'C_meta': 0.04248094954506587}. Best is trial 2 with value: 0.31540000753338154.
[I 2024-01-08 10:24:48,000] Trial 3 finished with value: 0.29196406629893523 and parameters: {'C_meta': 0.21826611339351018}. Best is trial 3 with value: 0.29196406629893523.
[I 2024-01-08 10:24:56,477] Trial 4 finished with value: 0.31617518043378245 and parameters: {'C_meta': 0.581722352256463}. Best is trial 3 with value: 0.29196406629893523.
[I 2024-01-08 10:

Unnamed: 0_level_0,value
params_C_meta,Unnamed: 1_level_1
0.135083,0.289045
0.148684,0.289069
0.127568,0.289173
0.166799,0.289478
0.111713,0.289892
0.182437,0.290075
0.218266,0.291964
0.092064,0.292048
0.2565,0.294415
0.075053,0.295837


In [11]:
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test['EJ'].replace(['A', 'B'], [1, 0], inplace=True)
test_ej = np.array(test['EJ']).reshape(-1, 1)

X_test = scaler.transform(test[x_numerical_columns])
X_test = np.concatenate((X_test, test_ej), axis=1)
X_test = imputer.transform(X_test)

X_test = pd.DataFrame(X_test, columns=x_cols)
X_test['EJ'] = X_test['EJ'].astype('int').astype('category')

X_train = X
y_train = y

sampler = RandomOverSampler(random_state=3)
X_train, y_train = sampler.fit_resample(X_train, y_train)

cols = X_train.columns.tolist()

dtrain_xgb = xgb.DMatrix(X_train, y_train, feature_names=cols, enable_categorical=True)
dtest_xgb = xgb.DMatrix(X_test, feature_names=cols, enable_categorical=True)

xgb_model = xgb.train(params=xgb_params,
                        dtrain=dtrain_xgb,
                        obj=balancedlogloss_xgb,
                        verbose_eval=10,
                        feval=balancedlogloss_eval_xgb,
                        num_boost_round=100,
                        )

xgb_train_preds = expit(xgb_model.predict(dtrain_xgb, output_margin=True))
xgb_test_preds = expit(xgb_model.predict(dtest_xgb, output_margin=True))

dtrain_lgb = lgb.Dataset(X_train, y_train)

lgb_model = lgb.train(
    params=lgb_params,
                      train_set=dtrain_lgb,
                      fobj=balancedlogloss_lgb,
                      feval=balancedlogloss_eval_lgb,
                      num_boost_round=100,
                      verbose_eval=False
                     )

lgb_train_preds = expit(lgb_model.predict(X_train, raw_score=True))
lgb_test_preds = expit(lgb_model.predict(X_test, raw_score=True))

stacked_preds_train = np.column_stack((xgb_train_preds, lgb_train_preds))
stacked_preds_test = np.column_stack((xgb_test_preds, lgb_test_preds))

meta_model = LogisticRegression(C=ensemble_study.best_params['C_meta'])
meta_model.fit(stacked_preds_train, y_train)
ensemble_preds = meta_model.predict_proba(stacked_preds_test)[:, 1]

preds_1 = ensemble_preds
preds_0 = 1 - ensemble_preds

submission = pd.DataFrame(index=test.index, columns=sample_submission.columns)
submission['Id'] = test['Id']
submission['class_0'] = preds_0
submission['class_1'] = preds_1

submission.to_csv('submission.csv', index=False)

In [12]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.238177,0.761823
1,010ebe33f668,0.238177,0.761823
2,02fa521e1838,0.238177,0.761823
3,040e15f562a2,0.238177,0.761823
4,046e85c7cc7f,0.238177,0.761823
