In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('retina')

# Display configuration: a limit of None lifts pandas' truncation, so DataFrames
# are rendered with all of their columns and all of their rows.
pd.set_option('display.max_columns', None, 'display.max_rows', None)

# Globally silence every warning for the remainder of the session.
import warnings
warnings.filterwarnings(action='ignore')

import lightgbm as lgb
import optuna
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from imblearn.over_sampling import RandomOverSampler

from typing import Tuple
from scipy.special import expit
import xgboost as xgb

In [None]:
# Load the competition training data.
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')

# Encode the categorical EJ column as 1 ('A') / 0 ('B').  The result is assigned
# back to the frame: calling replace(..., inplace=True) on the selected column is a
# chained in-place update, which acts on a temporary under pandas Copy-on-Write and
# would leave `train` unchanged.
train['EJ'] = train['EJ'].map({'A': 1, 'B': 0})

# One-hot encode EJ against a fixed category list so the result always has exactly
# two columns in the order [EJ == 0, EJ == 1], regardless of which values occur.
ej = np.array(pd.get_dummies(pd.Categorical(train['EJ'], categories=[0, 1])))

# Used later only for the column layout of the submission file.
sample_submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')

# Binary target vector.
y = np.array(train['Class'])

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to scale: everything except the identifier, the target and the
# categorical EJ column.
x_numerical_columns = train.columns.drop(['Id', 'Class', 'EJ'])

# Learn the per-column min/max on the training data and rescale it in one step.
# NOTE: despite its name, x_standardized holds min-max scaled values.
scaler = MinMaxScaler()
x_standardized = scaler.fit_transform(train[x_numerical_columns])

In [None]:
from sklearn.impute import KNNImputer

# Fill missing feature values with a default-configured KNNImputer, fitting and
# transforming the scaled training matrix in a single call.
knn = KNNImputer()
x_imputed_standardized = knn.fit_transform(x_standardized)

# Final design matrix: imputed numeric features followed by the EJ dummy columns.
X = np.concatenate((x_imputed_standardized, ej), axis=1)

In [None]:
def balancedlogloss(predt: np.ndarray, dtrain: "xgb.DMatrix") -> Tuple[np.ndarray, np.ndarray]:
    """Custom XGBoost objective: halved binary log loss evaluated on raw margins.

    XGBoost hands a custom objective the *raw margin* ``z`` and expects the first
    and second derivative of the loss with respect to that margin.  With
    ``p = sigmoid(z)`` and ``L = -1/2 * (y*log(p) + (1-y)*log(1-p))`` the chain
    rule gives::

        dL/dz   = 1/2 * (p - y)
        d2L/dz2 = 1/2 * p * (1 - p)

    The previous implementation returned the derivatives with respect to ``p``
    instead, and divided by ``p`` and ``1 - p``, which produced NaN/inf as soon as
    a margin saturated the sigmoid (``p == 1`` or ``p == 0``).  The margin-space
    form needs no division and is finite for every input.

    No per-class weighting is applied here; class balance is expected to be
    handled upstream (this notebook oversamples the minority class before
    building the training DMatrix).

    Parameters
    ----------
    predt : np.ndarray
        Raw margin predictions for the rows of ``dtrain``.
    dtrain : xgb.DMatrix
        Training matrix; only ``get_label()`` is used.

    Returns
    -------
    (grad, hess) : tuple of np.ndarray
        Per-row gradient and hessian of the loss with respect to the margin.
    """
    y = dtrain.get_label()

    # Work in float64 so the sigmoid does not saturate earlier than necessary.
    p = expit(np.asarray(predt, dtype=np.float64))

    grad = 0.5 * (p - y)
    # Floor the hessian so fully saturated rows keep a strictly positive curvature.
    hess = 0.5 * np.maximum(p * (1.0 - p), 1e-16)
    return grad, hess

def scoring(y, p):
    """Balanced log loss of raw-margin predictions against binary labels.

    The margins are mapped to probabilities with the logistic function and the
    probabilities are clipped to ``[1e-15, 1 - 1e-15]`` on *both* sides.  Guarding
    only ``p == 0`` (as before) left ``p == 1`` unprotected: ``log(1 - p)`` became
    ``-inf`` and ``0 * -inf`` turned the whole score into NaN whenever a margin
    saturated the sigmoid.

    Parameters
    ----------
    y : array-like
        True labels, 0 or 1.
    p : array-like
        Raw margin predictions (the logistic function is applied here).

    Returns
    -------
    float
        Mean of the per-class average negative log-likelihoods.

    Raises
    ------
    ZeroDivisionError
        If either class is absent from ``y``.
    """
    eps = 1e-15
    y = np.asarray(y)

    # Cast to float64 first: in float32, 1 - 1e-15 rounds to 1.0 and the upper
    # clip bound would have no effect.
    prob = np.clip(expit(np.asarray(p, dtype=np.float64)), eps, 1.0 - eps)

    n0 = int(np.count_nonzero(y == 0))
    n1 = int(np.count_nonzero(y == 1))
    # Plain Python division keeps the ZeroDivisionError for a missing class.
    w0 = 1.0 / n0
    w1 = 1.0 / n1

    loss0 = -w0 * np.sum((1 - y) * np.log(1.0 - prob))
    loss1 = -w1 * np.sum(y * np.log(prob))
    return (loss0 + loss1) / 2

def balancedlogloss_eval(predt: np.ndarray, dtrain: "xgb.DMatrix") -> Tuple[str, float]:
    """Custom XGBoost evaluation metric: balanced log loss on raw margins.

    The margins are converted to probabilities with the logistic function and
    clipped to ``[1e-15, 1 - 1e-15]`` on both sides.  Guarding only ``p == 0``
    let a saturated ``p == 1`` produce ``log(0)`` and hence inf/NaN metric
    values, which also breaks early stopping that monitors this metric.

    Parameters
    ----------
    predt : np.ndarray
        Raw margin predictions for the rows of ``dtrain``.
    dtrain : xgb.DMatrix
        Evaluated matrix; only ``get_label()`` is used.

    Returns
    -------
    (name, value) : tuple of (str, float)
        The metric name ``'balanced_logloss'`` and its value (lower is better).

    Raises
    ------
    ZeroDivisionError
        If either class is absent from the labels of ``dtrain``.
    """
    eps = 1e-15
    y = dtrain.get_label()

    # Cast to float64 first: in float32, 1 - 1e-15 rounds to 1.0 and the upper
    # clip bound would have no effect.
    p = np.clip(expit(np.asarray(predt, dtype=np.float64)), eps, 1.0 - eps)

    n0 = int(np.count_nonzero(y == 0))
    n1 = int(np.count_nonzero(y == 1))
    # Plain Python division keeps the ZeroDivisionError for a missing class.
    w0 = 1.0 / n0
    w1 = 1.0 / n1

    loss0 = -w0 * np.sum((1 - y) * np.log(1.0 - p))
    loss1 = -w1 * np.sum(y * np.log(p))
    return 'balanced_logloss', float((loss0 + loss1) / 2)

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold balanced log loss of an XGBoost model.

    Samples one hyper-parameter set from ``trial``, trains a booster with the
    custom ``balancedlogloss`` objective on each oversampled training fold and
    scores its raw-margin predictions on the untouched validation fold with
    ``scoring``.

    The folds are stratified and generated from a fixed seed, so that
    * every validation fold contains both classes (``scoring`` divides by the
      per-class counts), and
    * all trials are evaluated on the same splits, which makes their scores
      directly comparable instead of differing by split noise.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyper-parameters.

    Returns
    -------
    float
        Mean balanced log loss over the folds (lower is better).
    """
    optimized_param = {'learning_rate': trial.suggest_float('learning_rate', 1e-3, 2, step=0.004),
                       'gamma': trial.suggest_float('gamma', 1e-3, 2.0, step=0.005),
                       'reg_lambda': trial.suggest_float('reg_lambda', 1, 100, step=10),
                       # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1e-3, 10),
                       'max_depth': trial.suggest_int('max_depth', 2, 10),
                       'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 0.95, step=0.1),
                       'max_delta_step': trial.suggest_int('max_delta_step', 1, 5),
                        }

    # Fixed seed shared by the fold generator and the per-fold oversamplers.
    cv_seed = 0

    # Perform stratified 10-fold cross-validation.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=cv_seed)
    fold_scores = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only; the validation fold stays untouched.
        sampler = RandomOverSampler(random_state=cv_seed + fold)
        X_re, y_re = sampler.fit_resample(X_train, y_train)

        # Train an XGBoost model with the custom objective.
        train_set = xgb.DMatrix(X_re, y_re)
        clf = xgb.train(params=optimized_param,
                        dtrain=train_set,
                        obj=balancedlogloss,
                        )

        # Raw margins on the validation fold; scoring() applies the sigmoid itself.
        preds = clf.predict(xgb.DMatrix(X_test), output_margin=True)

        # Balanced log loss of this fold.
        fold_scores.append(scoring(y=y_test, p=preds))

    return np.mean(fold_scores)


# Minimise the cross-validated balanced log loss over 100 Optuna trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# One row per trial: the sampled hyper-parameters plus the achieved score,
# ordered from best (lowest) to worst.
finished_trials = study.trials
optimization_results = pd.DataFrame([t.params for t in finished_trials])
optimization_results['score'] = [t.value for t in finished_trials]
optimization_results = optimization_results.sort_values(by='score')

optimization_results

In [None]:
# Start from the best hyper-parameters of the study and add fixed training
# settings: only the custom metric is reported, logging is silenced, seed pinned.
best_params = study.best_params
best_params['disable_default_eval_metric'] = True
best_params['verbosity'] = 0
best_params['seed'] = 6

scores = []

# Hold out the last 5% of the rows (no shuffling) as a fixed test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y,
                                                            test_size=0.05,
                                                            shuffle=False)
dtest = xgb.DMatrix(X_test, y_test)


for _ in range(10):

    # Stratify the small validation split so it always contains both classes;
    # balancedlogloss_eval divides by the per-class counts and would fail on a
    # split without positives.
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                      test_size=0.05,
                                                      shuffle=True,
                                                      stratify=y_train_val)

    # Oversample the training part only.
    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_val, y_val)

    # Early stopping monitors the last entry of `evals`, i.e. the validation split.
    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      evals=[(dtrain, 'dtrain'), (dval, 'dval')],
                      feval=balancedlogloss_eval,
                      verbose_eval=5,
                      early_stopping_rounds=5
                      )

    # Score the hold-out set once and reuse the value for logging.
    holdout_score = scoring(y=y_test, p=model.predict(dtest, output_margin=True))
    scores.append(holdout_score)

    print('balanced log loss: ' + str(holdout_score))
    print('\n')

print(np.mean(scores))

In [None]:
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

# Encode EJ as 1 ('A') / 0 ('B').  Assign back instead of using a chained
# replace(..., inplace=True), which does not modify `test` under pandas
# Copy-on-Write.
test['EJ'] = test['EJ'].map({'A': 1, 'B': 0})

# One-hot encode against a fixed category list.  A plain get_dummies() only emits
# columns for values that actually occur, so a test file containing a single EJ
# value would yield one column and a feature-count mismatch with the training
# matrix.  Column order is [EJ == 0, EJ == 1], as for the training data.
test_ej = np.array(pd.get_dummies(pd.Categorical(test['EJ'], categories=[0, 1])))

# Apply the same preprocessing as for training: scale, then KNN-impute with the
# already fitted transformers (the imputation step was previously skipped here).
x_test_scaled = scaler.transform(test[x_numerical_columns])
x_test_imputed = knn.transform(x_test_scaled)

X_test = np.append(x_test_imputed, test_ej, axis=1)
d_test = xgb.DMatrix(X_test)

# Collect one prediction vector per model and combine them once after the loop
# instead of growing a DataFrame inside it.
model_predictions = []

for _ in range(10):

    # Stratify so the validation split used for early stopping always contains
    # both classes; balancedlogloss_eval divides by the per-class counts.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.05, shuffle=True, stratify=y)
    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_val, y_val)

    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      evals=[(dtrain, 'dtrain'), (dval, 'dval')],
                      feval=balancedlogloss_eval,
                      verbose_eval=5,
                      early_stopping_rounds=5
                      )

    # Request raw margins explicitly (as in the evaluation code) before applying
    # the sigmoid, so the conversion to probabilities happens exactly once.
    p = expit(model.predict(d_test, output_margin=True))
    model_predictions.append(pd.Series(p))
    print('\n')

# One column per model; the submitted probability is the per-row average.
preds = pd.concat(model_predictions, axis=1)
pred_1 = preds.mean(axis=1)
pred_0 = 1 - pred_1

submission = pd.DataFrame(index=test.index, columns=sample_submission.columns)
submission['Id'] = test['Id']
submission['class_0'] = pred_0
submission['class_1'] = pred_1

submission.to_csv('submission.csv', index=False)