In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import optuna
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from imblearn.over_sampling import RandomOverSampler

from typing import Tuple
from scipy.special import expit
import xgboost as xgb

In [2]:
train = pd.read_csv('train.csv')

train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(pd.get_dummies(train['EJ']))

sample_submission = pd.read_csv('sample_submission.csv')

y = np.array(train['Class'])

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

x_numerical_columns = train.drop(columns=['Id', 'Class', 'EJ']).columns

scaler.fit(train[x_numerical_columns])
x_standardized = scaler.transform(train[x_numerical_columns])

In [4]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(x_standardized)
x_imputed_standardized = knn.transform(x_standardized)

X = np.append(x_imputed_standardized, ej, axis=1)

In [5]:
def balancedlogloss(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y==0])
    n1 = len(y[y==1])

    p = expit(predt)

    p[p==0] = 1e-15

    grad = 1/2*((1-y)/(1-p)-y/p)
    hess = 1/2*((1-y)/((1-p)**2)+y/(p**2))
    return grad, hess

def scoring(y, p):

    p = expit(p)

    p[p==0] = 1e-15

    n0 = len(y[y==0])
    n1 = len(y[y==1])
    
    return (-1/n0*(sum((1-y)*np.log(1-p)))-1/n1*(sum(y*np.log(p))))/2

def balancedlogloss_eval(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y==0])
    n1 = len(y[y==1])
    p = expit(predt)

    p[p==0] = 1e-15

    return 'balanced_logloss', (-1/n0*(sum((1-y)*np.log(1-p)))-1/n1*(sum(y*np.log(p))))/2

In [9]:
def objective(trial):
    optimized_param = {'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, step=0.004),
                       'gamma': trial.suggest_float('gamma', 1e-3, 2.0, step=0.005),
                       'reg_lambda': trial.suggest_float('reg_lambda', 1, 100, step=10),
                       # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1e-3, 10),
                       'max_depth': trial.suggest_int('max_depth', 2, 10),
                       'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 0.95, step=0.1),
                       'max_delta_step': trial.suggest_int('max_delta_step', 1, 5),
                        }
    
    # Perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True)
    mean_balanced_logloss_score = []

    for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        sampler = RandomOverSampler()
        X_re, y_re = sampler.fit_resample(X_train, y_train)
        
        # Train a XGBoost model
        train_set = xgb.DMatrix(X_re, y_re)
        test_set = xgb.DMatrix(X_test, y_test)
        
        clf = xgb.train(params=optimized_param,
                        dtrain=train_set,
                        obj=balancedlogloss,
                        num_boost_round=40,
                        )

        # Make predictions on the test set
        preds = clf.predict(xgb.DMatrix(X_test), output_margin=True)
        
        # Calculate the balanced logloss score
        ll = scoring(y=y_test, p=preds)
        mean_balanced_logloss_score.append(ll)
    
    return np.mean(mean_balanced_logloss_score)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

optimization_results = pd.DataFrame([study.trials[i].params for i in range(len(study.trials))])
optimization_results['score'] = [study.trials[i].value for i in range(len(study.trials))]
optimization_results = optimization_results.sort_values(by='score')

optimization_results

[I 2023-06-26 12:53:25,439] A new study created in memory with name: no-name-e5987ef2-84cb-40ba-90f8-c32a1c95aca2
[I 2023-06-26 12:53:26,946] Trial 0 finished with value: 0.48704556233704227 and parameters: {'learning_rate': 0.089, 'gamma': 0.401, 'reg_lambda': 41.0, 'max_depth': 2, 'min_child_weight': 0.9, 'max_delta_step': 4}. Best is trial 0 with value: 0.48704556233704227.
[I 2023-06-26 12:53:29,129] Trial 1 finished with value: 0.6206910432256842 and parameters: {'learning_rate': 0.341, 'gamma': 0.20600000000000002, 'reg_lambda': 91.0, 'max_depth': 8, 'min_child_weight': 0.8, 'max_delta_step': 1}. Best is trial 0 with value: 0.48704556233704227.
[I 2023-06-26 12:53:31,324] Trial 2 finished with value: 0.5272619943253083 and parameters: {'learning_rate': 0.053000000000000005, 'gamma': 1.8459999999999999, 'reg_lambda': 61.0, 'max_depth': 7, 'min_child_weight': 0.7000000000000001, 'max_delta_step': 3}. Best is trial 0 with value: 0.48704556233704227.
[I 2023-06-26 12:53:32,547] Trial

Unnamed: 0,learning_rate,gamma,reg_lambda,max_depth,min_child_weight,max_delta_step,score
67,0.225,1.231,61.0,3,0.7,2,0.325556
99,0.249,1.066,81.0,3,0.9,1,0.326946
68,0.269,1.226,61.0,3,0.3,2,0.331085
43,0.173,1.726,11.0,3,0.8,5,0.338592
92,0.257,1.071,71.0,3,0.8,1,0.34315
77,0.253,0.901,61.0,3,0.9,3,0.346948
94,0.261,1.056,81.0,3,0.9,1,0.347305
15,0.329,1.656,81.0,2,0.3,3,0.349476
81,0.289,1.006,51.0,2,0.9,3,0.350045
23,0.165,1.161,71.0,3,0.4,1,0.352488


In [13]:
best_params = study.best_params
best_params['disable_default_eval_metric'] = True
best_params['verbosity']=0

scores = []

kf = KFold(10)

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_test, y_test)

    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      feval=balancedlogloss_eval, 
                      verbose_eval=5,
                      num_boost_round=50,
                    #   early_stopping_rounds=5
                      )
    score = scoring(y=y_test, p=model.predict(dval))
    scores = scores + [score]
  
print(np.mean(scores))

0.362118190389319


In [None]:
test = pd.read_csv('test.csv')
test['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

test_ej = np.array(pd.get_dummies(test['EJ']))

x_test_scaled = scaler.transform(test[x_numerical_columns])

X_test = np.append(x_test_scaled, test_ej, axis=1)
d_test = xgb.DMatrix(X_test)

preds = pd.DataFrame(index=range(test.shape[0]))

for i in range(0, 10):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.05, shuffle=True)
    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_test, y_test)

    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      evals=[(dtrain, 'dtrain'), (dval, 'dval')],
                      feval=balancedlogloss_eval,
                      verbose_eval=5,
                      early_stopping_rounds=5
                      )

    p = expit(model.predict(d_test))
    p = pd.Series(p)

    preds = pd.concat([preds, p], axis=1)
    print('\n')

pred_1 = np.mean(preds, axis=1)
pred_0 = 1 - pred_1

# pred_0

submission = pd.DataFrame(index=test.index, columns=sample_submission.columns)
submission['Id'] = test['Id']
submission['class_0'] = pred_0
submission['class_1'] = pred_1

submission.to_csv('submission.csv', index=False)

[0]	dtrain-balanced_logloss:0.62793	dval-balanced_logloss:0.66931
[5]	dtrain-balanced_logloss:0.28599	dval-balanced_logloss:0.45725
[9]	dtrain-balanced_logloss:0.18724	dval-balanced_logloss:0.37280


[0]	dtrain-balanced_logloss:0.62590	dval-balanced_logloss:0.62661
[5]	dtrain-balanced_logloss:0.29293	dval-balanced_logloss:0.41340
[9]	dtrain-balanced_logloss:0.20325	dval-balanced_logloss:0.38609


[0]	dtrain-balanced_logloss:0.63278	dval-balanced_logloss:0.68131
[5]	dtrain-balanced_logloss:0.30967	dval-balanced_logloss:0.52930
[9]	dtrain-balanced_logloss:0.20884	dval-balanced_logloss:0.55227


[0]	dtrain-balanced_logloss:0.62753	dval-balanced_logloss:0.66600
[5]	dtrain-balanced_logloss:0.28829	dval-balanced_logloss:0.55568
[9]	dtrain-balanced_logloss:0.18854	dval-balanced_logloss:0.59923


[0]	dtrain-balanced_logloss:0.63162	dval-balanced_logloss:0.67883
[5]	dtrain-balanced_logloss:0.30896	dval-balanced_logloss:0.39287
[9]	dtrain-balanced_logloss:0.19603	dval-balanced_logloss:0.26420


