To do list:

Try greek information

In [49]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import optuna
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from imblearn.over_sampling import RandomOverSampler

from typing import Tuple
from scipy.special import expit
import xgboost as xgb

In [50]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(pd.get_dummies(train['EJ']))

sample_submission = pd.read_csv('sample_submission.csv')

y = np.array(train['Class'])

In [51]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

x_numerical_columns = train.drop(columns=['Id', 'Class', 'EJ']).columns

scaler.fit(train[x_numerical_columns])
x_standardized = scaler.transform(train[x_numerical_columns])


In [52]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(x_standardized)
x_imputed_standardized = knn.transform(x_standardized)

X = np.append(x_imputed_standardized, ej, axis=1)

In [56]:
def balancedlogloss(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y==0])
    n1 = len(y[y==1])

    p = expit(predt)

    p[p==0] = 1e-15

    grad = 1/2*((1-y)/(1-p)-y/p)
    hess = 1/2*((1-y)/((1-p)**2)+y/(p**2))
    return grad, hess

def scoring(y, p):

    p = expit(p)

    p[p==0] = 1e-15

    n0 = len(y[y==0])
    n1 = len(y[y==1])
    
    return (-1/n0*(sum((1-y)*np.log(1-p)))-1/n1*(sum(y*np.log(p))))/2

def balancedlogloss_eval(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y==0])
    n1 = len(y[y==1])
    p = expit(predt)

    p[p==0] = 1e-15

    return 'balanced_logloss', (-1/n0*(sum((1-y)*np.log(1-p)))-1/n1*(sum(y*np.log(p))))/2

In [None]:
def objective(trial):
    optimized_param = {'learning_rate': trial.suggest_float('learning_rate', 1e-3, 2, step=0.02),
                       'gamma': trial.suggest_float('gamma', 1e-3, 2.0, step=0.005),
                       'reg_lambda': trial.suggest_float('reg_lambda', 1, 100, step=10),
                       # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1e-3, 10),
                       'max_depth': trial.suggest_int('max_depth', 2, 10),
                       'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 0.95, step=0.1),
                       'max_delta_step': trial.suggest_int('max_delta_step', 1, 5),
                        }
    
    # Perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True)
    mean_balanced_logloss_score = []

    for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        sampler = RandomOverSampler()
        X_re, y_re = sampler.fit_resample(X_train, y_train)
        
        # Train a XGBoost model
        train_set = xgb.DMatrix(X_re, y_re)
        test_set = xgb.DMatrix(X_test, y_test)
        
        clf = xgb.train(params=optimized_param,
                        dtrain=train_set,
                        obj=balancedlogloss,
                        feval=balancedlogloss_eval,
                        )

        # Make predictions on the test set
        preds = clf.predict(xgb.DMatrix(X_test), output_margin=True)
        
        # Calculate the balanced logloss score
        ll = scoring(y=y_test, p=preds)
        mean_balanced_logloss_score.append(ll)
    
    return np.mean(mean_balanced_logloss_score)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

optimization_results = pd.DataFrame([study.trials[i].params for i in range(len(study.trials))])
optimization_results['score'] = [study.trials[i].value for i in range(len(study.trials))]
optimization_results = optimization_results.sort_values(by='score')

optimization_results

In [17]:
best_params = study.best_params
best_params['disable_default_eval_metric'] = True
best_params['verbosity']=0
best_params['seed']=6

scores = []

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, 
                                                            test_size=0.05, 
                                                            # random_state=20, 
                                                            shuffle=False)
dtest = xgb.DMatrix(X_test, y_test)


for i in range(0, 10):

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.05, shuffle=True)
    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_val, y_val)

    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      evals=[(dtrain, 'dtrain'), (dval, 'dval')],
                      feval=balancedlogloss_eval, 
                      verbose_eval=5,
                      early_stopping_rounds=5
                      )

    scores = scores + [scoring(y=y_test, p=model.predict(dtest, output_margin=True))]

    print('balanced log loss: ' + str(scoring(y=y_test, p=model.predict(dtest, output_margin=True))))
    print('\n')

print(np.mean(scores))

[0]	dtrain-balanced_logloss:0.62521	dval-balanced_logloss:0.60090
[5]	dtrain-balanced_logloss:0.28174	dval-balanced_logloss:0.23436
[9]	dtrain-balanced_logloss:0.17818	dval-balanced_logloss:0.14367
balanced log loss: 0.15190405318779604


[0]	dtrain-balanced_logloss:0.61934	dval-balanced_logloss:0.59994
[5]	dtrain-balanced_logloss:0.27003	dval-balanced_logloss:0.25566
[9]	dtrain-balanced_logloss:0.16164	dval-balanced_logloss:0.19656
balanced log loss: 0.14238052896135264


[0]	dtrain-balanced_logloss:0.61890	dval-balanced_logloss:0.63215
[5]	dtrain-balanced_logloss:0.25641	dval-balanced_logloss:0.55371
[9]	dtrain-balanced_logloss:0.16063	dval-balanced_logloss:0.58318
balanced log loss: 0.1480838910876108


[0]	dtrain-balanced_logloss:0.61239	dval-balanced_logloss:0.63664
[5]	dtrain-balanced_logloss:0.25605	dval-balanced_logloss:0.39846
[9]	dtrain-balanced_logloss:0.16264	dval-balanced_logloss:0.40761
balanced log loss: 0.1367014557167533


[0]	dtrain-balanced_logloss:0.61744	dval-balan

In [18]:
test = pd.read_csv('test.csv')
test['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

test_ej = np.array(pd.get_dummies(test['EJ']))

x_test_scaled = scaler.transform(test[x_numerical_columns])

X_test = np.append(x_test_scaled, test_ej, axis=1)
d_test = xgb.DMatrix(X_test)

preds = pd.DataFrame(index=range(test.shape[0]))

for i in range(0, 10):

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.05, shuffle=True)
    sampler_model = RandomOverSampler()
    X_re, y_re = sampler_model.fit_resample(X_train, y_train)

    dtrain = xgb.DMatrix(X_re, y_re)
    dval = xgb.DMatrix(X_val, y_val)

    model = xgb.train(params=best_params,
                      dtrain=dtrain,
                      obj=balancedlogloss,
                      evals=[(dtrain, 'dtrain'), (dval, 'dval')],
                      feval=balancedlogloss_eval,
                      verbose_eval=5,
                      early_stopping_rounds=5
                      )

    p = expit(model.predict(d_test))
    p = pd.Series(p)

    preds = pd.concat([preds, p], axis=1)
    print('\n')

pred_1 = np.mean(preds, axis=1)
pred_0 = 1 - pred_1

# pred_0

submission = pd.DataFrame(index=test.index, columns=sample_submission.columns)
submission['Id'] = test['Id']
submission['class_0'] = pred_0
submission['class_1'] = pred_1

submission.to_csv('submission.csv', index=False)

[0]	dtrain-balanced_logloss:0.61328	dval-balanced_logloss:0.62704
[5]	dtrain-balanced_logloss:0.25886	dval-balanced_logloss:0.32987
[9]	dtrain-balanced_logloss:0.16999	dval-balanced_logloss:0.23159


[0]	dtrain-balanced_logloss:0.61653	dval-balanced_logloss:0.61232
[5]	dtrain-balanced_logloss:0.27242	dval-balanced_logloss:0.29932
[9]	dtrain-balanced_logloss:0.18532	dval-balanced_logloss:0.16908


[0]	dtrain-balanced_logloss:0.61862	dval-balanced_logloss:0.64976
[5]	dtrain-balanced_logloss:0.26078	dval-balanced_logloss:0.38419
[9]	dtrain-balanced_logloss:0.18083	dval-balanced_logloss:0.26106


[0]	dtrain-balanced_logloss:0.61955	dval-balanced_logloss:0.60142
[5]	dtrain-balanced_logloss:0.26328	dval-balanced_logloss:0.28250
[9]	dtrain-balanced_logloss:0.17536	dval-balanced_logloss:0.23096


[0]	dtrain-balanced_logloss:0.61040	dval-balanced_logloss:0.68194
[5]	dtrain-balanced_logloss:0.25809	dval-balanced_logloss:0.44859
[9]	dtrain-balanced_logloss:0.16985	dval-balanced_logloss:0.35369




In [19]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.881854,0.118146
1,010ebe33f668,0.881854,0.118146
2,02fa521e1838,0.881854,0.118146
3,040e15f562a2,0.881854,0.118146
4,046e85c7cc7f,0.881854,0.118146
