In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import optuna
import ray
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.utils import compute_class_weight, class_weight
from sklearn.manifold import Isomap
from typing import Tuple
from scipy.special import expit
import xgboost as xgb

In [2]:
train = pd.read_csv('train.csv')
train['EJ'].replace(['A', 'B'], [1, 0], inplace=True)

ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

y = np.array(train['Class'])

greeks = pd.read_csv('greeks.csv')
greeks['Epsilon'].replace(['Unknown'], np.nan, inplace=True)

train_greeks = pd.merge(train, greeks, on='Id')
greek_columns = greeks.columns.drop(['Id', 'Epsilon', 'Alpha']).tolist()

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scaler = MinMaxScaler()
scaler = StandardScaler()

x_numerical_columns = train.drop(columns=["Id", "Class", "EJ"]).columns.tolist()
x_categorical_columns = ["EJ"]
x_cols = x_numerical_columns + x_categorical_columns

scaler.fit(train_greeks[x_numerical_columns])
x_standardized = scaler.transform(train_greeks[x_numerical_columns])

In [4]:
from sklearn.impute import KNNImputer

knn = KNNImputer()
knn.fit(x_standardized)
x_imputed_standardized = knn.transform(x_standardized)

X = np.concatenate((x_imputed_standardized, ej), axis=1)

In [5]:
def to_df(arr):
    cols = x_numerical_columns + x_categorical_columns
    df = pd.DataFrame(arr, columns=cols)
    df[x_categorical_columns] = df[x_categorical_columns].astype("category")
    return df

In [6]:
def balancedlogloss(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    p = expit(predt)
    p[p == 0] = 1e-15

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def balancedlogloss_eval(
    predt: np.ndarray, dtrain: lgb.Dataset
) -> Tuple[np.ndarray, np.ndarray]:
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])
    p = expit(predt)

    p[p == 0] = 1e-15

    return (
        "balanced_logloss",
        (-1 / n0 * (sum((1 - y) * np.log(1 - p))) - 1 / n1 * (sum(y * np.log(p)))) / 2,
        True
    )

In [14]:
def cv(X_kf, y_kf, cv_param, num_boost_round=100):
    
    train_evals = pd.DataFrame()
    test_evals = pd.DataFrame()

    kf = StratifiedKFold(10, random_state=12, shuffle=True)
    k = 0

    for train_index, test_index in kf.split(X_kf, y_kf):
        X_train = pd.DataFrame(X_kf[train_index], columns=x_cols)
        X_test = pd.DataFrame(X_kf[test_index], columns=x_cols)
        y_train = y_kf[train_index]
        y_test = y_kf[test_index]

        # sampler = RandomOverSampler()
        sampler = SMOTE()
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        n_components = 10
        dr_cols = ['Isomap'+str(i) for i in range(n_components)]
        dr = Isomap(n_components=n_components)
        dr.fit(X_train)
        X_train_dr = pd.DataFrame(dr.transform(X_train), columns=dr_cols)
        X_test_dr = pd.DataFrame(dr.transform(X_test), columns=dr_cols)

        X_train = pd.concat([X_train, X_train_dr], axis=1)
        X_test = pd.concat([X_test, X_test_dr], axis=1)
        cols = X_train.columns.tolist()

        train_set = lgb.Dataset(X_train, y_train, feature_name=cols)
        test_set = lgb.Dataset(X_test, y_test, feature_name=cols)

        evals = {}
        clf = lgb.train(
                        params=cv_param,
                        train_set=train_set,
                        categorical_feature=['EJ'],
                        valid_sets=[train_set, test_set],
                        verbose_eval=False,
                        fobj=balancedlogloss,
                        feval=balancedlogloss_eval,
                        num_boost_round=num_boost_round,
                        evals_result=evals
                        )

        train_evals[str(k)] = evals["training"]["balanced_logloss"]
        test_evals[str(k)] = evals["valid_1"]["balanced_logloss"]
        k = k + 1

    eval_df = pd.concat([train_evals.mean(axis=1, skipna=False), test_evals.mean(axis=1, skipna=False)], axis=1)
    eval_df.columns = ['train', 'test']
    return eval_df

cv_params = {'learning_rate': 0.1,
            'num_leaves': 10,
            #  'lambda_l1': i,
                # 'lambda_l2': 1,
            # 'max_depth': 100,
            'verbosity': -1
            }
cv_results = cv(X, y, cv_params, num_boost_round=400)

print(cv_results['test'].min())
# cv_results.iloc[range(0, len(cv_results), 50)]


0.3788504865670999


In [8]:
from itertools import product
from sklearn.model_selection import ParameterGrid

param_space = {
    'num_leaves': [10, 2500],
    'max_depth': [5, 80, 100],
    'lambda_l1': [0.02, 0.5, 10],
    'lambda_l2': [30], 
    'verbosity': [-1]
}

grid = ParameterGrid(param_space)

for params_grid in grid:
    print(params_grid)

    cv_results = cv(X, y, params_grid, num_boost_round=400)
    print(cv_results['test'].min())

{'lambda_l1': 0.02, 'lambda_l2': 30, 'max_depth': 5, 'num_leaves': 10, 'verbosity': -1}
0.4173228760123126
{'lambda_l1': 0.02, 'lambda_l2': 30, 'max_depth': 5, 'num_leaves': 2500, 'verbosity': -1}


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
""" test = pd.read_csv("test.csv")
test["EJ"].replace(["A", "B"], [1, 0], inplace=True)
test_ej = np.array(test["EJ"]).reshape(-1, 1)

x_test_scaled = scaler.transform(test[x_numerical_columns])

X_test = np.append(x_test_scaled, test_ej, axis=1) """