In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import optuna
import ray
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from sklearn.utils import compute_class_weight, class_weight
from sklearn.manifold import Isomap
from typing import Tuple
from scipy.special import expit
import xgboost as xgb

In [None]:
train = pd.read_csv('train.csv')
# Encode the two-level EJ flag as 1 ('A') / 0 ('B'). The result is assigned
# back to the frame: calling an in-place method on a selected column is a
# chained operation that does not update `train` when pandas runs with
# Copy-on-Write. Any value other than 'A'/'B' becomes NaN.
train['EJ'] = train['EJ'].map({'A': 1, 'B': 0})

# EJ as a column vector so it can be concatenated to the feature matrix.
ej = np.array(train['EJ']).reshape(-1, 1)

sample_submission = pd.read_csv('sample_submission.csv')

# Binary target.
y = np.array(train['Class'])

greeks = pd.read_csv('greeks.csv')
# 'Unknown' is the placeholder for a missing Epsilon; store it as NaN.
greeks['Epsilon'] = greeks['Epsilon'].replace('Unknown', np.nan)

# One row per training Id with the greek metadata attached.
train_greeks = pd.merge(train, greeks, on='Id')
# Greek columns used downstream: everything except Id, Epsilon and Alpha.
greek_columns = greeks.columns.drop(['Id', 'Epsilon', 'Alpha']).tolist()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Min-max scaling is the active choice; the StandardScaler line below is the
# alternative that can be swapped in.
scaler = MinMaxScaler()
# scaler = StandardScaler()

# Numerical features: every training column except the id, the target and EJ.
x_numerical_columns = train.columns.drop(["Id", "Class", "EJ"]).tolist()
# Categorical features: EJ followed by the selected greek columns.
x_categorical_columns = ["EJ", *greek_columns]
x_cols = [*x_numerical_columns, *x_categorical_columns]

# Fit the scaler on the merged training frame and scale it in one step.
x_standardized = scaler.fit_transform(train_greeks[x_numerical_columns])

In [None]:
from sklearn.impute import KNNImputer

# Fill missing values of the scaled numerical features with a KNN imputer.
knn = KNNImputer()
x_imputed_standardized = knn.fit_transform(x_standardized)

# Append a 3-component Isomap embedding of the imputed features.
isomap = Isomap(n_components=3)
x_isomap = isomap.fit_transform(x_imputed_standardized)
isomap_columns = ['isomap_' + str(i) for i in range(x_isomap.shape[1])]

# Register the embedding names exactly once, so re-running this cell does not
# keep appending duplicates to the list.
x_numerical_columns = [
    c for c in x_numerical_columns if c not in isomap_columns
] + isomap_columns
# x_cols was assembled before the embedding existed; rebuild it so the labels
# used by to_df() match the column layout of X followed by the greek columns.
x_cols = x_numerical_columns + x_categorical_columns

# Final layout of X: imputed numerical features | Isomap embedding | EJ.
X = np.concatenate((x_imputed_standardized, x_isomap, ej), axis=1)

In [None]:
def max_freq_to_one(series):
    """Binarise a series: 1 where it equals its most frequent value, else 0.

    The mask is computed once against the original values. Overwriting the
    series in two passes (first "not the mode -> 0", then "the mode -> 1")
    breaks when the most frequent value is itself 0, because the second pass
    then matches the zeros written by the first and every entry becomes 1.

    Args:
        series: pandas Series of labels. It is not modified.

    Returns:
        A new integer Series with the same index as ``series``. Missing
        values compare unequal to the most frequent value and map to 0.
    """
    # value_counts() orders by frequency; idxmax() picks the top label.
    max_freq_value = series.value_counts().idxmax()
    return (series == max_freq_value).astype("int64")


# Binarise every selected greek column (most frequent level -> 1, rest -> 0).
# The columns are read from the merged frame, i.e. the same frame the
# numerical part of X was built from, so the rows stay aligned with X even if
# greeks.csv is ordered differently from train.csv.
greeks_binary = pd.DataFrame(
    {column: max_freq_to_one(train_greeks[column]) for column in greek_columns}
)

greeks_binary = np.array(greeks_binary).astype("int")

# Feature matrix followed by one binary column per greek.
X_greeks_binary = np.append(X, greeks_binary, axis=1)

In [None]:
def to_df(arr):
    """Label ``arr`` with ``x_cols`` and mark the categorical columns.

    Args:
        arr: data accepted by ``pd.DataFrame``; its columns are labelled with
            the module-level ``x_cols`` list.

    Returns:
        A DataFrame in which every column listed in the module-level
        ``x_categorical_columns`` has been cast to the ``category`` dtype.
    """
    frame = pd.DataFrame(arr, columns=x_cols)
    for column in x_categorical_columns:
        frame[column] = frame[column].astype("category")
    return frame

In [None]:
def balancedlogloss(
    predt: np.ndarray, dtrain: "xgb.DMatrix"
) -> Tuple[np.ndarray, np.ndarray]:
    """Custom XGBoost objective built on the log loss of ``expit(predt)``.

    Args:
        predt: raw model outputs; they are mapped to probabilities with the
            logistic function.
        dtrain: object exposing ``get_label()`` that returns the 0/1 targets.

    Returns:
        ``(grad, hess)``: per-sample first and second derivatives of
        ``0.5 * logloss`` with respect to the probability ``p``.

    The probabilities are computed in float64 and clipped to
    ``[1e-15, 1 - 1e-15]``. Guarding only ``p == 0`` leaves ``1 / (1 - p)``
    unprotected, and a saturated prediction (``p == 1``, which float32 reaches
    for moderate margins) turns the gradient into inf or NaN.

    NOTE(review): the derivatives are taken with respect to ``p`` rather than
    the raw margin, and no class weighting is applied despite the name. The
    formulas are kept unchanged because the booster hyper-parameters in this
    notebook (e.g. ``min_child_weight``) depend on their scale -- confirm
    against the XGBoost custom-objective contract before reusing this loss.
    """
    y = dtrain.get_label()

    eps = 1e-15
    p = np.clip(expit(np.asarray(predt, dtype=np.float64)), eps, 1 - eps)

    grad = 1 / 2 * ((1 - y) / (1 - p) - y / p)
    hess = 1 / 2 * ((1 - y) / ((1 - p) ** 2) + y / (p**2))
    return grad, hess


def scoring(y, p):
    """Balanced log loss of raw scores ``p`` against 0/1 labels ``y``.

    ``p`` is passed through the logistic function first; probabilities that
    are exactly 0 or exactly 1 are moved to 1e-4 and 1 - 1e-4. The loss of
    each class is averaged over that class's sample count and the two class
    averages are then averaged.

    Args:
        y: numpy array of 0/1 labels.
        p: raw (pre-sigmoid) scores, same length as ``y``.

    Returns:
        The balanced log loss as a scalar.
    """
    prob = expit(p)

    # Only the exact boundary values are replaced.
    prob[prob == 0] = 1e-4
    prob[prob == 1] = 1 - (1e-4)

    neg_count = len(y[y == 0])
    pos_count = len(y[y == 1])

    neg_part = -1 / neg_count * (sum((1 - y) * np.log(1 - prob)))
    pos_part = 1 / pos_count * (sum(y * np.log(prob)))
    return (neg_part - pos_part) / 2


def balancedlogloss_eval(
    predt: np.ndarray, dtrain: "xgb.DMatrix"
) -> Tuple[str, float]:
    """Balanced log loss as an XGBoost evaluation metric.

    Args:
        predt: raw model outputs; they are mapped to probabilities with the
            logistic function.
        dtrain: object exposing ``get_label()`` that returns the 0/1 targets.

    Returns:
        ``("balanced_logloss", value)`` where ``value`` is the mean of the two
        per-class average log losses.

    Raises:
        ZeroDivisionError: if one of the two classes has no samples.

    The probabilities are computed in float64 and clipped to
    ``[1e-15, 1 - 1e-15]``. Guarding only ``p == 0`` leaves ``log(1 - p)``
    unprotected, so a saturated prediction (``p == 1``) makes the metric inf
    or NaN.
    """
    y = dtrain.get_label()
    n0 = len(y[y == 0])
    n1 = len(y[y == 1])

    eps = 1e-15
    p = np.clip(expit(np.asarray(predt, dtype=np.float64)), eps, 1 - eps)

    loss_0 = -1 / n0 * np.sum((1 - y) * np.log(1 - p))
    loss_1 = -1 / n1 * np.sum(y * np.log(p))
    return "balanced_logloss", float((loss_0 + loss_1) / 2)

In [None]:
# One LightGBM configuration per imputed greek column: imputation() uses
# params[i] for the i-th greek column appended after the base features.
params_0 = dict(
    num_leaves=10,
    learning_rate=0.1,
    boosting_type="gbdt",
    n_estimators=500,
    max_bin=100,
    seed=6,
)
params_1 = dict(
    num_leaves=30,
    learning_rate=0.24,
    boosting_type="goss",
    n_estimators=400,
    max_bin=200,
    seed=6,
)
params_2 = dict(
    num_leaves=15,
    learning_rate=0.1,
    boosting_type="goss",
    n_estimators=300,
    max_bin=3000,
    seed=6,
)
params = [params_0, params_1, params_2]

# Disabled experiment kept as a bare string literal; it is never executed.
""" kf = StratifiedKFold(10, shuffle=True, random_state=24)
for t in range(2, 3):
    l = X.shape[1] + t
    acc_scores = []

    for train_index, test_index in kf.split(X_greeks_binary, y):
        X_train, X_test = X_greeks_binary[train_index, :l], X_greeks_binary[test_index, :l]
        y_train, y_test = X_greeks_binary[train_index, l], X_greeks_binary[test_index, l]
        
        sampler = SMOTE()
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        param = params[t]
        imputer = lgb.LGBMClassifier(**param)
        imputer.fit(X_train, y_train)
        y_pred = imputer.predict(X_test)

        acc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
        acc_scores = acc_scores + \
            [balanced_accuracy_score(y_true=y_test, y_pred=y_pred)]

    print(np.mean(acc_scores)) """

In [None]:
def imputation(train_set, test_set):
    """Replace the three trailing greek columns of ``test_set`` with predictions.

    Both inputs are 2-D arrays whose first ``X.shape[1]`` columns are the base
    features and whose next three columns are the greek targets. For each
    target, the training rows are oversampled with SMOTE, a LightGBM
    classifier is fitted with ``params_0`` / ``params_1`` / ``params_2``
    respectively, and the target is predicted for the test rows. The balanced
    accuracy against the values currently stored in ``test_set`` is printed
    for each target.

    Args:
        train_set: array holding base features followed by the greek columns.
        test_set: array with the same column layout; it is not modified.

    Returns:
        A copy of ``test_set`` whose columns from ``X.shape[1]`` onward hold
        the predicted greek values.
    """
    n_base = X.shape[1]
    per_target_params = [params_0, params_1, params_2]

    X_test = test_set[:, :n_base]
    preds = np.zeros_like(test_set[:, :3])

    for i, target_params in enumerate(per_target_params):
        target_col = n_base + i

        X_train = train_set[:, :n_base].copy()
        y_train = train_set[:, target_col].copy()

        # Balance the target levels before fitting.
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)

        imputer = lgb.LGBMClassifier(**target_params)
        imputer.fit(X_train, y_train)
        y_pred = imputer.predict(X_test)
        preds[:, i] = y_pred

        y_test = test_set[:, target_col]
        print(balanced_accuracy_score(y_pred=y_pred, y_true=y_test))

    test_set_imputed = test_set.copy()
    test_set_imputed[:, n_base:] = preds
    return test_set_imputed

In [None]:
def cv(X_kf, y_kf, param, num_boost_round=100):
    """Run 10-fold stratified CV of the XGBoost model with imputed greeks.

    For every fold the training part is oversampled with SMOTENC, the greek
    columns of the held-out part are replaced by ``imputation()`` predictions,
    and a booster is trained with the custom ``balancedlogloss`` objective
    while ``balancedlogloss_eval`` is tracked on both parts.

    Args:
        X_kf: 2-D array laid out like ``X`` followed by the greek columns.
        y_kf: array of class labels aligned with ``X_kf``.
        param: XGBoost parameter dict handed to ``xgb.train``.
        num_boost_round: number of boosting rounds per fold.

    Returns:
        DataFrame with one row per boosting round and the columns ``train``
        and ``test`` holding the metric averaged over the folds.
    """
    kf = StratifiedKFold(10, random_state=12, shuffle=True)

    # EJ is the last column of X; it and every column after it (the greek
    # columns) are categorical and must not be interpolated by the sampler.
    categorical_idx = list(range(X.shape[1] - 1, X_kf.shape[1]))

    train_curves = {}
    test_curves = {}

    for k, (train_index, test_index) in enumerate(kf.split(X_kf, y_kf)):
        X_train = X_kf[train_index]
        X_test = X_kf[test_index]
        y_train = y_kf[train_index]
        y_test = y_kf[test_index]

        sampler = SMOTENC(categorical_features=categorical_idx)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

        # Overwrite the held-out greek columns with model predictions.
        X_test = imputation(X_train, X_test)

        train_set = xgb.DMatrix(to_df(X_train), y_train, enable_categorical=True)
        test_set = xgb.DMatrix(to_df(X_test), y_test, enable_categorical=True)

        evals = {}
        clf = xgb.train(
            # Use the caller's parameters, not the module-level ``params``.
            params=param,
            dtrain=train_set,
            obj=balancedlogloss,
            feval=balancedlogloss_eval,
            evals=[(train_set, "dtrain"), (test_set, "dtest")],
            evals_result=evals,
            verbose_eval=50,
            num_boost_round=num_boost_round,
        )

        train_curves[str(k)] = evals["dtrain"]["balanced_logloss"]
        test_curves[str(k)] = evals["dtest"]["balanced_logloss"]

    train_evals = pd.DataFrame(train_curves)
    test_evals = pd.DataFrame(test_curves)

    eval_df = pd.concat([train_evals.mean(axis=1), test_evals.mean(axis=1)], axis=1)
    eval_df.columns = ["train", "test"]
    return eval_df

In [None]:
# 10-fold stratified CV of the XGBoost model on the base feature matrix X.
kf = StratifiedKFold(10, random_state=42, shuffle=True)

params = {'learning_rate': 0.02,
          'subsample': 0.8,
          'lambda': 40,
          'min_child_weight': 60,
          'disable_default_eval_metric': True,
          }

# Column labels of X: numerical features (incl. the Isomap ones), then EJ.
feature_names = x_numerical_columns + ['EJ']
# EJ is the last column of X and the only categorical one.
ej_index = X.shape[1] - 1

train_curves = {}
test_curves = {}

for k, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    # Seeded so the fold results are reproducible across runs.
    sampler = SMOTENC(categorical_features=[ej_index], k_neighbors=3, random_state=42)
    # sampler = RandomOverSampler()
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    X_train = pd.DataFrame(X_train, columns=feature_names)
    X_train['EJ'] = X_train['EJ'].astype('category')

    X_test = pd.DataFrame(X_test, columns=feature_names)
    X_test['EJ'] = X_test['EJ'].astype('category')

    # The frames are already labelled and typed. Passing them through to_df()
    # would reindex them to x_cols, which describes the feature + greek
    # layout, and so alter the set of columns the model is trained on.
    train_set = xgb.DMatrix(X_train, y_train, enable_categorical=True)
    test_set = xgb.DMatrix(X_test, y_test, enable_categorical=True)

    evals = {}
    clf = xgb.train(
        params=params,
        dtrain=train_set,
        obj=balancedlogloss,
        feval=balancedlogloss_eval,
        evals=[(train_set, "dtrain"), (test_set, "dtest")],
        evals_result=evals,
        verbose_eval=100,
        num_boost_round=1000,
    )

    train_curves[str(k)] = evals["dtrain"]["balanced_logloss"]
    test_curves[str(k)] = evals["dtest"]["balanced_logloss"]

# One column per fold, one row per boosting round.
train_evals = pd.DataFrame(train_curves)
test_evals = pd.DataFrame(test_curves)

eval_df = pd.concat([train_evals.mean(axis=1, skipna=False), test_evals.mean(axis=1, skipna=False)], axis=1)
eval_df.columns = ['train', 'test']

eval_df

In [None]:
test = pd.read_csv("test.csv")
# Same EJ encoding as for the training data; assigned back so the frame is
# really updated (an in-place call on a selected column is a chained
# operation that is lost under pandas Copy-on-Write).
test["EJ"] = test["EJ"].map({"A": 1, "B": 0})
test_ej = np.array(test["EJ"]).reshape(-1, 1)

# x_numerical_columns also lists the derived Isomap columns, which do not
# exist in the raw test file; scale only the raw numerical columns.
raw_numerical_columns = [c for c in x_numerical_columns if not c.startswith("isomap_")]
x_test_scaled = scaler.transform(test[raw_numerical_columns])

# Apply the transformations fitted on the training data so the test matrix
# has the same layout as X: imputed numerical features | Isomap | EJ.
x_test_imputed = knn.transform(x_test_scaled)
x_test_isomap = isomap.transform(x_test_imputed)
X_test = np.concatenate((x_test_imputed, x_test_isomap, test_ej), axis=1)

# Placeholder greek columns; imputation() overwrites them with predictions.
# The balanced accuracies it prints are computed against these placeholders
# and carry no information for the test set.
test_greeks = np.zeros((X_test.shape[0], len(greek_columns)))
X_test_greeks = np.append(X_test, test_greeks, axis=1)

X_test_greeks_imputed = imputation(train_set=X_greeks_binary, test_set=X_test_greeks)

dtest = xgb.DMatrix(X_test_greeks_imputed)

In [None]:
# Train the final model on the complete training matrix in the same layout as
# dtest (base features followed by the binarised greek columns). Reusing
# X_train / y_train here would pick up whatever the last CV fold left behind,
# which does not have the greek columns that dtest contains.
sampler_model = RandomOverSampler(random_state=42)
X_re, y_re = sampler_model.fit_resample(X_greeks_binary, y)

dtrain = xgb.DMatrix(X_re, y_re)

# `best_params` is not defined anywhere in this notebook. Use it when a tuning
# step has provided it, otherwise fall back to the parameters of the CV cell.
final_params = globals().get("best_params", params)

model = xgb.train(
    params=final_params,
    dtrain=dtrain,
    obj=balancedlogloss,
    evals=[(dtrain, "dtrain")],
    feval=balancedlogloss_eval,
    verbose_eval=20,
    num_boost_round=200,
)

# The custom objective works on raw scores, so map them to probabilities.
p = expit(model.predict(dtest))
p = pd.Series(p)

pred_1 = p
pred_0 = 1 - pred_1

submission = pd.DataFrame(index=test.index, columns=sample_submission.columns)
submission["Id"] = test["Id"]
submission["class_0"] = pred_0
submission["class_1"] = pred_1

submission.to_csv("submission.csv", index=False)

submission