In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
#Feature importances
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [27]:
# https://www.kaggle.com/code/datafan07/icr-simple-eda-baseline
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the mean of the two per-class average log losses.

    Each class contributes equally to the score regardless of how many
    samples it has.

    Args:
        y_true: 1-D array-like of non-negative integer labels (0 = negative,
            anything else is treated as positive).
        y_pred: 1-D array-like with the predicted probability of the
            positive class for each sample.

    Returns:
        The balanced log loss as a float.

    Raises:
        IndexError: if ``y_true`` contains only zeros, because the
            positive-class count ``nc[1]`` does not exist.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # (n_samples, 2) matrix holding [P(class 0), P(class 1)] per row.
    proba = np.stack([1 - y_pred, y_pred]).T
    # Keep probabilities away from 0 and 1 so log() stays finite.
    proba = np.clip(proba, 1e-15, 1 - 1e-15)
    # Re-normalise each row after clipping; the quotient has to be assigned,
    # a bare `proba / ...` expression would be silently thrown away.
    proba = proba / np.sum(proba, axis=1)[:, None]

    nc = np.bincount(y_true)  # nc[k] = number of samples with label k
    is_negative = y_true == 0

    negative_term = -np.sum(np.log(proba[is_negative, 0])) / nc[0]
    positive_term = -np.sum(np.log(proba[~is_negative, 1])) / nc[1]

    return (negative_term + positive_term) / 2



In [28]:
# Load the competition CSV files from the local ./data directory.
raw = pd.read_csv("./data/train.csv")
# `greeks` is only referenced by the commented-out merge in the next cell.
greeks = pd.read_csv("./data/greeks.csv")
test = pd.read_csv("./data/test.csv")

In [29]:
# Optional: join the greeks metadata onto the training rows (currently disabled).
# df = pd.merge(raw, greeks, on="Id")
# Work on a copy so that `raw` keeps the data exactly as it was read from disk.
df = raw.copy()

In [30]:
# clean features
def clean_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the frame passed in is left untouched.

    Cleaning steps:
      1. Remove spaces from the column names.
      2. Fill missing values in numeric columns with that column's mean,
         computed on ``df`` itself.
      3. Encode ``EJ`` as 1 where it equals ``"A"`` and 0 otherwise.

    Args:
        df: Frame with string column names that contains an ``EJ`` column.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Copy first: assigning to `df.columns` directly would rename the columns
    # of the caller's DataFrame as a side effect.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.replace(' ', '')  # remove spaces
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))
    cleaned["EJ"] = np.where(cleaned["EJ"] == "A", 1, 0)
    return cleaned

In [31]:
# Apply the cleaning step to the training frame.
df = clean_features(df)

## Define Features and Target

In [32]:
# Model inputs: every column except the row identifier and the target.
# Selecting by name rather than by position keeps this correct if extra
# columns are appended or the column order changes.
FEATURES = df.columns.drop(["Id", "Class"])

## Bagging and Kfolds

In [33]:
# Target column and the ratio of negative (0) to positive (1) samples.
target = df["Class"]
print(
    "Out classes are imbalanced in a ration :{} to 1 ".format(
        len(target[target == 0]) / len(target[target == 1])
    )
)

Out classes are imbalanced in a ration :4.712962962962963 to 1 


In [34]:
# import train test split
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping

In [35]:
# Number of cross-validation splits.
FOLDS = 10
# Base random seed: passed to LightGBM as `seed` and, offset by FOLDS, to the fold splitter.
SEED = 42

In [36]:
# K-fold cross-validation of a LightGBM binary classifier.
# Produces:
#   oof         - out-of-fold predicted probability for every training row
#   logloss     - balanced log loss of each validation fold, in fold order
#   fold_models - the fitted booster of each fold, kept for later inference
#
# NOTE: KFold ignores the labels passed to split(), so the folds are shuffled
# but not stratified by class.
simple_folds = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED+FOLDS)

X_all = df[FEATURES]
y_all = df["Class"]

oof = np.zeros(len(df))
logloss = []
fold_models = []

# The hyper-parameters are the same for every fold, so build the dict once.
lgb_params = {
    'objective': 'binary',
    'boosting': 'goss',
    'learning_rate': 0.0883447499631696,
    'num_leaves': 4,
    'feature_fraction': 0.5014338346504184,
    'bagging_fraction': 0.8486891010640193,
    'lambda_l1': 3.264832774300416e-06,
    'lambda_l2': 8.605058359426325e-07,
    'n_jobs': -1,
    'is_unbalance': True,
    'verbose': -1,
    'seed': SEED,
}

for fold, (train_idx, valid_idx) in enumerate(simple_folds.split(X_all, y_all)):
    # A fresh callback per fold so no early-stopping state carries over.
    early_stopping_callback = early_stopping(stopping_rounds=300, first_metric_only=True, verbose=True)
    print("-"* 20 + f"Fold {fold+1}" + "-"* 20)

    # split() yields positional indices, so select rows with .iloc; .loc would
    # only pick the same rows while the frame has a default 0..n-1 index.
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    train_dataset = lgb.Dataset(X_train, y_train, categorical_feature=["EJ"])
    eval_dataset = lgb.Dataset(X_valid, y_valid, categorical_feature=["EJ"])

    model = lgb.train(
        params=lgb_params,
        train_set=train_dataset,
        num_boost_round=50000,
        valid_sets=[train_dataset, eval_dataset],
        callbacks=[early_stopping_callback],
    )
    fold_models.append(model)

    oof[valid_idx] = model.predict(X_valid)
    logloss.append(balance_logloss(y_valid, oof[valid_idx]))

--------------------Fold 1--------------------
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[321]	training's binary_logloss: 0.00874041	valid_1's binary_logloss: 0.0728333
Evaluated only: binary_logloss
--------------------Fold 2--------------------
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[321]	training's binary_logloss: 0.00852636	valid_1's binary_logloss: 0.115718
Evaluated only: binary_logloss
--------------------Fold 3--------------------
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[992]	training's binary_logloss: 1.06573e-05	valid_1's binary_logloss: 0.0465939
Evaluated only: binary_logloss
--------------------Fold 4--------------------
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[123]	training's binary_logloss: 0.0689367	valid_1's binary_logloss: 0.200273
Evaluated onl

### CV logloss

In [37]:
logloss

[0.18187420039410832,
 0.24744310633008815,
 0.03333962055805296,
 0.34644534512908953,
 0.0779410179690335,
 0.3795339352399518,
 0.23124057944740933,
 0.44983306373935716,
 0.3342433645327101,
 0.1952212715356735]

In [38]:
# Balanced log loss over all out-of-fold predictions pooled together.
cv_logloss = balance_logloss(df["Class"].to_numpy(), oof)
print(f"CV_score: {cv_logloss}")

CV_score: 0.2610785714107777


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of the out-of-fold predictions, labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.hist(oof, bins=100)
ax.set(
    title="Out-of-fold predictions",
    xlabel="Predicted probability of Class = 1",
    ylabel="Number of samples",
)
plt.show()

# Submission

In [None]:
# Clean the test frame with the same routine as the training data.
# clean_features imputes with the means of the frame it receives, so missing
# test values are filled with test-set means.
test = clean_features(test)

In [None]:
# Average the positive-class probability over every model of every bag.
# NOTE(review): `BAGS` and `models` are not defined in any visible cell. The
# CV loop trains LightGBM boosters with `lgb.train` and does not expose them
# under the name `models`, and `predict_proba` suggests this cell was written
# for scikit-learn-style XGBoost classifiers. As shown, it looks like it would
# raise NameError on a fresh kernel; confirm and rewire to the fitted
# LightGBM models before running.
predictions = np.zeros(len(test))
for bag in range(BAGS):
    for xgb_c in models[bag]:
        X_test = test[FEATURES]
        predictions += xgb_c.predict_proba(X_test)[:,1]/(BAGS*FOLDS)

In [None]:
predictions

In [None]:
# Submission table: the row Id plus the probability of each class.
submission = test[["Id"]].assign(
    class_0=1 - predictions,
    class_1=predictions,
)

In [None]:
submission.head()

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
submission.to_csv("submission.csv", index=False)