In [54]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [55]:
# Competition inputs, read from the Kaggle-mounted dataset directory.
raw = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
# NOTE(review): `greeks` is loaded but not referenced by any other visible cell
# (the only use, a merge into `df`, is commented out).
greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")

In [56]:
# Work on a copy of the training data so `raw` itself is not modified by later cells.
# Merging in the `greeks` table is currently disabled:
# df = pd.merge(raw, greeks, on="Id")
df = raw.copy()

In [57]:
# https://www.kaggle.com/code/datafan07/icr-simple-eda-baseline
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the average of the two per-class mean log losses.

    Args:
        y_true: 1-D array-like of non-negative integer labels. 0 is the negative
            class; the positive-class count is taken from label 1.
        y_pred: 1-D array of predicted probabilities for the positive class.

    Returns:
        ``(mean log loss on class 0 + mean log loss on class 1) / 2``.

    Raises:
        IndexError: if ``y_true`` contains only zeros, because ``np.bincount``
            then returns no entry for class 1.
    """
    y_true = np.asarray(y_true)

    # Column 0 holds P(class 0), column 1 holds P(class 1).
    y_pred = np.stack([1 - y_pred, y_pred]).T
    # Keep probabilities away from 0 and 1 so the logarithm stays finite.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    # Re-normalise each row after clipping. This quotient used to be computed
    # and thrown away; it has to be assigned back to take effect.
    y_pred = y_pred / np.sum(y_pred, axis=1)[:, None]

    nc = np.bincount(y_true)  # nc[k] = number of samples with label k

    negative_mask = np.where(y_true == 0, 1, 0)
    positive_mask = np.where(y_true != 0, 1, 0)
    negative_loss = -1 / nc[0] * np.sum(negative_mask * np.log(y_pred[:, 0]))
    positive_loss = -1 / nc[1] * np.sum(positive_mask * np.log(y_pred[:, 1]))

    logloss = (negative_loss + positive_loss) / 2

    return logloss


In [58]:
# clean features
def clean_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left unmodified.

    Steps:
      1. Remove spaces from column names (e.g. ``'BD '`` -> ``'BD'``).
      2. Fill missing values in numeric columns with that column's mean,
         computed on ``df`` itself.
      3. Encode ``EJ`` as 1 where it equals ``"A"`` and 0 otherwise.

    The ``EJ`` encoding is skipped when the column is already numeric, so
    passing the function its own output (for example when a notebook cell is
    re-run) does not wipe the column to all zeros.

    Args:
        df: frame containing an ``EJ`` column.

    Returns:
        A new cleaned DataFrame.

    Raises:
        KeyError: if ``df`` has no ``EJ`` column.
    """
    # Copy first: assigning to `df.columns` directly would rename the caller's frame.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.replace(' ', '')  # remove spaces
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))
    if not pd.api.types.is_numeric_dtype(cleaned["EJ"]):
        cleaned["EJ"] = np.where(cleaned["EJ"] == "A", 1, 0)
    return cleaned

In [59]:
# Inspect the raw column names. Several carry a trailing space ('BD ', 'CD ', 'CW ', 'FD ');
# clean_features() removes those spaces.
df.columns

Index(['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],
      dtype='object')

In [60]:
# Apply the cleaning steps and rebind `df` to the result.
# NOTE(review): this cell feeds `df` back into itself. If it has to be re-run,
# re-run the `df = raw.copy()` cell first so cleaning starts from the raw data again.
df = df.pipe(clean_features)

In [61]:
# Continuous predictors: every numeric column except the 0/1-encoded EJ flag and the Class target.
numeric_features = df.select_dtypes(include=[np.number]).columns.drop(["EJ", "Class"])


In [62]:
# Cap extreme values of every continuous feature. Bounds are taken from the
# un-capped frame `df`: ceiling = 2.5 x 75th percentile, floor = 0.5 x 25th percentile.
df_clean = df.copy()
index = []  # labels of rows with at least one capped value (de-duplicated after the loop)

for col in numeric_features:
    q25, q75 = np.percentile(df[col], [25, 75])
    floor, ceiling = q25 * 0.5, q75 * 2.5

    above = df_clean[col] > ceiling
    below = df_clean[col] < floor
    index.extend(df_clean.index[(above | below).to_numpy()])

    df_clean.loc[above, col] = ceiling
    # The low-side mask is evaluated only after the ceiling has been applied.
    df_clean.loc[df_clean[col] < floor, col] = floor

index = list(set(index))

In [63]:
# Per-feature capping bounds computed from `df`, one row per feature name:
#   "25th" -> 0.5 x 25th percentile (lower bound)
#   "75th" -> 2.5 x 75th percentile (upper bound)
pct_25, pct_75 = np.percentile(df[numeric_features], [25, 75], axis=0)
df_25_75 = pd.DataFrame(
    {"25th": pct_25 * 0.5, "75th": pct_75 * 2.5},
    index=numeric_features,
)

In [64]:
# Model inputs: every column except the row identifier and the target.
# Selecting by name instead of the positional slice [1:-1] means this no longer
# relies on `Id` being the first column and `Class` being the last.
FEATURES = df.columns.drop(["Id", "Class"])

In [65]:
# Report how many class-0 rows there are for every class-1 row.
target = df_clean["Class"]
n_class_0 = int((target == 0).sum())
n_class_1 = int((target == 1).sum())
print(f"Out classes are imbalanced in a ration :{n_class_0/n_class_1} to 1 ")

Out classes are imbalanced in a ration :4.712962962962963 to 1 


In [66]:
# import train test split
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping

In [67]:
FOLDS = 10  # number of cross-validation splits
SEED = 42  # base random seed: used as the LightGBM `seed` and, offset by FOLDS, for the fold shuffle

In [68]:
# LightGBM settings are identical for every fold, so build them once up front.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'goss',
    'learning_rate': 0.09110460114828077,
    'num_leaves': 8,
    'feature_fraction': 0.4989639912997521,
    'bagging_fraction': 0.54872439795985,
    'lambda_l1': 1.4522184914523175,
    'lambda_l2': 1.7873553090132748e-08,
    'n_jobs': -1,
    'is_unbalance': True,
    'verbose': -1,
    'seed': SEED,
}

X_all = df_clean[FEATURES]
y_all = df_clean["Class"]

# NOTE(review): plain KFold does not stratify on the labels passed to split(),
# so the class ratio can vary from fold to fold.
simple_folds = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED+FOLDS)
oof = np.zeros(len(df_clean))  # out-of-fold positive-class probabilities, by row position
logloss = []  # balanced log loss of each fold

for fold, (train_idx, valid_idx) in enumerate(simple_folds.split(X_all, y_all)):
    print("-"* 20 + f"Fold {fold+1}" + "-"* 20)

    # KFold yields *positional* indices, so slice with .iloc. Label-based .loc
    # only gives the same rows while the frame has a default 0..n-1 index.
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    train_dataset = lgb.Dataset(X_train, y_train, categorical_feature=["EJ"])
    eval_dataset = lgb.Dataset(X_valid, y_valid, categorical_feature=["EJ"])

    # A fresh early-stopping callback is created for every fold.
    early_stopping_callback = early_stopping(stopping_rounds=500, first_metric_only=True, verbose=True)

    model = lgb.train(
        params=dict(lgb_params),  # shallow copy keeps the shared dict untouched
        train_set=train_dataset,
        num_boost_round=50000,
        valid_sets=[train_dataset, eval_dataset],
        callbacks=[early_stopping_callback],
    )

    oof[valid_idx] = model.predict(X_valid)
    logloss.append(balance_logloss(y_valid, oof[valid_idx]))

# After the loop, `model` refers to the booster trained on the final fold only.

--------------------Fold 1--------------------
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[776]	training's binary_logloss: 0.011023	valid_1's binary_logloss: 0.0549778
Evaluated only: binary_logloss
--------------------Fold 2--------------------
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[1478]	training's binary_logloss: 0.00994793	valid_1's binary_logloss: 0.120251
Evaluated only: binary_logloss
--------------------Fold 3--------------------
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[2020]	training's binary_logloss: 0.00935658	valid_1's binary_logloss: 0.0651712
Evaluated only: binary_logloss
--------------------Fold 4--------------------
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[144]	training's binary_logloss: 0.0367969	valid_1's binary_logloss: 0.19953
Evaluated only:

In [69]:
# Balanced log loss of each validation fold, in fold order.
logloss

[0.12256425093951392,
 0.22194144090675566,
 0.056962669327990406,
 0.3798453298775313,
 0.09040772162656344,
 0.350264316854682,
 0.20750179077246367,
 0.3277011817084696,
 0.305760188262462,
 0.1671273176504795]

In [70]:
# Balanced log loss over all out-of-fold predictions pooled together.
y_all_true = df["Class"].to_numpy()
cv_logloss = balance_logloss(y_all_true, oof)
print("CV_score:", cv_logloss)

CV_score: 0.2360680627035537


## Submission

In [71]:
# Remove the stray spaces from the column names first, so the training
# statistics used below line up with the test columns by name.
test = test.rename(columns=lambda name: name.replace(" ", ""))
# Impute missing test values with the *training* means. clean_features() on its
# own would fill them with the test set's own column means, which differs from
# what the model saw during training and leaves NaN behind when a test column
# has no observed value at all.
test = test.fillna(df[numeric_features].mean())
test = test.pipe(clean_features)

# Cap each continuous feature at the bounds stored in df_25_75.
for col in numeric_features:
    lower_bound = df_25_75.loc[col, "25th"]
    upper_bound = df_25_75.loc[col, "75th"]
    test.loc[test[col] > upper_bound, col] = upper_bound
    test.loc[test[col] < lower_bound, col] = lower_bound

In [72]:
# Score the test set. `model` is whatever booster the cross-validation loop left
# behind on its last iteration, so only that single fold's model contributes.
# NOTE(review): consider averaging the predictions of all fold models instead.
predictions = model.predict(test[FEATURES])

In [73]:
# NOTE(review): the recorded output shows the same probability for every row;
# presumably the public test.csv holds placeholder rows. Confirm before reading anything into it.
predictions

array([0.64969683, 0.64969683, 0.64969683, 0.64969683, 0.64969683])

In [74]:
# One row per test Id; class_0 is the complement of the predicted class_1 probability.
submission = pd.DataFrame(
    {
        "Id": test["Id"],
        "class_0": 1 - predictions,
        "class_1": predictions,
    }
)

In [75]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.350303,0.649697
1,010ebe33f668,0.350303,0.649697
2,02fa521e1838,0.350303,0.649697
3,040e15f562a2,0.350303,0.649697
4,046e85c7cc7f,0.350303,0.649697


In [76]:
# Write the submission to the working directory, omitting the DataFrame index column.
submission.to_csv("submission.csv", index=False)