In [1]:
#Standard Imports
import os

#Third Party Imports
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, make_scorer
import xgboost as xgb
#import fivecentplots as fcp #clean data vis library for easy pandas integration | will need to remove for submission
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
from sklearn.impute import KNNImputer

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


In [2]:
# Load the competition tables one at a time. For each, keep the 'Id' column
# aside (it is an identifier, not a feature) and drop it from the frame.
train_raw = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
train_ID = train_raw['Id']
train_raw = train_raw.drop(columns='Id')

test_raw = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test_ID = test_raw['Id']
test_raw = test_raw.drop(columns='Id')

# Preview the first rows to confirm the load worked
train_raw.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [3]:
# Count the missing values of every column once, then print one line per column
missing_per_column = train_raw.isna().sum()
for col, n_missing in missing_per_column.items():
    print(f"NAN in {col}:  ", n_missing)

NAN in AB:   0
NAN in AF:   0
NAN in AH:   0
NAN in AM:   0
NAN in AR:   0
NAN in AX:   0
NAN in AY:   0
NAN in AZ:   0
NAN in BC:   0
NAN in BD :   0
NAN in BN:   0
NAN in BP:   0
NAN in BQ:   60
NAN in BR:   0
NAN in BZ:   0
NAN in CB:   2
NAN in CC:   3
NAN in CD :   0
NAN in CF:   0
NAN in CH:   0
NAN in CL:   0
NAN in CR:   0
NAN in CS:   0
NAN in CU:   0
NAN in CW :   0
NAN in DA:   0
NAN in DE:   0
NAN in DF:   0
NAN in DH:   0
NAN in DI:   0
NAN in DL:   0
NAN in DN:   0
NAN in DU:   1
NAN in DV:   0
NAN in DY:   0
NAN in EB:   0
NAN in EE:   0
NAN in EG:   0
NAN in EH:   0
NAN in EJ:   0
NAN in EL:   60
NAN in EP:   0
NAN in EU:   0
NAN in FC:   1
NAN in FD :   0
NAN in FE:   0
NAN in FI:   0
NAN in FL:   1
NAN in FR:   0
NAN in FS:   2
NAN in GB:   0
NAN in GE:   0
NAN in GF:   0
NAN in GH:   0
NAN in GI:   0
NAN in GL:   1
NAN in Class:   0


In [4]:
# Separate out our target feature: the last column is the target, the rest are features.
# .copy() gives X its own data, so encoding EJ below neither writes through to
# train_raw nor triggers pandas' SettingWithCopyWarning.
X = train_raw.iloc[:, :-1].copy()
y = train_raw.iloc[:, -1]

# Now is a good time to confirm a few things about our data:
# 1: let's check how our categorical column looks so we can decide if we need a new approach
# 2: let's check the distribution of our target feature

print("Categorical spread for EJ:\n", X.EJ.value_counts())
print("Distribution of target:\n", y.value_counts())

# Encode our only categorical column. The encoder is fitted on the training
# data ONLY; the same fitted mapping is reused for the test set further down.
encoder = LabelEncoder()
X['EJ'] = encoder.fit_transform(X['EJ'])

# Impute NaN values. The imputer is likewise fitted on the training data only.
imputer = KNNImputer(n_neighbors=2, weights='uniform')
X_imp = imputer.fit_transform(X)
X = pd.DataFrame(X_imp, columns=X.columns, index=pd.Index(train_ID, name='Id'))

# Prepare the test features used to generate the prediction CSV.
# Work on a copy (test_raw stays untouched, so this cell can be re-run) and
# select the columns in training order so both matrices line up.
X_val = test_raw[X.columns].copy()
# transform(), not fit_transform(): re-fitting on the test set could assign
# different integer codes to EJ (e.g. when only one category is present) and
# would impute from different neighbours than the model was trained with.
# An EJ value never seen in training raises ValueError here instead of being
# silently mis-encoded.
X_val['EJ'] = encoder.transform(X_val['EJ'])
X_val_imp = imputer.transform(X_val)
X_val = pd.DataFrame(X_val_imp, columns=X_val.columns, index=pd.Index(test_ID, name='Id'))

#define problem specific balanced log loss function
def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    Every sample is weighted by the inverse of its class frequency, so each
    class that is present contributes equally to the score regardless of how
    many samples it has. Implemented with NumPy only, so it does not depend on
    the ``eps`` argument of ``sklearn.metrics.log_loss`` and also works when
    ``y_true`` contains a single class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels, each 0 or 1. A pandas Series with any index is
        accepted; only its values are used.
    y_pred : array-like of shape (n_samples,) or (n_samples, 2)
        Predicted probability of class 1. For a two-column input the second
        column is taken as the class-1 probability.

    Returns
    -------
    float
        Weighted mean of the per-sample negative log-likelihoods.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``y_true`` holds a
        label other than 0 or 1.
    """
    eps = 1e-15  # same clipping bound the original call passed to log_loss

    labels = np.asarray(y_true).ravel().astype(int)
    probs = np.asarray(y_pred, dtype=float)
    if probs.ndim == 2 and probs.shape[1] == 2:
        # predict_proba-style input: column 1 is P(class == 1)
        probs = probs[:, 1]
    probs = probs.ravel()

    if labels.size == 0:
        raise ValueError("balanced_log_loss needs at least one sample")
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {labels.shape[0]} vs {probs.shape[0]}"
        )
    if np.any((labels != 0) & (labels != 1)):
        raise ValueError("balanced_log_loss supports binary labels 0/1 only")

    # Keep probabilities away from exactly 0 and 1 so the logs stay finite
    probs = np.clip(probs, eps, 1 - eps)

    # Inverse class-frequency weights; only classes that occur are indexed,
    # so an absent class never causes a division by zero
    nc = np.bincount(labels, minlength=2)
    weights = 1.0 / nc[labels]

    per_sample = -np.where(labels == 1, np.log(probs), np.log1p(-probs))
    return float(np.average(per_sample, weights=weights))

Categorical spread for EJ:
 B    395
A    222
Name: EJ, dtype: int64
istribution of target:
 0    509
1    108
Name: Class, dtype: int64


In [5]:
%%time
#for testing ONLY
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

rf_params = {'max_depth': None, 'min_samples_split': 3, 'n_estimators': 120, 'class_weight': 'balanced',}
lgb_params = {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 120, 'num_leaves': 7, 'class_weight': 'balanced',}
xgb_params= {'learning_rate': 0.7,
                'max_depth': 2,
                'tree_method': 'gpu_hist',
                'predictor': 'gpu_predictor',
                'gpu_id': 0,
                'min_child_weight': 7,
                'max_bin': 100,
                'scale_pos_weight': 4.7, 
            }

# Define the base models
base_models = [
    ('xgb', XGBClassifier(**xgb_params)),
    ('rf', RandomForestClassifier(**rf_params)),
    ('lgb', LGBMClassifier(**lgb_params))
]

# Define meta learner model
meta_model = LogisticRegression()

# Define the stacking ensemble
stacking_model = StackingClassifier(estimators=base_models, cv=5)

# For storing out-of-fold predictions
train_oof = np.zeros((X.shape[0],))


NUM_FOLDS = 5
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Same cross-validation scheme
for f, (train_ind, val_ind) in enumerate(kf.split(X, y)):
    train_df, val_df = X.iloc[train_ind], X.iloc[val_ind]
    train_target, val_target = y[train_ind], y[val_ind]
    
    # Fit the stacking classifier and make predictions
    stacking_model.fit(train_df, train_target)
    
    train_oof[val_ind] = stacking_model.predict_proba(val_df)[:, 1]
    
    print(f"Fold {f} log loss: {balanced_log_loss(val_target, train_oof[val_ind])}")

print(f"Overall log loss: {balanced_log_loss(y, train_oof)}")


Fold 0 log loss: 0.35821913286107665
Fold 1 log loss: 0.46156309344135243
Fold 2 log loss: 0.404855312920086
Fold 3 log loss: 0.2667457258282674
Fold 4 log loss: 0.2522961504372501
Overall log loss: 0.34913648107487777
CPU times: user 21.2 s, sys: 619 ms, total: 21.8 s
Wall time: 18.3 s


In [6]:
# Final training: refit the stacking ensemble on the full training set (X, y).
# This refit model is the one the next cell uses to predict the test rows.
stacking_model.fit(X, y)

In [7]:
# Class probabilities for the test rows: column 0 is read as class_0 and
# column 1 as class_1 below.
preds = stacking_model.predict_proba(X_val)

# Assemble the submission table in a single constructor call
submission = pd.DataFrame(
    {
        "Id": X_val.index,
        "class_0": preds[:, 0],
        "class_1": preds[:, 1],
    }
)


In [8]:
# Preview the first rows of the submission table as a sanity check
submission.head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.839452,0.160548
1,010ebe33f668,0.839452,0.160548
2,02fa521e1838,0.839452,0.160548
3,040e15f562a2,0.839452,0.160548
4,046e85c7cc7f,0.839452,0.160548


In [9]:
# Write the submission file: header row included, row index left out
submission.to_csv(
    "submission.csv",
    index=False,
    header=True,
)