# Imports and configuration

In [2]:
import pandas as pd, xgboost as xgb, lightgbm as lgbm, sklearn, numpy as np
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_sample_weight



In [3]:
# Lift pandas' display truncation: None removes the cap on how many columns and rows are rendered.
for display_limit_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_limit_option, None)

# Data retrieval and preparation

In [4]:
def read_train_and_prediction_request(data_dir='../input/icr-identify-age-related-conditions'):
    """Read the training table and the table to predict on, both indexed by ``Id``.

    Parameters
    ----------
    data_dir : str or path-like, default '../input/icr-identify-age-related-conditions'
        Directory containing ``train.csv`` and ``test.csv``. The default is the
        location this notebook originally hard-coded; pass another directory to
        run the notebook against a different copy of the data.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, prediction_request)``: the contents of ``train.csv`` and
        ``test.csv`` respectively, each using its ``Id`` column as the index.

    Raises
    ------
    FileNotFoundError
        If either CSV file is missing from ``data_dir``.
    """
    # Plain string formatting keeps the default paths identical to the original literals.
    train = pd.read_csv(f'{data_dir}/train.csv', index_col='Id')
    prediction_request = pd.read_csv(f'{data_dir}/test.csv', index_col='Id')
    return train, prediction_request

In [7]:
# Both tables are read again every time this cell runs.
train, prediction_request = read_train_and_prediction_request()

# The target is split off before preprocessing because labels cannot be handled by column transformers.
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html
y = train['Class']
x = train.drop(columns='Class')
numerical_column_names = x.select_dtypes(include=np.number).columns.tolist()

# One-hot encoding emits one column per category. It needs no prior handling of missing values,
# but it will error if unknown values are passed.
category_encoder = OneHotEncoder(sparse_output=False, dtype='int')
# Rescaling is learned from the training columns only, so prediction-request values outside the
# training range land outside 0..1 (visible as negative values in the displayed output).
numerical_rescaler = sklearn.preprocessing.MinMaxScaler()

column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encode_categories', category_encoder, ['EJ']),
        ('rescale_numericals_from_0_to_1', numerical_rescaler, numerical_column_names),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,  # Keep the original column names instead of prefixing them with the step name.
).set_output(transform='pandas')  # Return DataFrames rather than arrays.

# Fit on the training features, then apply the already-fitted transformer to the prediction request.
x = column_transformer.fit_transform(x)
prediction_request = column_transformer.transform(prediction_request)

display(x.head(3), prediction_request.head(3))

Unnamed: 0_level_0,EJ_A,EJ_B,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
000ff2bfdfe9,0,1,0.021082,0.102347,0.0,0.030632,0.0,0.0,0.0,0.180337,0.002958,0.047364,0.654545,0.04324,0.440929,0.004312,0.0,0.015372,0.098469,0.0,0.021656,0.091892,0.0,0.0,0.0,0.241834,0.507476,0.305651,0.125554,0.0,0.238606,0.029338,0.234167,0.412931,0.03288,0.0,0.147697,0.026299,0.094302,0.041505,0.022225,0.245682,0.0,0.0,0.001939,0.006316,0.052697,0.0,0.05172,0.000998,0.000866,0.054959,0.0,0.013846,0.176983,0.362261,0.005425
007255e47698,1,0,0.010541,0.027589,0.0,0.053864,0.0,0.078048,0.0,0.2845,0.0,0.074042,0.490909,0.034915,0.0391,0.0,0.0,0.007873,0.078406,0.044646,0.027807,0.127928,0.002077,0.353002,0.057157,0.253295,0.529577,0.314082,0.068953,0.0,0.316354,0.050913,0.207033,0.552387,0.0,0.0,0.108335,0.0,0.031732,0.030797,0.0,1.0,0.017144,0.007454,0.003189,0.0,0.036862,0.209978,0.0,0.0,0.016014,0.039418,0.0,0.194527,0.274495,0.164135,1.0
013f2bd269f5,0,1,0.063949,0.085715,0.0,0.046519,0.0,0.160575,0.0,0.265013,0.0,0.067011,0.854545,0.023597,0.634957,0.002405,0.0,0.008882,0.081244,0.102545,0.024273,0.14955,0.0,0.212468,0.100648,0.181089,0.250979,0.314187,0.138061,0.0,0.16622,0.060493,0.174504,0.384532,0.007959,0.0,0.237922,0.032069,0.435754,0.043532,0.00879,1.0,0.0,0.00024,0.071672,0.005353,0.047831,0.249274,0.054706,0.000385,0.036139,0.250424,0.011229,0.095035,0.258994,0.180218,0.00891


Unnamed: 0_level_0,EJ_A,EJ_B,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
00eed32682bb,1,0,-0.013352,-0.006759,-0.046687,-0.005065,-0.047649,-0.018628,-0.002486,-0.095483,-0.000841,-0.032971,-0.509091,-0.030717,-0.003877,-0.000286,-0.005166,-0.005533,-0.04505,-0.038331,-0.002549,-0.014414,-0.034279,-0.023305,-0.054234,-0.028653,-0.122291,-0.033951,-0.017413,-0.006338,-0.040214,-0.060906,-0.032751,-0.112266,-3.4e-05,-0.074332,-0.005306,-0.054718,-0.015866,-0.006174,-7.1e-05,-0.052007,-0.079717,-0.000589,-0.002492,-0.000188,-0.011034,-0.111054,-0.001257,-0.0004,-0.002164,-0.031153,-0.050964,-9.1e-05,-0.131415,-0.004717,-5.1e-05
010ebe33f668,1,0,-0.013352,-0.006759,-0.046687,-0.005065,-0.047649,-0.018628,-0.002486,-0.095483,-0.000841,-0.032971,-0.509091,-0.030717,-0.003877,-0.000286,-0.005166,-0.005533,-0.04505,-0.038331,-0.002549,-0.014414,-0.034279,-0.023305,-0.054234,-0.028653,-0.122291,-0.033951,-0.017413,-0.006338,-0.040214,-0.060906,-0.032751,-0.112266,-3.4e-05,-0.074332,-0.005306,-0.054718,-0.015866,-0.006174,-7.1e-05,-0.052007,-0.079717,-0.000589,-0.002492,-0.000188,-0.011034,-0.111054,-0.001257,-0.0004,-0.002164,-0.031153,-0.050964,-9.1e-05,-0.131415,-0.004717,-5.1e-05
02fa521e1838,1,0,-0.013352,-0.006759,-0.046687,-0.005065,-0.047649,-0.018628,-0.002486,-0.095483,-0.000841,-0.032971,-0.509091,-0.030717,-0.003877,-0.000286,-0.005166,-0.005533,-0.04505,-0.038331,-0.002549,-0.014414,-0.034279,-0.023305,-0.054234,-0.028653,-0.122291,-0.033951,-0.017413,-0.006338,-0.040214,-0.060906,-0.032751,-0.112266,-3.4e-05,-0.074332,-0.005306,-0.054718,-0.015866,-0.006174,-7.1e-05,-0.052007,-0.079717,-0.000589,-0.002492,-0.000188,-0.011034,-0.111054,-0.001257,-0.0004,-0.002164,-0.031153,-0.050964,-9.1e-05,-0.131415,-0.004717,-5.1e-05


# Model creation, training, and submission

In [8]:
# Keyword arguments for xgb.XGBClassifier.
# These parameters were obtained by optuna hyperparameter tuning with stratified k-fold crossvalidation.
# NOTE(review): the XGBoost parameter docs list 'eta' and 'learning_rate' as aliases of the same
# setting, yet both are given here with different values (0.0554 vs 0.0177). Which one the trained
# model actually uses is not visible from this notebook -- confirm and drop the other.
# NOTE(review): 'scale_pos_weight' is set here and the fit call in this cell also passes 'balanced'
# sample weights, so class imbalance is adjusted in two places -- confirm this matches the tuning setup.
model_creation_params= {
   'verbosity': 0,
   'nthread': None,  # No explicit thread count is requested.
   'objective': 'binary:logistic',
   'eval_metric': 'logloss',
   'booster': 'gbtree',
   'alpha': 0.00040510453065869997,  # L1 regularization (XGBoost alias of reg_alpha).
   'lambda': 3.198576520498575e-05,  # L2 regularization (XGBoost alias of reg_lambda).
   'learning_rate': 0.05538695716630914,  # See NOTE(review) above: conflicts with 'eta'.
   'n_estimators': 250,
   'random_state': 42,  # Fixed seed.
   'eta': 0.017670343976603234,  # See NOTE(review) above: conflicts with 'learning_rate'.
   'gamma': 6.307111169468931e-07,
   'max_depth': 5,
   'min_child_weight': 5.734858377948545,
   'max_delta_step': 3.2806533102798117,
   'subsample': 0.4245697631042182,
   'sampling_method': 'uniform',
   'colsample_bytree': 0.9279384240066151,
   'colsample_bylevel': 0.9559142848417146,
   'colsample_bynode': 0.6555824396664807,
   'tree_method': 'exact',
   'scale_pos_weight': 0.9547224624671352
}

# Train one classifier on the full training set, passing per-row 'balanced' sample weights computed from y.
clf1 = xgb.XGBClassifier(**model_creation_params)
balanced_row_weights = compute_sample_weight('balanced', y)
clf1.fit(x, y, sample_weight=balanced_row_weights)

# Class probabilities for the prediction request, held in a frame with positional column labels (0, 1).
predictions1 = pd.DataFrame(clf1.predict_proba(prediction_request))

# Submission table: columns 0 and 1 relabelled as class_0 / class_1 and keyed by the
# prediction request's index.
submission = (
    predictions1[[0, 1]]
    .rename(columns={0: 'class_0', 1: 'class_1'})
    .set_index(prediction_request.index)
)
display(submission)
submission.to_csv('submission.csv')

Unnamed: 0_level_0,class_0,class_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
00eed32682bb,0.588577,0.411423
010ebe33f668,0.588577,0.411423
02fa521e1838,0.588577,0.411423
040e15f562a2,0.588577,0.411423
046e85c7cc7f,0.588577,0.411423
