# Intro
After seeing 'simple' neural networks weren't performing that well. I decided to look for the best publicly available neural network on the competition and learn for it. it was : https://www.kaggle.com/code/dngngy/icr-tensorflow-multimodel


I simplified, cleaned and adapted its code a bit, and analyzed it. The key takeaway is that for neural networks to be closer to competitive for tabular data ensembles were of great help.

In [1]:
def create_tf_ffn_preds():
    """Process the dataframe, create the stratified kfolds, create the model, and use it to make the submissions"""
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    import tensorflow as tf
    from tensorflow.keras import Sequential, layers, Input, Model
    from tensorflow.experimental import numpy as np
    from tensorflow.keras.losses import Loss
    from tensorflow.keras import backend as K
    from tensorflow.keras.callbacks import EarlyStopping

    from sklearn.model_selection import train_test_split,StratifiedKFold
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import StandardScaler,OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    seed = 1
    np.random.seed(seed)
    tf.random.set_seed(seed)

    class Path:
        sample_submission: str = "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv"
        greeks: str = "/kaggle/input/icr-identify-age-related-conditions/greeks.csv"
        train: str = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
        to_predict: str = "/kaggle/input/icr-identify-age-related-conditions/test.csv" #the data that needs to be predicted if it has disease or not
                   

    def get_corr_cols(data:pd.DataFrame,label:str, corr:float) -> list:
        """
            Get columns with high correlations to label
        """
        assert corr <=1, "corr shoud be in [0,1]"
        correlations = data.corrwith(data[label],numeric_only=True).to_frame()
        correlations["abs"] = correlations[0].abs()
        correlations = correlations.sort_values("abs", ascending=False)
        correlations.drop(label,inplace=True)
        train_cols = correlations[correlations["abs"] >= corr].index.to_list() 
        return train_cols
    

    def create_processing_pipeline(numeric_cols, cat_cols) ->Pipeline:
        preprocessor = ColumnTransformer(
                transformers=[
                    ('numeric_scaler',  StandardScaler(), numeric_cols),
                    ('categorical', OneHotEncoder(), cat_cols)
                ])
        preprocess_pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("imputer",KNNImputer())
        ])

        return preprocess_pipeline
    
    def make_datasets(train_x, train_y, valid_x, valid_y):
        ds_train= tf.data.Dataset.from_tensor_slices(
            (train_x,train_y.astype("float64"))).batch(16).shuffle(1000, seed=seed)
        ds_valid= tf.data.Dataset.from_tensor_slices(
            (valid_x,valid_y.astype("float64"))).batch(16).shuffle(1000, seed=seed)
        return ds_train, ds_valid

    class BalancedLogLoss(Loss):
        def call(self, y_true, y_pred):
            # y_true: correct labels 0, 1
            # y_pred: predicted probabilities of class=1
            # Implements the Evaluation equation with w_0 = w_1 = 1.
            # Calculate the number of observations for each class
            zero = tf.constant(1e-15)
            N_0 = tf.clip_by_value(tf.math.reduce_sum(1 - y_true), zero,1e10)
            N_1 = tf.clip_by_value(tf.math.reduce_sum(y_true), zero,1e10)
            # Calculate the predicted probabilities for each class
            p_1 = tf.clip_by_value(y_pred, zero, 1-zero)
            p_0 = tf.clip_by_value(1-p_1, zero, 1-zero)
            # Calculate the average log loss for each class
            log_loss_0 = -tf.math.reduce_sum((1 - y_true) * tf.math.log(p_0)) / N_0 
            log_loss_1 = -tf.math.reduce_sum(y_true * tf.math.log(p_1)) / N_1 
            # return the (not further weighted) average of the averages
            return (log_loss_0 + log_loss_1)/2 

    class MultiMoldel():
        def __init__(self, models:list) -> None:
            self.models = models
            self.average = layers.Average()
        def __call__(self,x):
            outputs = []
            for model in model_dicts:
                submodel_x = model["preprocess_pipeline"].transform(x) #Use the processing pipeline the model was fit with.
                output = model["model"](submodel_x)
                outputs.append(output)
            return self.average(outputs)
    
    train_df, greeks_df, df_to_predict, sample_submission_df = pd.read_csv(Path.train,index_col="Id"), pd.read_csv(Path.greeks,index_col="Id"), pd.read_csv(Path.to_predict,index_col="Id"), pd.read_csv(Path.sample_submission,index_col="Id")
    
    
    cat_cols = [col for col in train_df if train_df[col].dtype == "O"]
    numeric_cols = get_corr_cols(train_df,"Class",0.16)
    input_shape = (len(numeric_cols) + 2, ) # magic number
    
    #Separate target column
    train_x = train_df.iloc[:,:-1]; train_y = train_df.iloc[:,-1]
    
    skf = StratifiedKFold(n_splits=5)
    splits = enumerate(skf.split(train_x, train_y))
    model_dicts = []
    lrs = [5.0119e-04,0.0056,1.0000e-04,5.6234e-05,7.0795e-05]  #The lrs were chosen with a lr callback, and thening the lr. The epochs by ploting learning with those rates. See the original for more details.
    epochs = [108,50,15,15,15]
    for i, (train_index, valid_index) in splits: #The train_ids and valid ids of each split (aka the ids for the fold)
        fold_train_x, fold_train_y, fold_valx, fold_valy = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]
        fold_x_processing_pipeline = create_processing_pipeline(numeric_cols, cat_cols).fit(fold_train_x)
        
        fold_train_x ,fold_valx = fold_x_processing_pipeline.transform(fold_train_x), fold_x_processing_pipeline.transform(fold_valx)
        
        ds_train, ds_valid, = make_datasets(fold_train_x, fold_train_y, fold_valx, fold_valy)
        

        model = Sequential([
            Input(shape=input_shape),
            layers.Lambda(lambda x: tf.expand_dims(x, axis=-1)),

            layers.Conv1D(16,5,padding="same",
                      kernel_initializer='lecun_normal',
                      activation='selu'),
            layers.Conv1D(32,2,padding="same",
                      kernel_initializer='lecun_normal',
                      activation='selu'),
            layers.MaxPool1D(),

            layers.Flatten(),
            layers.AlphaDropout(0.5),
            layers.Dense(128,
                     kernel_initializer='lecun_normal',
                     activation="selu"),
            layers.Dense(1,activation="sigmoid")
        ])


        model.compile(
            optimizer=tf.keras.optimizers.Adam(lrs[i]),
            loss= BalancedLogLoss(),
            metrics="accuracy"
        )

        history = model.fit(
            ds_train,
            validation_data=ds_valid,
            epochs=epochs[i],
            verbose=0
        )
        model_dicts.append(
        {
            "model":model,
            "preprocess_pipeline": fold_x_processing_pipeline,
            "history":history
        })


    multi_model = MultiMoldel([model_dict['model'] for model_dict in model_dicts])
    pred = multi_model(df_to_predict)
    submission = pd.read_csv(Path.sample_submission)
    submission["class_1"] = pred.numpy()
    submission["class_0"] = 1 - pred.numpy()
    submission.to_csv("submission.csv",index=False)
    
    return model_dicts, multi_model, pred, submission


In [2]:
model_dicts, multi_model, pred, submission = create_tf_ffn_preds()



In [3]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.811411,0.188589
1,010ebe33f668,0.811411,0.188589
2,02fa521e1838,0.811411,0.188589
3,040e15f562a2,0.811411,0.188589
4,046e85c7cc7f,0.811411,0.188589
