## Environment Setup

In [None]:
# fundamental packages 
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt 
from matplotlib.ticker import MaxNLocator 
import random
import datetime
import math 
#! pip install colorama
from colorama import Fore, Back, Style 
import gc # garbage collector

# sklearn packages 
from sklearn.model_selection import StratifiedGroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OneHotEncoder
from sklearn.metrics import roc_curve, roc_auc_score

# keras tensorflow packages 
import tensorflow as tf 
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler,EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.utils import plot_model 

# When True, the preprocessing loop further down processes both the 'train'
# and the 'test' parquet files; when False it processes 'train' only.
INFERENCE = True

# Marker so the notebook output shows that the import cell finished.
print('✔️ Libraries Imported!')


✔️ Libraries Imported!


In [None]:
# plot training history
def plot_history(history, *, n_epochs = None, plot_lr = False, title = None, bottom = None, top = None):
    """
    Plot the last `n_epochs` epochs of a training history.

    The training loss is always drawn. The validation loss is drawn as well
    when `history` contains a 'val_loss' entry, and the learning rate is
    drawn on a secondary y-axis when `plot_lr` is true and `history` contains
    a learning-rate entry.

    Parameters
    ----------
    history : dict-like
        Per-epoch metrics keyed by name; must contain 'loss'. `fit_model`
        stores `History.history` objects of this form in `history_list`.
    n_epochs : int or None
        Number of trailing epochs to show; None shows every epoch.
    plot_lr : bool
        Whether to overlay the learning rate.
    title : str or None
        Figure title, if any.
    bottom, top : float or None
        Optional lower / upper limit of the loss axis.

    Returns
    -------
    None. The figure is created on the current matplotlib backend.
    """
    plt.figure(figsize = (15, 6))
    n_recorded = len(history['loss'])
    from_epoch = 0 if n_epochs is None else max(n_recorded - n_epochs, 0)

    # Training loss is always available; validation loss only when the model
    # was fitted with validation data.
    plt.plot(np.arange(from_epoch, n_recorded),
             history['loss'][from_epoch:], label = 'Training loss')
    if 'val_loss' in history:
        plt.plot(np.arange(from_epoch, len(history['val_loss'])),
                 history['val_loss'][from_epoch:], label = 'Validation loss')

    if bottom is not None:
        plt.ylim(bottom = bottom)
    if top is not None:
        plt.ylim(top = top)
    # Epoch numbers are integers, so keep the tick marks on integers too.
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer = True))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc = 'lower left')
    if title is not None:
        plt.title(title)

    if plot_lr:
        # NOTE(review): the key under which Keras logs the learning rate
        # appears to differ between versions ('lr' vs 'learning_rate');
        # accept either one.
        lr_key = next((k for k in ('lr', 'learning_rate') if k in history), None)
        if lr_key is not None:
            lr_axis = plt.gca().twinx()
            lr_axis.plot(np.arange(from_epoch, len(history[lr_key])),
                         np.array(history[lr_key][from_epoch:]),
                         color = 'g', label = 'Learning rate')
            lr_axis.set_ylabel('Learning rate')
            lr_axis.legend(loc = 'upper right')

## Reading and Preprocessing the training data


Here I read the parquet files, which have already been denoised.

I'm going to create a few groups of features:
- Averaged selected features over all statements of a customer
- Minimum and maximum of the selected features over all statements of a customer 
- Selected features taken from the last statement of a customer 

I'm going to use __*one-hot encoding*__ for the categorical features and fill all the missing values with 0, because Keras does not accept NaN as input.

In practice, `.iloc[mask_array, columns]` needs much less RAM than the `groupby` method. To save even more memory, the index of the training dataframe is dropped.

In [None]:
# Columns whose per-customer mean is used as a feature; the preprocessing loop
# selects them from the grouped mean and renames each one to "<name>_avg".
features_avg = ['B_11', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 
                'B_20', 'B_28', 'B_29', 'B_3', 'B_33', 'B_36', 'B_37', 'B_4', 'B_42', 
                'B_5', 'B_8', 'B_9', 'D_102', 'D_103', 'D_105', 'D_111', 'D_112', 'D_113', 
                'D_115', 'D_118', 'D_119', 'D_121', 'D_124', 'D_128', 'D_129', 'D_131', 
                'D_132', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144', 'D_145', 
                'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 
                'D_49', 'D_50', 'D_51', 'D_52', 'D_56', 'D_58', 'D_62', 'D_70', 'D_71', 
                'D_72', 'D_74', 'D_75', 'D_79', 'D_81', 'D_83', 'D_84', 'D_88', 'D_91', 
                'P_2', 'P_3', 'R_1', 'R_10', 'R_11', 'R_13', 'R_18', 'R_19', 'R_2', 'R_26', 
                'R_27', 'R_28', 'R_3', 'S_11', 'S_12', 'S_22', 'S_23', 'S_24', 'S_26', 
                'S_27', 'S_5', 'S_7', 'S_8', ]
# Columns whose per-customer minimum is used as a feature ("<name>_min").
features_min = ['B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_2', 'B_20', 'B_22', 
                'B_24', 'B_27', 'B_28', 'B_29', 'B_3', 'B_33', 'B_36', 'B_4', 'B_42', 
                'B_5', 'B_9', 'D_102', 'D_103', 'D_107', 'D_109', 'D_110', 'D_111', 
                'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_128', 
                'D_129', 'D_132', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144', 
                'D_145', 'D_39', 'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51', 
                'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_70', 
                'D_71', 'D_74', 'D_75', 'D_78', 'D_79', 'D_81', 'D_83', 'D_84', 'D_86', 
                'D_88', 'D_96', 'P_2', 'P_3', 'P_4', 'R_1', 'R_11', 'R_13', 'R_17', 'R_19', 
                'R_2', 'R_27', 'R_28', 'R_4', 'R_5', 'R_8', 'S_11', 'S_12', 'S_23', 'S_25', 
                'S_3', 'S_5', 'S_7', 'S_9', ]
# Columns whose per-customer maximum is used as a feature ("<name>_max").
features_max = ['B_1', 'B_11', 'B_13', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 
                'B_22', 'B_24', 'B_27', 'B_28', 'B_29', 'B_3', 'B_31', 'B_33', 'B_36', 
                'B_4', 'B_42', 'B_5', 'B_7', 'B_9', 'D_102', 'D_103', 'D_105', 'D_109', 
                'D_110', 'D_112', 'D_113', 'D_115', 'D_121', 'D_124', 'D_128', 'D_129', 
                'D_131', 'D_139', 'D_141', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 
                'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_52', 
                'D_53', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_70', 'D_72', 'D_74', 
                'D_75', 'D_79', 'D_81', 'D_83', 'D_84', 'D_88', 'D_89', 'P_2', 'P_3', 
                'R_1', 'R_10', 'R_11', 'R_26', 'R_28', 'R_3', 'R_4', 'R_5', 'R_7', 'R_8', 
                'S_11', 'S_12', 'S_23', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_7', 'S_8', ]
# Columns taken from the row flagged as each customer's last statement
# ("<name>_last").
features_last = ['B_1', 'B_11', 'B_12', 'B_13', 'B_14', 'B_16', 'B_18', 'B_19', 'B_2', 
                 'B_20', 'B_21', 'B_24', 'B_27', 'B_28', 'B_29', 'B_3', 'B_30', 'B_31', 
                 'B_33', 'B_36', 'B_37', 'B_38', 'B_39', 'B_4', 'B_40', 'B_42', 'B_5', 
                 'B_8', 'B_9', 'D_102', 'D_105', 'D_106', 'D_107', 'D_108', 'D_110', 
                 'D_111', 'D_112', 'D_113', 'D_114', 'D_115', 'D_116', 'D_117', 'D_118', 
                 'D_119', 'D_120', 'D_121', 'D_124', 'D_126', 'D_128', 'D_129', 'D_131', 
                 'D_132', 'D_133', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 
                 'D_143', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 
                 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_55', 
                 'D_56', 'D_59', 'D_60', 'D_62', 'D_63', 'D_64', 'D_66', 'D_68', 'D_70', 
                 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_77', 'D_78', 'D_81', 'D_82', 
                 'D_83', 'D_84', 'D_88', 'D_89', 'D_91', 'D_94', 'D_96', 'P_2', 'P_3', 
                 'P_4', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13', 'R_16', 'R_17', 'R_18', 
                 'R_19', 'R_25', 'R_28', 'R_3', 'R_4', 'R_5', 'R_8', 'S_11', 'S_12', 
                 'S_23', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_9', ]
# "_last" columns that are treated as categorical and one-hot encoded; every
# name here is a features_last entry with the "_last" suffix appended.
features_categorical = ['B_30_last', 'B_38_last', 'D_114_last', 'D_116_last',
                        'D_117_last', 'D_120_last', 'D_126_last',
                        'D_63_last', 'D_64_last', 'D_66_last', 'D_68_last']

In [None]:
%%time

# Build one row of engineered features per customer for the training set and,
# when INFERENCE is enabled, for the test set too. After the loop `df` holds
# the last processed split (None for 'train', which is written to disk).
for i in ['train','test'] if INFERENCE else ['train']:
    df = pd.read_parquet(f'../data/{i}.parquet')
    cid = pd.Categorical(df.pop('customer_ID'), ordered = True)
    # A row is flagged as "last statement" when the following row carries a
    # different customer ID (np.roll shifts the IDs up by one, wrapping around).
    # Presumably relies on each customer's rows being contiguous and ordered
    # by date -- verify against the input files.
    last = (cid != np.roll(cid, -1))
    if 'target' in df.columns:
        df.drop(columns = ['target'], inplace = True)
    print('Read', i)
    gc.collect() # release the popped / dropped columns before aggregating

    # Per-customer mean of the selected columns.
    df_avg = (df
                .groupby(cid)
                .mean()[features_avg]
                .rename(columns = {f: f"{f}_avg" for f in features_avg})
                )
    print('Computed avg', i )
    gc.collect()

    # Per-customer maximum of the selected columns.
    df_max = (df
                .groupby(cid)
                .max()[features_max]
                .rename(columns = {f: f"{f}_max" for f in features_max})
                )
    print('Computed max', i)
    gc.collect()

    # Per-customer minimum of the selected columns.
    df_min = (df
              .groupby(cid)
              .min()[features_min]
              .rename(columns={f: f"{f}_min" for f in features_min})
             )
    print('Computed min', i)
    gc.collect()

    # Values of the selected columns on each customer's last statement,
    # indexed by customer ID so they align with the grouped frames above.
    df_last = (df.loc[last, features_last]
               .rename(columns={f: f"{f}_last" for f in features_last})
               .set_index(np.asarray(cid[last]))
              )
    df = None # we no longer need the original data
    print('Computed last', i)

    df_categorical = df_last[features_categorical].astype(object)
    features_not_cat = [f for f in df_last.columns if f not in features_categorical]
    if i == 'train':
        # Fit the encoder on the training data only and persist it to
        # 'ohe.pickle' so the identical encoding is available later.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output`; adjust the keyword if the installed version
        # rejects `sparse`.
        ohe = OneHotEncoder(drop = 'first', sparse = False,
                            dtype = np.float32, handle_unknown = 'ignore')
        ohe.fit(df_categorical)
        with open('ohe.pickle', 'wb') as f:
            pickle.dump(ohe, f)
    # Bug fix: the transform has to run for EVERY split. Previously it was
    # nested under `if i == 'train'`, so the test frame kept its raw
    # categorical columns and ended up with a different column layout
    # than the training frame.
    df_categorical = pd.DataFrame(ohe.transform(df_categorical).astype(np.float16),
                                  index = df_categorical.index).rename(columns = str)
    print('Computed categorical', i)

    df = pd.concat([df_last[features_not_cat], df_categorical, df_avg, df_min, df_max], axis=1)

    # Impute missing values
    df.fillna(value=0, inplace=True)

    del df_avg, df_max, df_min, df_last, df_categorical, cid, last, features_not_cat

    print(f"{i} shape: {df.shape}")
    if i == 'train':
        # Drop the customer-ID index, write the frame to disk and release it
        # so the test split can be processed with the memory freed.
        df.reset_index(drop = True, inplace = True)
        df.to_feather('train_processed.ftr')
        df = None
        gc.collect()


Read train




Computed avg train
Computed max train
Computed min train
Computed last train
Computed categorical train
train shape: (458913, 435)
Read test




Computed avg test
Computed max test
Computed min test
Computed last test
Computed categorical test
test shape: (924621, 407)
CPU times: total: 6min 31s
Wall time: 5min 36s


In [None]:
# Reload the processed training features written by the preprocessing cell.
train = pd.read_feather('train_processed.ftr')
# Labels are read from a separate CSV file.
# NOTE(review): the feather file carries no customer IDs (its index was reset
# before saving), so features and labels can only be matched by row position
# -- confirm that both are in the same customer order.
target = pd.read_csv('../data/train_labels.csv')
print(f'target shape:{target.shape}')

target shape:(458913, 2)


## Model Architecture

For the neural network model, my rule of thumb is to start with one hidden layer and then gradually add more hidden layers. When the network seems to overfit or diverge, I insert a Dropout layer and/or a skip connection that skips two layers. <br><br>
The final model has four hidden layers, and is enriched by a skip connection and a Dropout layer. 

In [None]:
# Initial learning rate: passed to the Adam optimizer in `keras_model`.
LR_START = 0.01
# Model inputs: every column of the processed training frame except the
# label and the customer identifier.
features = [column for column in train.columns
            if column not in ('target', 'customer_ID')]

In [None]:
def keras_model(n_inputs = len(features)):
    """
    Build and compile a feed-forward network with one skip connection.

    Layer stack: Dense(256) -> Dense(64) -> concatenate with the Dense(256)
    output -> Dropout(0.1) -> Dense(16) -> Dense(1, sigmoid).

    Parameters
    ----------
    n_inputs : int
        Width of the input layer. The default is evaluated once, when this
        function is defined, from the module-level `features` list.

    Returns
    -------
    A compiled instance of tensorflow.keras.models.Model (Adam optimizer
    starting at LR_START, binary cross-entropy loss).
    """
    hidden_activation = 'swish'
    l2_strength = 4e-4  # weight of the L2 penalty on the hidden-layer kernels

    def hidden_dense(units):
        # Every hidden layer shares the same activation and L2 penalty.
        return Dense(units,
                     kernel_regularizer = tf.keras.regularizers.l2(l2_strength),
                     activation = hidden_activation)

    inputs = Input(shape = (n_inputs,))

    wide = hidden_dense(256)(inputs)
    narrow = hidden_dense(64)(wide)
    # Skip connection: the 256-unit output is fed forward alongside the
    # 64-unit output.
    merged = Concatenate()([narrow, wide])
    merged = Dropout(0.1)(merged)
    compact = hidden_dense(16)(merged)
    # The output layer carries no kernel regularizer.
    probability = Dense(1, activation = 'sigmoid')(compact)

    model = Model(inputs, probability)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = LR_START),
        loss = tf.keras.losses.BinaryCrossentropy(),
    )
    return model

In [None]:
# Render a diagram of a freshly built model: layer names hidden, tensor
# shapes shown.
plot_model(keras_model(), 
        show_layer_names = False,
        show_shapes = True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


### Cross - Validation 
I'll use a standard validation loop, in which we scale the data and train the model. Because the dataset is imbalanced, we'll use a StratifiedKFold.

In [None]:
# set up the parameters 
ONLY_FIRST_FOLD = False # not read in the code shown here; presumably used by the cross-validation loop -- TODO confirm
EPOCHS_EXPONENTIALDECAY = 100 # epochs trained by fit_model when the exponential-decay schedule is used
VERBOSE = 0 # set to 0 for less output, or to 2 for more output 
LR_END = 1e-5 # learning rate at the end of training 
CYCLES = 1 # number of decay cycles, read by exponential_decay
EPOCHS = 200 # epoch budget used by fit_model when USE_PLATEAU is True
DIAGRAMS = True # not read in the code shown here; presumably toggles plotting -- TODO confirm
USE_PLATEAU = False # set to True for early stopping, or to False for exponential learning rate decay 
BATCH_SIZE = 2048 # batch size passed to model.fit

# Seed the NumPy, Python and TensorFlow random number generators.
np.random.seed(1)
random.seed(1) 
tf.random.set_seed(1)

In [None]:
def exponential_decay(epoch, *, total_epochs = None, cycles = None,
                      lr_start = None, lr_end = None):
    """
    Learning-rate schedule that decays exponentially within each cycle.
    -------------------
    v decays from e^a to 1 in every cycle
    w decays from 1 to 0 in every cycle
    epoch == 0                  -> w = 1 (first epoch of cycle)
    epoch == epochs_per_cycle-1 -> w = 0 (last epoch of cycle)
    higher a                    -> decay starts with a steeper decline

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    total_epochs : int, optional
        Number of epochs the schedule spans. Defaults to
        EPOCHS_EXPONENTIALDECAY, which is the number of epochs `fit_model`
        trains for whenever it installs this schedule, so that the last
        trained epoch really runs at `lr_end`.
    cycles : int, optional
        Number of decay cycles; defaults to CYCLES.
    lr_start, lr_end : float, optional
        Learning rate at the first / last epoch of a cycle; default to
        LR_START / LR_END.

    Returns
    -------
    float
        The learning rate to use for `epoch`.

    The extra parameters are keyword-only on purpose: the function must keep
    accepting exactly one positional argument so that it can still be used
    as a single-argument schedule callback.
    """
    if total_epochs is None:
        total_epochs = EPOCHS_EXPONENTIALDECAY
    if cycles is None:
        cycles = CYCLES
    if lr_start is None:
        lr_start = LR_START
    if lr_end is None:
        lr_end = LR_END

    a = 3
    # Clamp to 1 so that more cycles than epochs cannot cause a modulo by zero.
    epochs_per_cycle = max(total_epochs // cycles, 1)
    epoch_in_cycle = epoch % epochs_per_cycle
    if epochs_per_cycle > 1:
        v = math.exp(a * (1 - epoch_in_cycle / (epochs_per_cycle - 1)))
        w = (v - 1) / (math.exp(a) - 1)
    else:
        # A one-epoch cycle has nothing to decay over: stay at lr_start.
        w = 1
    return w * lr_start + (1 - w) * lr_end


In [None]:
def fit_model(X_tr, y_tr, X_va = None, y_va = None, fold = 0, run = 0):
    """
    Scale the data, train a fresh model and persist the scaler and the model.

    Parameters
    ----------
    X_tr, y_tr :
        Training features and labels.
    X_va, y_va : optional
        Validation features and labels. When given, they are scaled with the
        scaler fitted on the training data and passed to `model.fit` as
        validation data.
    fold : int
        Used in the output names `scaler_{fold}.pickle` and `model_{fold}`.
    run : int
        Currently unused.

    Side effects
    ------------
    - writes `scaler_{fold}.pickle` and saves the model as `model_{fold}`;
    - appends the Keras training history dict to the module-level
      `history_list`;
    - prints the final training loss when no validation data is given.

    Returns None.
    """
    # NOTE(review): declared global but never assigned in this function as it
    # stands; the validation-prediction step appears to be unimplemented.
    global y_va_pred
    gc.collect()

    # Fit the scaler on the training data only, then reuse it for validation.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)

    if X_va is not None:
        X_va = scaler.transform(X_va)
        validation_data = (X_va, y_va)
    else:
        validation_data = None

    # Define the learning rate schedule and, optionally, early stopping.
    es = None
    if USE_PLATEAU and X_va is not None:
        # Reduce the learning rate when val_loss plateaus and stop early.
        epochs = EPOCHS
        lr = ReduceLROnPlateau(monitor = 'val_loss',
                               factor = 0.7,
                               patience = 4,
                               verbose = VERBOSE)
        es = EarlyStopping(monitor = 'val_loss',
                           patience = 12,
                           verbose = 1,
                           mode = 'min',
                           restore_best_weights = True)
        # Bug fix: the callback must be an instance, not the class itself.
        callbacks = [lr, es, tf.keras.callbacks.TerminateOnNaN()]
    else:
        # Use exponential learning rate decay rather than early stopping.
        epochs = EPOCHS_EXPONENTIALDECAY
        lr = LearningRateScheduler(exponential_decay, verbose = 0)
        callbacks = [lr, tf.keras.callbacks.TerminateOnNaN()]

    # Construct and compile the model.
    model = keras_model(X_tr.shape[1])

    # Train the model.
    history = model.fit(X_tr, y_tr,
                        validation_data = validation_data,
                        epochs = epochs,
                        verbose = VERBOSE,
                        batch_size = BATCH_SIZE,
                        shuffle = True,
                        callbacks = callbacks)
    del X_tr, y_tr

    # Persist the fitted scaler and the trained model for this fold.
    with open(f'scaler_{fold}.pickle', 'wb') as f:
        pickle.dump(scaler, f)
    model.save(f'model_{fold}')
    history_list.append(history.history)
    # Drop references that are no longer needed.
    callbacks = es = lr = history = None

    if X_va is None:
        # Bug fix: the inner subscripts use double quotes; reusing the outer
        # quote character inside an f-string is a SyntaxError before
        # Python 3.12.
        print(f"Training loss:{history_list[-1]['loss'][-1]:.4f}")
    # NOTE(review): nothing is reported or scored when validation data is
    # supplied; that branch was an empty stub in the original.
