# Strategy

- Preprocessing
    - RankGauss
    - PCA + Existing Features
    - Variance Threshold (feature selection)
- Model
    - Normal Neural Network
    - Split Neural Network
    - ~~NODE (Neural Oblivious Decision Ensembles)~~
    - TabNet
    - Multi input ResNet
    - ~~Kernel Ridge Regression - Platt Scaling~~
- Learning
    - Pre-train with non-scored label
    - Optimizer: AdamW with weight_decay
    - Label smoothing
- Prediction
    - Ensemble above with weight optimization
    - With clipping

# Library

In [555]:
import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

In [556]:
import sys

sys.path.append("../input/iterative-stratification/iterative-stratification-master")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

sys.path.append("../input/autograd")
import autograd.numpy as np
from autograd import grad

In [557]:
import datetime
import gc
import os
import random
from collections import defaultdict
from time import time
from typing import Optional

# import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow_addons as tfa
import tensorflow_probability as tfp

# import optuna
from scipy.optimize import fsolve, minimize
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.svm import SVC, SVR
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [558]:
# Runtime acceleration switches for TensorFlow.
MIXED_PRECISION = False
XLA_ACCELERATE = True

if MIXED_PRECISION:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision

    # `tpu` is not assigned anywhere before this cell, so a bare `if tpu:`
    # raised NameError as soon as MIXED_PRECISION was switched on.  Look the
    # name up defensively instead: bfloat16 when a truthy `tpu` global exists,
    # float16 otherwise.
    if globals().get("tpu"):
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_bfloat16")
    else:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
    mixed_precision.set_policy(policy)
    print("Mixed precision enabled")

if XLA_ACCELERATE:
    # Turn on XLA JIT compilation for the TensorFlow graph optimizer.
    tf.config.optimizer.set_jit(True)
    print("Accelerated Linear Algebra enabled")

Accelerated Linear Algebra enabled


# Functions

In [559]:
def fix_seed(seed=2020):
    """Seed every random number generator this notebook relies on.

    Sets the PYTHONHASHSEED environment variable and seeds the `random`
    module, `np.random` and TensorFlow with the same value.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Global seed, reused below as `random_state` for the sklearn transformers.
random_seed = 22
fix_seed(seed=random_seed)

In [560]:
# https://www.kaggle.com/c/lish-moa/discussion/189857#1043953

# Prediction clipping thresholds
p_min = 0.001
p_max = 0.999


def logloss(y_true, y_pred):
    """Mean binary cross-entropy over all elements (evaluation metric).

    No label smoothing is applied.  Clipping to [p_min, p_max] is currently
    disabled (see the commented-out line), so a prediction of exactly 0 or 1
    is passed straight into the logarithm.
    """
    # y_pred = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * K.log(y_pred)
    negative_term = (1 - y_true) * K.log(1 - y_pred)
    return -K.mean(positive_term + negative_term)

In [561]:
# [Fast Numpy Log Loss] https://www.kaggle.com/gogo827jz/optimise-blending-weights-4-5x-faster-log-loss
def metric(y_true, y_pred):
    """Column-averaged binary log loss.

    Predictions are clipped to [1e-7, 1 - 1e-7] first.  The log loss is
    computed per column (axis 1) and the per-column values are averaged.
    Both arguments are indexed as 2-D arrays (`a[:, j]`).
    """
    clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
    n_targets = y_pred.shape[1]

    def column_loss(j):
        truth, prob = y_true[:, j], clipped[:, j]
        return -np.mean(truth * np.log(prob) + (1 - truth) * np.log(1 - prob))

    return sum(column_loss(j) for j in range(n_targets)) / n_targets

In [562]:
def blend(size, weights, oof):
    """Weighted sum of the prediction tables stored in `oof`.

    Args:
        size: Shape of the zero-initialised accumulator.
        weights: Sequence indexed by position; entry i scales the i-th entry
            of `oof` in key order.
        oof: Mapping whose values expose a `.values` array.

    Returns:
        The accumulated array of shape `size`.
    """
    blend_ = np.zeros(size)
    for position, name in enumerate(oof.keys()):
        contribution = weights[position] * oof[name].values
        blend_ += contribution
    return blend_

# Load Data

In [563]:
# Raw competition tables read from the Kaggle input directory.
train_df = pd.read_csv("../input/lish-moa/train_features.csv")  # training features
test_df = pd.read_csv("../input/lish-moa/test_features.csv")  # test features
target_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")  # scored targets
non_target_df = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")  # non-scored targets
submit_df = pd.read_csv("../input/lish-moa/sample_submission.csv")  # submission template

In [564]:
# Work on copies so the frames loaded from disk keep their original columns
# (train_df.columns is read again later to find the g-/c- feature names).
train = train_df.copy()
test = test_df.copy()
ss = submit_df.copy()

# Preprocessing

In [565]:
# Integer-encode the two categorical treatment columns.
#
# Plain column assignment replaces the column together with its dtype on
# every pandas version.  `df.loc[:, col] = ...` writes in place on
# pandas >= 2.0, which leaves the string-typed cp_dose column as dtype
# object even though it then holds integers.
train["cp_dose"] = train["cp_dose"].map({"D1": 0, "D2": 1})
test["cp_dose"] = test["cp_dose"].map({"D1": 0, "D2": 1})

train["cp_time"] = train["cp_time"].map({24: 0, 48: 1, 72: 2})
test["cp_time"] = test["cp_time"].map({24: 0, 48: 1, 72: 2})

## cp_type が ctl_vehicle のサンプルは MoA を持たない

そのため、学習から除外する。

In [566]:
# Keep only non-control rows in the features and in both target tables.
# The mask is computed from `train` once, before `train` itself is filtered.
_treated_rows = train["cp_type"] != "ctl_vehicle"

target_df = target_df.loc[_treated_rows].reset_index(drop=True)
non_target_df = non_target_df.loc[_treated_rows].reset_index(drop=True)
train = train.loc[_treated_rows].reset_index(drop=True)

In [567]:
# Remove the cp_type column from both feature tables.
train = train.drop(columns="cp_type")
test = test.drop(columns="cp_type")

In [568]:
# Strip the sig_id column from every table, in place.
for _table in (train, target_df, non_target_df, test, ss):
    del _table["sig_id"]

In [569]:
# train

## Rank Gauss

https://www.kaggle.com/nayuts/moa-pytorch-nn-pca-rankgauss

連続値を特定の範囲の閉域に押し込めて、分布の偏りを解消する方法です。

In [570]:
# Feature groups, identified by their column-name prefix.
g_cols = [name for name in train_df.columns if name.startswith("g-")]
c_cols = [name for name in train_df.columns if name.startswith("c-")]

for col in g_cols + c_cols:
    # One transformer per column, fitted on the training values only and
    # then applied to both train and test.
    transformer = QuantileTransformer(output_distribution="normal", n_quantiles=100, random_state=random_seed)

    train_column = train[col].values.reshape(-1, 1)
    test_column = test[col].values.reshape(-1, 1)

    transformer.fit(train_column)

    # transform() returns a single-column 2-D array; take that column back out.
    train[col] = transformer.transform(train_column)[:, 0]
    test[col] = transformer.transform(test_column)[:, 0]

In [571]:
# train

## PCA features (+ Existing features)

既存のカラムは残したほうがいいのだろうか？？
→ このコンペでは残したほうがいい成績が出ている。

In [572]:
# g-
n_comp = 50
_pca_g_names = [f"pca_G-{i}" for i in range(n_comp)]

# A single PCA is fitted on the stacked train + test g- features.
data = pd.concat([train[g_cols], test[g_cols]])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data)

# Split the projected rows back: train rows come first, test rows last.
train2 = pd.DataFrame(data2[: train.shape[0]], columns=_pca_g_names)
test2 = pd.DataFrame(data2[-test.shape[0] :], columns=_pca_g_names)

# Append the components next to the existing features.
train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [573]:
# c-
n_comp = 15
_pca_c_names = [f"pca_C-{i}" for i in range(n_comp)]

# A single PCA is fitted on the stacked train + test c- features.
data = pd.concat([train[c_cols], test[c_cols]])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data)

# Split the projected rows back: train rows come first, test rows last.
train2 = pd.DataFrame(data2[: train.shape[0]], columns=_pca_c_names)
test2 = pd.DataFrame(data2[-test.shape[0] :], columns=_pca_c_names)

# Append the components next to the existing features.
train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [574]:
# train

In [575]:
# Reduced copies of the feature tables with every raw g-/c- column removed.
_raw_feature_cols = g_cols + c_cols
train_pca = train.drop(columns=_raw_feature_cols)
test_pca = test.drop(columns=_raw_feature_cols)

In [576]:
# train_pca

## Feature Selection using Variance Threshold

分散がしきい値以下の特徴量を捨てます。

In [577]:
var_thresh = VarianceThreshold(threshold=0.5)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows the same way on every version.
data = pd.concat([train, test])
# Columns at positions 0 and 1 (expected to be cp_time and cp_dose) are kept
# out of the variance filter and re-attached below.
data_transformed = var_thresh.fit_transform(data.iloc[:, 2:])

# Split the filtered matrix back: train rows come first, test rows last.
train_transformed = data_transformed[: train.shape[0]]
test_transformed = data_transformed[-test.shape[0] :]


# ignore_index=True with axis=1 renumbers the columns 0..k-1, so the
# resulting frames no longer carry the original feature names.
train = pd.DataFrame(train[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
train = pd.concat([train, pd.DataFrame(train_transformed)], axis=1, ignore_index=True)


test = pd.DataFrame(test[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
test = pd.concat([test, pd.DataFrame(test_transformed)], axis=1, ignore_index=True)

In [578]:
# train

# Create Model

In [579]:
def create_model_simple_nn(num_col, output_dim):
    """Build the plain feed-forward network.

    The model is an input of width `num_col` followed by three identical
    stages of BatchNormalization -> Dropout -> weight-normalised Dense.  The
    last stage has `output_dim` units and a sigmoid activation.

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    # (dropout rate before the dense layer, dense width, activation)
    stages = [
        (0.2, 2048, "relu"),
        (0.5, 1024, "relu"),
        (0.5, output_dim, "sigmoid"),
    ]

    stack = [L.Input(num_col)]
    for drop_rate, width, activation in stages:
        stack.append(L.BatchNormalization())
        stack.append(L.Dropout(drop_rate))
        stack.append(tfa.layers.WeightNormalization(L.Dense(width, activation=activation)))

    return tf.keras.Sequential(stack)

# Create Model - Split Neural Network

https://www.kaggle.com/gogo827jz/split-neural-network-approach-tf-keras

In [580]:
def create_split_nn(num_columns, hidden_units, dropout_rate, output_dim):
    """Build a three-branch network whose branches are concatenated and refined.

    Args:
        num_columns: Width of each of the three inputs.
        hidden_units: Four lists of layer widths: entries 0-2 for the three
            branches, entry 3 for the shared trunk after concatenation.
        dropout_rate: Five rates: entries 0-2 for the branches, entry 3 right
            after the concatenation, entry 4 inside the trunk.
        output_dim: Number of sigmoid outputs.

    Returns:
        An uncompiled `tf.keras.models.Model` with three inputs and one output.
    """

    def weight_norm_dense(width, activation):
        # Dense layer wrapped in weight normalisation.
        return tfa.layers.WeightNormalization(L.Dense(width, activation=activation))

    def dense_stack(tensor, widths, rate):
        # Repeated Dense(elu) -> Dropout -> BatchNormalization.
        for width in widths:
            tensor = weight_norm_dense(width, "elu")(tensor)
            tensor = L.Dropout(rate)(tensor)
            tensor = L.BatchNormalization()(tensor)
        return tensor

    branch_inputs = []
    branch_outputs = []
    for branch in range(3):
        entry = L.Input(shape=(num_columns,))
        normalized = L.BatchNormalization()(entry)
        branch_inputs.append(entry)
        branch_outputs.append(dense_stack(normalized, hidden_units[branch], dropout_rate[branch]))

    merged = L.Concatenate()(branch_outputs)
    merged = L.Dropout(dropout_rate[3])(merged)
    merged = L.BatchNormalization()(merged)
    merged = dense_stack(merged, hidden_units[3], dropout_rate[4])

    out = weight_norm_dense(output_dim, "sigmoid")(merged)

    return tf.keras.models.Model(inputs=branch_inputs, outputs=out)

In [581]:
def create_model_split_nn(num_col, output_dim):
    """Create the split network with this notebook's fixed hyper-parameters.

    Each of the three inputs has width ceil(0.8 * num_col).
    """
    branch_width = int(np.ceil(0.8 * num_col))
    return create_split_nn(
        branch_width,
        hidden_units=[[2048, 512], [1024, 1024], [1024, 512], [512, 512]],
        dropout_rate=[0.4, 0.35, 0.3, 0.3, 0.2],
        output_dim=output_dim,
    )

# Create Model - Multi input ResNet

https://www.kaggle.com/rahulsd91/moa-multi-input-resnet-model

In [582]:
def create_model_resnet(n_features, n_features_2, n_labels):
    """Build the two-input residual-style network.

    Head1 encodes the first input to 256 units.  Head2 consumes the second
    input concatenated with Head1's output and also ends in 256 units.  The
    two 256-wide outputs are averaged and passed to Head3, which ends in a
    sigmoid layer with `n_labels` units.

    Returns:
        An uncompiled `tf.keras.models.Model` with inputs "Input1" and "Input2".
    """

    def wn_dense(width, activation):
        # Dense layer wrapped in weight normalisation.
        return tfa.layers.WeightNormalization(L.Dense(width, activation=activation))

    primary_in = L.Input(shape=(n_features,), name="Input1")
    secondary_in = L.Input(shape=(n_features_2,), name="Input2")

    # Head1: two BatchNorm -> Dropout -> Dense(elu) stages.
    head_1 = tf.keras.Sequential(name="Head1")
    for rate, width in ((0.2, 512), (0.2, 256)):
        head_1.add(L.BatchNormalization())
        head_1.add(L.Dropout(rate))
        head_1.add(wn_dense(width, "elu"))

    encoded = head_1(primary_in)
    encoded_with_extra = L.Concatenate()([secondary_in, encoded])

    # Head2: four stages alternating relu / elu.
    head_2 = tf.keras.Sequential(name="Head2")
    for rate, width, activation in (
        (0.3, 512, "relu"),
        (0.2, 512, "elu"),
        (0.2, 256, "relu"),
        (0.2, 256, "elu"),
    ):
        head_2.add(L.BatchNormalization())
        head_2.add(L.Dropout(rate))
        head_2.add(wn_dense(width, activation))

    refined = head_2(encoded_with_extra)
    averaged = L.Average()([encoded, refined])

    # Head3: two selu layers, then a plain (not weight-normalised) sigmoid Dense.
    head_3 = tf.keras.Sequential(name="Head3")
    head_3.add(L.BatchNormalization())
    head_3.add(wn_dense(256, "selu"))
    head_3.add(L.BatchNormalization())
    head_3.add(L.Dropout(0.2))
    head_3.add(wn_dense(256, "selu"))
    head_3.add(L.BatchNormalization())
    head_3.add(L.Dense(n_labels, activation="sigmoid"))

    output = head_3(averaged)

    return tf.keras.models.Model(inputs=[primary_in, secondary_in], outputs=output)

# Create Model - NODE

Neural Oblivious Decision Ensembles

https://www.kaggle.com/gogo827jz/moa-neural-oblivious-decision-ensembles-tf-keras

In [583]:
@tf.function
def sparsemoid(inputs: tf.Tensor):
    """Piecewise-linear sigmoid surrogate: 0.5 * x + 0.5 clamped to [0, 1]."""
    shifted = 0.5 * inputs + 0.5
    return tf.clip_by_value(shifted, clip_value_min=0.0, clip_value_max=1.0)

In [584]:
@tf.function
def identity(x: tf.Tensor):
    """Return `x` unchanged (used by NODE as the default feature transform)."""
    return x

In [585]:
class ODST(L.Layer):
    """Oblivious differentiable decision tree layer.

    Holds `n_trees` trees of depth `depth`; every tree emits `units` values and
    `call` returns the sum over trees, i.e. a tensor of shape
    [batch_size, units].

    Thresholds and temperatures are set from the first batch the layer sees
    (see `initialize`).
    """

    def __init__(self, n_trees: int = 3, depth: int = 4, units: int = 1, threshold_init_beta: float = 1.0):
        """Store the hyper-parameters; variables are created in `build`.

        Args:
            n_trees: Number of trees in the layer.
            depth: Number of splits per tree (each tree has 2 ** depth leaves).
            units: Number of values emitted per tree.
            threshold_init_beta: Both parameters of the Beta distribution used
                to sample the threshold percentiles in `initialize`.
        """
        super(ODST, self).__init__()
        # Python-side flag: data-dependent initialisation has not run yet.
        self.initialized = False
        self.n_trees = n_trees
        self.depth = depth
        self.units = units
        self.threshold_init_beta = threshold_init_beta

    def build(self, input_shape: tf.TensorShape):
        """Create the layer variables for inputs whose last axis is `input_shape[-1]`."""
        # Feature-selection logits, zeros of shape [in_features, n_trees, depth].
        feature_selection_logits_init = tf.zeros_initializer()
        self.feature_selection_logits = tf.Variable(
            initial_value=feature_selection_logits_init(
                shape=(input_shape[-1], self.n_trees, self.depth), dtype="float32"
            ),
            trainable=True,
            name="feature_selection_logits",
        )

        # Split thresholds, zeros of shape [n_trees, depth]; overwritten in `initialize`.
        feature_thresholds_init = tf.zeros_initializer()
        self.feature_thresholds = tf.Variable(
            initial_value=feature_thresholds_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="feature_thresholds",
        )

        # Temperatures, ones of shape [n_trees, depth]; overwritten in `initialize`.
        log_temperatures_init = tf.ones_initializer()
        self.log_temperatures = tf.Variable(
            initial_value=log_temperatures_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="log_temperatures",
        )

        # Constant table: for every split level and leaf index, the binary digit
        # of the leaf index at that level, stacked with its complement.
        indices = K.arange(0, 2 ** self.depth, 1)
        offsets = 2 ** K.arange(0, self.depth, 1)
        bin_codes = tf.reshape(indices, (1, -1)) // tf.reshape(offsets, (-1, 1)) % 2
        bin_codes_1hot = tf.stack([bin_codes, 1 - bin_codes], axis=-1)
        self.bin_codes_1hot = tf.Variable(
            initial_value=tf.cast(bin_codes_1hot, "float32"), trainable=False, name="bin_codes_1hot"
        )

        # Leaf responses, ones of shape [n_trees, units, 2 ** depth].
        response_init = tf.ones_initializer()
        self.response = tf.Variable(
            initial_value=response_init(shape=(self.n_trees, self.units, 2 ** self.depth), dtype="float32"),
            trainable=True,
            name="response",
        )

    def initialize(self, inputs):
        """Set thresholds and temperatures from the statistics of `inputs`."""
        feature_values = self.feature_values(inputs)

        # initialize feature_thresholds
        # One percentile level per (tree, depth) pair, sampled from a Beta distribution.
        percentiles_q = 100 * tfp.distributions.Beta(self.threshold_init_beta, self.threshold_init_beta).sample(
            [self.n_trees * self.depth]
        )
        flattened_feature_values = tf.map_fn(K.flatten, feature_values)
        # The diagonal pairs the k-th sampled percentile with the k-th flattened feature.
        init_feature_thresholds = tf.linalg.diag_part(
            tfp.stats.percentile(flattened_feature_values, percentiles_q, axis=0)
        )

        self.feature_thresholds.assign(tf.reshape(init_feature_thresholds, self.feature_thresholds.shape))

        # initialize log_temperatures
        # NOTE(review): the value assigned here is the median absolute distance to
        # the threshold itself, not its logarithm, while `call` applies
        # exp(-log_temperatures) -- confirm whether a log was intended.
        self.log_temperatures.assign(
            tfp.stats.percentile(tf.math.abs(feature_values - self.feature_thresholds), 50, axis=0)
        )

    def feature_values(self, inputs: tf.Tensor, training: bool = None):
        """Project `inputs` onto the sparsemax-weighted feature selectors.

        `training` is accepted but not used.
        """
        feature_selectors = tfa.activations.sparsemax(self.feature_selection_logits)
        # ^--[in_features, n_trees, depth]

        feature_values = tf.einsum("bi,ind->bnd", inputs, feature_selectors)
        # ^--[batch_size, n_trees, depth]

        return feature_values

    def call(self, inputs: tf.Tensor, training: bool = None):
        """Evaluate all trees on `inputs` and return the sum of their responses.

        `training` is accepted but not used.
        """
        # Data-dependent initialisation runs once, on the first call.
        if not self.initialized:
            self.initialize(inputs)
            self.initialized = True

        feature_values = self.feature_values(inputs)

        threshold_logits_a = (feature_values - self.feature_thresholds) * tf.math.exp(-self.log_temperatures)

        threshold_logits_b = tf.stack([-threshold_logits_a, threshold_logits_a], axis=-1)
        # ^--[batch_size, n_trees, depth, 2]

        bins = sparsemoid(threshold_logits_b)
        # ^--[batch_size, n_trees, depth, 2], approximately binary

        bin_matches = tf.einsum("btds,dcs->btdc", bins, self.bin_codes_1hot)
        # ^--[batch_size, n_trees, depth, 2 ** depth]

        # Product over the depth axis: weight of each leaf for each sample.
        response_weights = tf.math.reduce_prod(bin_matches, axis=-2)
        # ^-- [batch_size, n_trees, 2 ** depth]

        response = tf.einsum("bnd,ncd->bnc", response_weights, self.response)
        # ^-- [batch_size, n_trees, units]

        # Sum over trees -> [batch_size, units]
        return tf.reduce_sum(response, axis=1)

In [586]:
class NODE(tf.keras.Model):
    """Stack of ODST layers with dense (concatenating) connections.

    The input is batch-normalised and passed through dropout.  Each ODST
    layer's output is then concatenated to its own input, followed by batch
    normalisation and dropout.  A final Dense layer maps to `output_dim` and
    `link` is applied to its output.

    Args:
        units: Output width of every ODST layer.
        n_layers: Number of ODST layers; 0 leaves only the input
            normalisation and the final Dense layer.
        output_dim: Width of the final Dense layer.
        dropout_rate: Rate shared by all dropout layers.
        link: Callable applied to the final Dense output.
        n_trees: Trees per ODST layer.
        depth: Depth of every tree.
        threshold_init_beta: Passed through to every ODST layer.
        feature_column: Optional layer applied to the raw inputs; when None
            the inputs are used as they are.
    """

    def __init__(
        self,
        units: int = 1,
        n_layers: int = 1,
        output_dim=1,
        dropout_rate=0.1,
        link: tf.function = tf.identity,
        n_trees: int = 3,
        depth: int = 4,
        threshold_init_beta: float = 1.0,
        feature_column: Optional[L.DenseFeatures] = None,
    ):
        super(NODE, self).__init__()
        self.units = units
        self.n_layers = n_layers
        self.n_trees = n_trees
        self.depth = depth
        self.threshold_init_beta = threshold_init_beta
        self.feature_column = feature_column
        self.dropout_rate = dropout_rate
        self.output_dim = output_dim

        if feature_column is None:
            self.feature = L.Lambda(identity)
        else:
            self.feature = feature_column

        # One BatchNorm/Dropout pair for the input plus one per ODST layer.
        self.bn = [L.BatchNormalization() for _ in range(n_layers + 1)]
        self.dropout = [L.Dropout(self.dropout_rate) for _ in range(n_layers + 1)]
        self.ensemble = [
            ODST(n_trees=n_trees, depth=depth, units=units, threshold_init_beta=threshold_init_beta)
            for _ in range(n_layers)
        ]

        self.last_layer = L.Dense(self.output_dim)

        self.link = link

    def call(self, inputs, training=None):
        """Run the forward pass and return `link(last_layer(features))`."""
        x = self.feature(inputs)
        x = self.bn[0](x, training=training)
        x = self.dropout[0](x, training=training)

        # A single running tensor replaces the former dict of intermediates.
        # This also keeps the forward pass valid when there are no ODST layers:
        # the old code read the loop variable after the loop, which is unbound
        # for n_layers == 0.
        for i, tree in enumerate(self.ensemble):
            x = tf.concat([x, tree(x)], axis=1)
            x = self.bn[i + 1](x, training=training)
            x = self.dropout[i + 1](x, training=training)

        return self.link(self.last_layer(x))

In [587]:
def create_model_node(output_dim):
    """Create the NODE model with this notebook's fixed hyper-parameters.

    Three ODST layers of 3 trees (depth 6, 128 units each), dropout 0.1 and a
    sigmoid link on an output of width `output_dim`.
    """
    settings = dict(
        units=128,
        n_layers=3,
        output_dim=output_dim,
        dropout_rate=0.1,
        link=tf.keras.activations.sigmoid,
        n_trees=3,
        depth=6,
    )
    return NODE(**settings)

# Create Model - TabNet for MultiLabel

https://www.kaggle.com/gogo827jz/moa-stacked-tabnet-baseline-tensorflow-2-0#Model-Functions

TabNetとは。
https://cloud.google.com/blog/ja/products/ai-machine-learning/ml-model-tabnet-is-easy-to-use-on-cloud-ai-platform

In [588]:
def register_keras_custom_object(cls):
    """Decorator: add `cls` to Keras' custom-object registry under `cls.__name__`.

    Returns `cls` unchanged so it can be stacked on a class or function.
    """
    registry = tf.keras.utils.get_custom_objects()
    registry[cls.__name__] = cls
    return cls

In [589]:
def glu(x, n_units=None):
    """Gated linear unit activation over the last axis.

    The first `n_units` channels are multiplied by the sigmoid of the channels
    from `n_units` onward.  When `n_units` is None the last axis is split at
    its midpoint.
    """
    split_at = tf.shape(x)[-1] // 2 if n_units is None else n_units

    values = x[..., :split_at]
    gates = tf.nn.sigmoid(x[..., split_at:])
    return values * gates

In [590]:
"""
Code replicated from https://github.com/tensorflow/addons/blob/master/tensorflow_addons/activations/sparsemax.py
"""


@register_keras_custom_object
@tf.function
def sparsemax(logits, axis):
    """Sparsemax activation function [1].

    For each batch `i` and class `j` we have
      $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
    [1]: https://arxiv.org/abs/1602.02068

    Args:
        logits: Input tensor.
        axis: Integer, axis along which the sparsemax operation is applied.
            Negative values count from the end.
    Returns:
        Tensor, output of sparsemax transformation. Has the same type and
        shape as `logits`.

    Note:
        No rank check is performed in this copy of the function.
    """
    logits = tf.convert_to_tensor(logits, name="logits")

    # We need its original shape for shape inference.
    shape = logits.get_shape()
    rank = shape.rank
    is_last_axis = (axis == -1) or (axis == rank - 1)

    # Fast path: the target axis is already last, no transpose needed.
    if is_last_axis:
        output = _compute_2d_sparsemax(logits)
        output.set_shape(shape)
        return output

    # If axis is not the last dimension, we have to do a transpose so that we
    # can still perform sparsemax on the last dimension.

    # Swap the requested axis of logits with its last dimension.
    rank_op = tf.rank(logits)
    axis_norm = axis % rank
    logits = _swap_axis(logits, axis_norm, tf.math.subtract(rank_op, 1))

    # Do the actual sparsemax on its last dimension.
    output = _compute_2d_sparsemax(logits)
    # Swap back so the output has the layout of the input.
    output = _swap_axis(output, axis_norm, tf.math.subtract(rank_op, 1))

    # Make shape inference work since transpose may erase its static shape.
    output.set_shape(shape)
    return output

In [591]:
def _swap_axis(logits, dim_index, last_index, **kwargs):
    """Transpose `logits` so that axes `dim_index` and `last_index` trade places.

    The permutation built here has `last_index + 1` entries, so `last_index`
    is expected to be the final axis of `logits`.  Extra keyword arguments are
    forwarded to `tf.transpose`.
    """
    permutation = tf.concat(
        [
            tf.range(dim_index),  # axes before dim_index stay put
            [last_index],
            tf.range(dim_index + 1, last_index),  # axes in between stay put
            [dim_index],
        ],
        0,
    )
    return tf.transpose(logits, permutation, **kwargs)

In [592]:
def _compute_2d_sparsemax(logits):
    """Apply sparsemax along the last axis of `logits`.

    The input is flattened to [obs, dims], processed row by row, and reshaped
    back to the original shape.  Rows where no support element is found or
    whose cumulative sum is NaN are filled with NaN.
    """
    shape_op = tf.shape(logits)
    obs = tf.math.reduce_prod(shape_op[:-1])
    dims = shape_op[-1]

    # In the paper, they call the logits z.
    # The mean(logits) can be subtracted from logits to make the algorithm
    # more numerically stable. The instability in this algorithm comes mostly
    # from the z_cumsum. Subtracting the mean will cause z_cumsum to be close
    # to zero. However, in practice the numerical instability issues are very
    # minor and subtracting the mean causes extra issues with inf and nan
    # input.
    # Reshape to [obs, dims] as it is almost free and means the remaining
    # code doesn't need to worry about the rank.
    z = tf.reshape(logits, [obs, dims])

    # sort z (top_k with k=dims returns every entry in descending order)
    z_sorted, _ = tf.nn.top_k(z, k=dims)

    # calculate k(z)
    z_cumsum = tf.math.cumsum(z_sorted, axis=-1)
    k = tf.range(1, tf.cast(dims, logits.dtype) + 1, dtype=logits.dtype)
    z_check = 1 + k * z_sorted > z_cumsum
    # because the z_check vector is always [1,1,...1,0,0,...0] finding the
    # (index + 1) of the last `1` is the same as just summing the number of 1.
    k_z = tf.math.reduce_sum(tf.cast(z_check, tf.int32), axis=-1)

    # calculate tau(z)
    # If there are inf values or all values are -inf, the k_z will be zero,
    # this is mathematically invalid and will also cause the gather_nd to fail.
    # Prevent this issue for now by setting k_z = 1 if k_z = 0, this is then
    # fixed later (see p_safe) by returning p = nan. This results in the same
    # behavior as softmax.
    k_z_safe = tf.math.maximum(k_z, 1)
    # Per row, pick the cumulative sum at position k_z - 1.
    indices = tf.stack([tf.range(0, obs), tf.reshape(k_z_safe, [-1]) - 1], axis=1)
    tau_sum = tf.gather_nd(z_cumsum, indices)
    tau_z = (tau_sum - 1) / tf.cast(k_z, logits.dtype)

    # calculate p
    p = tf.math.maximum(tf.cast(0, logits.dtype), z - tf.expand_dims(tau_z, -1))
    # If k_z = 0 or if z = nan, then the input is invalid
    p_safe = tf.where(
        tf.expand_dims(
            tf.math.logical_or(tf.math.equal(k_z, 0), tf.math.is_nan(z_cumsum[:, -1])),
            axis=-1,
        ),
        tf.fill([obs, dims], tf.cast(float("nan"), logits.dtype)),
        p,
    )

    # Reshape back to original size
    p_safe = tf.reshape(p_safe, shape_op)
    return p_safe

In [593]:
"""
Code replicated from https://github.com/tensorflow/addons/blob/master/tensorflow_addons/layers/normalizations.py
"""


@register_keras_custom_object
class GroupNormalization(L.Layer):
    """Group normalization layer.

    Group Normalization divides the channels into groups and computes
    within each group the mean and variance for normalization.
    Empirically, its accuracy is more stable than batch norm in a wide
    range of small batch sizes, if learning rate is adjusted linearly
    with batch sizes.

    Relation to Layer Normalization:
    If the number of groups is set to 1, then this operation becomes identical
    to Layer Normalization.

    Relation to Instance Normalization:
    If the number of groups is set to the
    input dimension (number of groups is equal
    to number of channels), then this operation becomes
    identical to Instance Normalization.

    Arguments
        groups: Integer, the number of groups for Group Normalization.
            Can be in the range [1, N] where N is the input dimension.
            The input dimension must be divisible by the number of groups.
            The value -1 is replaced by the input dimension at build time.
        axis: Integer, the axis that should be normalized.
        epsilon: Small float added to variance to avoid dividing by zero.
        center: If True, add offset of `beta` to normalized tensor.
            If False, `beta` is ignored.
        scale: If True, multiply by `gamma`.
            If False, `gamma` is not used.
        beta_initializer: Initializer for the beta weight.
        gamma_initializer: Initializer for the gamma weight.
        beta_regularizer: Optional regularizer for the beta weight.
        gamma_regularizer: Optional regularizer for the gamma weight.
        beta_constraint: Optional constraint for the beta weight.
        gamma_constraint: Optional constraint for the gamma weight.
    Input shape
        Arbitrary. Use the keyword argument `input_shape`
        (tuple of integers, does not include the samples axis)
        when using this layer as the first layer in a model.
    Output shape
        Same shape as input.
    Raises
        ValueError: If `axis` is 0 (at construction), or at build time if the
            normalized axis has no defined size, is smaller than `groups`, or
            is not divisible by `groups`.
    References
        - [Group Normalization](https://arxiv.org/abs/1803.08494)
    """

    def __init__(
        self,
        groups: int = 2,
        axis: int = -1,
        epsilon: float = 1e-3,
        center: bool = True,
        scale: bool = True,
        beta_initializer="zeros",
        gamma_initializer="ones",
        beta_regularizer=None,
        gamma_regularizer=None,
        beta_constraint=None,
        gamma_constraint=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.groups = groups
        self.axis = axis
        self.epsilon = epsilon
        self.center = center
        self.scale = scale
        self.beta_initializer = tf.keras.initializers.get(beta_initializer)
        self.gamma_initializer = tf.keras.initializers.get(gamma_initializer)
        self.beta_regularizer = tf.keras.regularizers.get(beta_regularizer)
        self.gamma_regularizer = tf.keras.regularizers.get(gamma_regularizer)
        self.beta_constraint = tf.keras.constraints.get(beta_constraint)
        self.gamma_constraint = tf.keras.constraints.get(gamma_constraint)
        self._check_axis()

    def build(self, input_shape):
        """Validate `input_shape` and create the gamma / beta weights."""

        self._check_if_input_shape_is_none(input_shape)
        self._set_number_of_groups_for_instance_norm(input_shape)
        self._check_size_of_dimensions(input_shape)
        self._create_input_spec(input_shape)

        self._add_gamma_weight(input_shape)
        self._add_beta_weight(input_shape)
        self.built = True
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Normalize `inputs` group-wise and return a tensor of the same shape."""
        # Training=none is just for compat with batchnorm signature call
        input_shape = K.int_shape(inputs)
        tensor_input_shape = tf.shape(inputs)

        reshaped_inputs, _ = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)

        normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)

        # Undo the grouping reshape.
        outputs = tf.reshape(normalized_inputs, tensor_input_shape)

        return outputs

    def get_config(self):
        """Return the serialisable configuration of the layer."""
        config = {
            "groups": self.groups,
            "axis": self.axis,
            "epsilon": self.epsilon,
            "center": self.center,
            "scale": self.scale,
            "beta_initializer": tf.keras.initializers.serialize(self.beta_initializer),
            "gamma_initializer": tf.keras.initializers.serialize(self.gamma_initializer),
            "beta_regularizer": tf.keras.regularizers.serialize(self.beta_regularizer),
            "gamma_regularizer": tf.keras.regularizers.serialize(self.gamma_regularizer),
            "beta_constraint": tf.keras.constraints.serialize(self.beta_constraint),
            "gamma_constraint": tf.keras.constraints.serialize(self.gamma_constraint),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def compute_output_shape(self, input_shape):
        """The output shape equals the input shape."""
        return input_shape

    def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
        """Split the normalized axis into (groups, channels_per_group).

        Returns the reshaped tensor and the shape tensor used for the reshape.
        """

        group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
        group_shape[self.axis] = input_shape[self.axis] // self.groups
        # The new `groups` axis is inserted right before the (shrunk) channel axis.
        group_shape.insert(self.axis, self.groups)
        group_shape = tf.stack(group_shape)
        reshaped_inputs = tf.reshape(inputs, group_shape)
        return reshaped_inputs, group_shape

    def _apply_normalization(self, reshaped_inputs, input_shape):
        """Normalize over every non-batch axis except the groups axis."""

        group_shape = K.int_shape(reshaped_inputs)
        group_reduction_axes = list(range(1, len(group_shape)))
        # Position of the groups axis inside `group_reduction_axes`
        # (which starts at axis 1, hence the offset).
        axis = -2 if self.axis == -1 else self.axis - 1
        group_reduction_axes.pop(axis)

        mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)

        gamma, beta = self._get_reshaped_weights(input_shape)
        normalized_inputs = tf.nn.batch_normalization(
            reshaped_inputs,
            mean=mean,
            variance=variance,
            scale=gamma,
            offset=beta,
            variance_epsilon=self.epsilon,
        )
        return normalized_inputs

    def _get_reshaped_weights(self, input_shape):
        """Return gamma and beta reshaped for broadcasting (None when disabled)."""
        broadcast_shape = self._create_broadcast_shape(input_shape)
        gamma = None
        beta = None
        if self.scale:
            gamma = tf.reshape(self.gamma, broadcast_shape)

        if self.center:
            beta = tf.reshape(self.beta, broadcast_shape)
        return gamma, beta

    def _check_if_input_shape_is_none(self, input_shape):
        """Raise ValueError when the normalized axis has no static size."""
        dim = input_shape[self.axis]
        if dim is None:
            raise ValueError(
                "Axis " + str(self.axis) + " of "
                "input tensor should have a defined dimension "
                "but the layer received an input with shape " + str(input_shape) + "."
            )

    def _set_number_of_groups_for_instance_norm(self, input_shape):
        """Resolve `groups == -1` to the size of the normalized axis."""
        dim = input_shape[self.axis]

        if self.groups == -1:
            self.groups = dim

    def _check_size_of_dimensions(self, input_shape):
        """Raise ValueError unless the channel count is a multiple of `groups`."""

        dim = input_shape[self.axis]
        if dim < self.groups:
            raise ValueError(
                "Number of groups (" + str(self.groups) + ") cannot be "
                "more than the number of channels (" + str(dim) + ")."
            )

        if dim % self.groups != 0:
            # The previous wording stated the divisibility the wrong way round.
            raise ValueError(
                "Number of channels (" + str(dim) + ") must be a "
                "multiple of the number of groups (" + str(self.groups) + ")."
            )

    def _check_axis(self):
        """Reject normalization over the batch axis."""

        if self.axis == 0:
            raise ValueError(
                "You are trying to normalize your batch axis. Do you want to "
                "use tf.keras.layers.BatchNormalization instead?"
            )

    def _create_input_spec(self, input_shape):
        """Pin the input rank and the size of the normalized axis."""

        dim = input_shape[self.axis]
        self.input_spec = L.InputSpec(ndim=len(input_shape), axes={self.axis: dim})

    def _add_gamma_weight(self, input_shape):
        """Create the per-channel scale weight, or set it to None when `scale` is False."""

        dim = input_shape[self.axis]
        shape = (dim,)

        if self.scale:
            self.gamma = self.add_weight(
                shape=shape,
                name="gamma",
                initializer=self.gamma_initializer,
                regularizer=self.gamma_regularizer,
                constraint=self.gamma_constraint,
            )
        else:
            self.gamma = None

    def _add_beta_weight(self, input_shape):
        """Create the per-channel offset weight, or set it to None when `center` is False."""

        dim = input_shape[self.axis]
        shape = (dim,)

        if self.center:
            self.beta = self.add_weight(
                shape=shape,
                name="beta",
                initializer=self.beta_initializer,
                regularizer=self.beta_regularizer,
                constraint=self.beta_constraint,
            )
        else:
            self.beta = None

    def _create_broadcast_shape(self, input_shape):
        """Shape of ones except (groups, channels_per_group) at the normalized axis."""
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
        broadcast_shape.insert(self.axis, self.groups)
        return broadcast_shape

In [594]:
class TransformBlock(tf.keras.Model):
    """Bias-free dense projection followed by a normalization layer.

    Args:
        features: Number of units of the dense projection.
        norm_type: ``"batch"`` selects ``BatchNormalization``; any other value
            selects ``GroupNormalization``.
        momentum: Momentum passed to the batch-normalization layer.
        virtual_batch_size: Virtual batch size passed to the batch-normalization
            layer.
        groups: Number of groups passed to the group-normalization layer.
        block_name: Suffix appended to the names of the two sub-layers.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, features, norm_type, momentum=0.9, virtual_batch_size=None, groups=2, block_name="", **kwargs):
        super().__init__(**kwargs)

        self.features = features
        self.norm_type = norm_type
        self.momentum = momentum
        self.groups = groups
        self.virtual_batch_size = virtual_batch_size

        dense_name = f"transformblock_dense_{block_name}"
        self.transform = L.Dense(features, use_bias=False, name=dense_name)

        use_batch_norm = norm_type == "batch"
        self.bn = (
            L.BatchNormalization(
                axis=-1,
                momentum=momentum,
                virtual_batch_size=virtual_batch_size,
                name=f"transformblock_bn_{block_name}",
            )
            if use_batch_norm
            else GroupNormalization(axis=-1, groups=groups, name=f"transformblock_gn_{block_name}")
        )

    def call(self, inputs, training=None):
        """Project ``inputs`` and normalize the result."""
        projected = self.transform(inputs)
        return self.bn(projected, training=training)

In [595]:
class TabNet(tf.keras.Model):
    """TabNet encoder built from shared and per-step feature transformers.

    ``call`` returns the sum of the per-step decision outputs, a tensor of shape
    ``(batch, output_dim)``, and registers the sparsity regularizer (mean mask
    entropy scaled by ``sparsity_coefficient``) through ``add_loss``.
    """

    def __init__(
        self,
        feature_columns,
        feature_dim=64,
        output_dim=64,
        num_features=None,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=2,
        epsilon=1e-5,
        **kwargs,
    ):
        """
        Tensorflow 2.0 implementation of [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442)
        # Hyper Parameter Tuning (Excerpt from the paper)
        We consider datasets ranging from ∼10K to ∼10M training points, with varying degrees of fitting
        difficulty. TabNet obtains high performance for all with a few general principles on hyperparameter
        selection:
            - Most datasets yield the best results for Nsteps ∈ [3, 10]. Typically, larger datasets and
            more complex tasks require a larger Nsteps. A very high value of Nsteps may suffer from
            overfitting and yield poor generalization.
            - Adjustment of the values of Nd and Na is the most efficient way of obtaining a trade-off
            between performance and complexity. Nd = Na is a reasonable choice for most datasets. A
            very high value of Nd and Na may suffer from overfitting and yield poor generalization.
            - An optimal choice of γ can have a major role on the overall performance. Typically a larger
            Nsteps value favors for a larger γ.
            - A large batch size is beneficial for performance - if the memory constraints permit, as large
            as 1-10 % of the total training dataset size is suggested. The virtual batch size is typically
            much smaller than the batch size.
            - Initially large learning rate is important, which should be gradually decayed until convergence.
        Args:
            feature_columns: The Tensorflow feature columns for the dataset. May be
                None, in which case `num_features` is required and the inputs are
                used as features directly.
            feature_dim (N_a): Dimensionality of the hidden representation in feature
                transformation block. Each layer first maps the representation to a
                2*feature_dim-dimensional output and half of it is used to determine the
                nonlinearity of the GLU activation where the other half is used as an
                input to GLU, and eventually feature_dim-dimensional output is
                transferred to the next layer.
            output_dim (N_d): Dimensionality of the outputs of each decision step, which is
                later mapped to the final classification or regression output.
            num_features: The number of input features (i.e the number of columns for
                tabular data assuming each feature is represented with 1 dimension).
            num_decision_steps(N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Relaxation factor that promotes the reuse of each
                feature at different decision steps. When it is 1, a feature is enforced
                to be used only at one decision step and as it increases, more
                flexibility is provided to use a feature at multiple decision steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity regularization.
                Sparsity may provide a favorable inductive bias for convergence to
                higher accuracy for some datasets where most of the input features are redundant.
            norm_type: Type of normalization to perform for the model. Can be either
                'batch' or 'group'. 'group' is the default.
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization. The
                overall batch size should be an integer multiple of virtual_batch_size.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy calculations.
        Raises:
            ValueError: If any argument fails the input checks below.
        """
        super().__init__(**kwargs)

        # Input checks
        if feature_columns is not None:
            # isinstance (rather than an exact type comparison) also accepts
            # list/tuple subclasses.
            if not isinstance(feature_columns, (list, tuple)):
                raise ValueError("`feature_columns` must be a list or a tuple.")

            if not feature_columns:
                raise ValueError("`feature_columns` must be contain at least 1 tf.feature_column !")

            if num_features is None:
                num_features = len(feature_columns)

        elif num_features is None:
            raise ValueError("If `feature_columns` is None, then `num_features` cannot be None.")

        # Normalized on both paths so the tensor shapes built in `call` always
        # receive a plain int.
        num_features = int(num_features)

        if num_decision_steps < 1:
            raise ValueError("Num decision steps must be greater than 0.")

        # NOTE(review): equality is accepted although the message says "larger".
        # With feature_dim == output_dim the slice `transform_f4[:, output_dim:]`
        # that feeds the attention mask is presumably empty (assuming `glu`
        # returns `feature_dim` columns) - confirm this is intended before
        # tightening the check, since the defaults rely on equality.
        if feature_dim < output_dim:
            raise ValueError("To compute `features_for_coef`, feature_dim must be larger than output dim")

        feature_dim = int(feature_dim)
        output_dim = int(output_dim)
        num_decision_steps = int(num_decision_steps)
        relaxation_factor = float(relaxation_factor)
        sparsity_coefficient = float(sparsity_coefficient)
        batch_momentum = float(batch_momentum)
        num_groups = max(1, int(num_groups))
        epsilon = float(epsilon)

        if relaxation_factor < 0.0:
            raise ValueError("`relaxation_factor` cannot be negative !")

        if sparsity_coefficient < 0.0:
            raise ValueError("`sparsity_coefficient` cannot be negative !")

        if virtual_batch_size is not None:
            virtual_batch_size = int(virtual_batch_size)

        if norm_type not in ["batch", "group"]:
            raise ValueError("`norm_type` must be either `batch` or `group`")

        self.feature_columns = feature_columns
        self.num_features = num_features
        self.feature_dim = feature_dim
        self.output_dim = output_dim

        self.num_decision_steps = num_decision_steps
        self.relaxation_factor = relaxation_factor
        self.sparsity_coefficient = sparsity_coefficient
        self.norm_type = norm_type
        self.batch_momentum = batch_momentum
        self.virtual_batch_size = virtual_batch_size
        self.num_groups = num_groups
        self.epsilon = epsilon

        if self.feature_columns is not None:
            self.input_features = L.DenseFeatures(feature_columns, trainable=True)

            if self.norm_type == "batch":
                self.input_bn = L.BatchNormalization(axis=-1, momentum=batch_momentum, name="input_bn")
            else:
                self.input_bn = GroupNormalization(axis=-1, groups=self.num_groups, name="input_gn")

        else:
            # Without feature columns the raw inputs are used as features and no
            # input normalization is applied.
            self.input_features = None
            self.input_bn = None

        # Two feature-transformer blocks shared by every decision step.
        self.transform_f1 = TransformBlock(
            2 * self.feature_dim,
            self.norm_type,
            self.batch_momentum,
            self.virtual_batch_size,
            self.num_groups,
            block_name="f1",
        )

        self.transform_f2 = TransformBlock(
            2 * self.feature_dim,
            self.norm_type,
            self.batch_momentum,
            self.virtual_batch_size,
            self.num_groups,
            block_name="f2",
        )

        # Two feature-transformer blocks per decision step.
        self.transform_f3_list = [
            TransformBlock(
                2 * self.feature_dim,
                self.norm_type,
                self.batch_momentum,
                self.virtual_batch_size,
                self.num_groups,
                block_name=f"f3_{i}",
            )
            for i in range(self.num_decision_steps)
        ]

        self.transform_f4_list = [
            TransformBlock(
                2 * self.feature_dim,
                self.norm_type,
                self.batch_momentum,
                self.virtual_batch_size,
                self.num_groups,
                block_name=f"f4_{i}",
            )
            for i in range(self.num_decision_steps)
        ]

        # One mask-producing block per step except the last, which computes no mask.
        self.transform_coef_list = [
            TransformBlock(
                self.num_features,
                self.norm_type,
                self.batch_momentum,
                self.virtual_batch_size,
                self.num_groups,
                block_name=f"coef_{i}",
            )
            for i in range(self.num_decision_steps - 1)
        ]

        self._step_feature_selection_masks = None
        self._step_aggregate_feature_selection_mask = None

    def call(self, inputs, training=None):
        """Run all decision steps and return the aggregated decision output.

        Side effects: stores the per-step masks and the aggregated mask on the
        instance (exposed through the two properties below) and registers the
        sparsity loss with ``add_loss``.

        Args:
            inputs: Model inputs; passed through ``DenseFeatures`` and the input
                normalization when feature columns were given, otherwise used
                as the feature tensor directly.
            training: Forwarded to the normalization layers.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        if self.input_features is not None:
            features = self.input_features(inputs)
            features = self.input_bn(features, training=training)

        else:
            features = inputs

        batch_size = tf.shape(features)[0]
        self._step_feature_selection_masks = []
        self._step_aggregate_feature_selection_mask = None

        # Initializes decision-step dependent variables.
        output_aggregated = tf.zeros([batch_size, self.output_dim])
        masked_features = features
        mask_values = tf.zeros([batch_size, self.num_features])
        aggregated_mask_values = tf.zeros([batch_size, self.num_features])
        complementary_aggregated_mask_values = tf.ones([batch_size, self.num_features])

        total_entropy = 0.0

        # Loop-invariant scale applied after every residual addition.
        residual_scale = tf.math.sqrt(0.5)

        for ni in range(self.num_decision_steps):
            # Feature transformer with two shared and two decision step dependent
            # blocks is used below.
            transform_f1 = self.transform_f1(masked_features, training=training)
            transform_f1 = glu(transform_f1, self.feature_dim)

            transform_f2 = self.transform_f2(transform_f1, training=training)
            transform_f2 = (glu(transform_f2, self.feature_dim) + transform_f1) * residual_scale

            transform_f3 = self.transform_f3_list[ni](transform_f2, training=training)
            transform_f3 = (glu(transform_f3, self.feature_dim) + transform_f2) * residual_scale

            transform_f4 = self.transform_f4_list[ni](transform_f3, training=training)
            transform_f4 = (glu(transform_f4, self.feature_dim) + transform_f3) * residual_scale

            # The first step only produces a mask, unless it is the only step.
            if ni > 0 or self.num_decision_steps == 1:
                decision_out = tf.nn.relu(transform_f4[:, : self.output_dim])

                # Decision aggregation.
                output_aggregated += decision_out

                # Aggregated masks are used for visualization of the
                # feature importance attributes.
                scale_agg = tf.reduce_sum(decision_out, axis=1, keepdims=True)

                if self.num_decision_steps > 1:
                    scale_agg = scale_agg / tf.cast(self.num_decision_steps - 1, tf.float32)

                aggregated_mask_values += mask_values * scale_agg

            features_for_coef = transform_f4[:, self.output_dim :]

            if ni < (self.num_decision_steps - 1):
                # Determines the feature masks via linear and nonlinear
                # transformations, taking into account of aggregated feature use.
                mask_values = self.transform_coef_list[ni](features_for_coef, training=training)
                mask_values *= complementary_aggregated_mask_values
                mask_values = sparsemax(mask_values, axis=-1)

                # Relaxation factor controls the amount of reuse of features between
                # different decision blocks and updated with the values of
                # coefficients.
                complementary_aggregated_mask_values *= self.relaxation_factor - mask_values

                # Entropy is used to penalize the amount of sparsity in feature
                # selection.
                total_entropy += tf.reduce_mean(
                    tf.reduce_sum(-mask_values * tf.math.log(mask_values + self.epsilon), axis=1)
                ) / (tf.cast(self.num_decision_steps - 1, tf.float32))

                # Feature selection.
                masked_features = tf.multiply(mask_values, features)

                # Keep the mask of decision step ni (expanded to 4-D) for later
                # inspection.
                mask_at_step_i = tf.expand_dims(tf.expand_dims(mask_values, 0), 3)
                self._step_feature_selection_masks.append(mask_at_step_i)

        # Adds the sparsity loss automatically. The running sum is registered
        # directly: previously a separate variable was reset to 0.0 by the final
        # (mask-free) step, so the regularizer was always zero. With a single
        # decision step `total_entropy` simply stays 0.0.
        self.add_loss(self.sparsity_coefficient * total_entropy)

        # Aggregated feature importances, expanded to 4-D like the per-step masks.
        agg_mask = tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3)
        self._step_aggregate_feature_selection_mask = agg_mask

        return output_aggregated

    @property
    def feature_selection_masks(self):
        """Per-step masks stored by the most recent ``call`` (None before any call)."""
        return self._step_feature_selection_masks

    @property
    def aggregate_feature_selection_mask(self):
        """Aggregated mask stored by the most recent ``call`` (None before any call)."""
        return self._step_aggregate_feature_selection_mask

In [596]:
class TabNetClassifier(tf.keras.Model):
    def __init__(
        self,
        feature_columns,
        num_classes,
        num_features=None,
        feature_dim=64,
        output_dim=64,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=1,
        epsilon=1e-5,
        multi_label=False,
        **kwargs
    ):
        """
        TabNet encoder followed by a bias-free dense classification head.

        Tensorflow 2.0 implementation of [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442)

        # Hyper Parameter Tuning (summarized from the paper)
            - Nsteps ∈ [3, 10] works best for most datasets; larger datasets and more
            complex tasks tend to need more steps, while very large values may overfit.
            - Nd and Na are the most efficient knobs for trading performance against
            complexity. Nd = Na is a reasonable choice; very large values may overfit.
            - The choice of γ can matter a lot; a larger Nsteps typically favors a larger γ.
            - Large batches help: 1-10 % of the training set is suggested if memory
            permits. The virtual batch size is typically much smaller than the batch size.
            - Start with a large learning rate and decay it gradually until convergence.

        Args:
            feature_columns: The Tensorflow feature columns for the dataset.
            num_classes: Number of classes, i.e. units of the classification head.
            num_features: The number of input features (i.e the number of columns for
                tabular data assuming each feature is represented with 1 dimension).
            feature_dim (N_a): Dimensionality of the hidden representation in feature
                transformation block. Each layer first maps the representation to a
                2*feature_dim-dimensional output; half of it gates the GLU activation,
                the other half is the GLU input, and a feature_dim-dimensional output
                is passed to the next layer.
            output_dim (N_d): Dimensionality of the outputs of each decision step, which is
                later mapped to the final classification output.
            num_decision_steps (N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Promotes the reuse of each feature at different
                decision steps. At 1 a feature is enforced to be used at only one
                step; larger values allow use at multiple steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity regularization.
            norm_type: Type of normalization to perform for the model. Can be either
                'batch' or 'group'. 'group' is the default.
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization. The
                overall batch size should be an integer multiple of virtual_batch_size.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy calculations.
            multi_label: If truthy the head uses a sigmoid activation, otherwise softmax.
            **kwargs: Forwarded both to ``tf.keras.Model`` and to the inner ``TabNet``.
        """
        super().__init__(**kwargs)

        self.num_classes = num_classes

        encoder_config = dict(
            feature_columns=feature_columns,
            num_features=num_features,
            feature_dim=feature_dim,
            output_dim=output_dim,
            num_decision_steps=num_decision_steps,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            norm_type=norm_type,
            batch_momentum=batch_momentum,
            virtual_batch_size=virtual_batch_size,
            num_groups=num_groups,
            epsilon=epsilon,
        )
        self.tabnet = TabNet(**encoder_config, **kwargs)

        # Only the activation differs between the multi-label and single-label heads.
        head_activation = "sigmoid" if multi_label else "softmax"
        self.clf = L.Dense(num_classes, activation=head_activation, use_bias=False, name="classifier")

    def call(self, inputs, training=None):
        """Encode ``inputs`` with TabNet and apply the classification head.

        The encoder output is kept on ``self.activations``.
        """
        encoded = self.tabnet(inputs, training=training)
        self.activations = encoded
        return self.clf(encoded)

    def summary(self, *super_args, **super_kwargs):
        """Print this model's summary, then the inner TabNet's, with the same arguments."""
        for print_summary in (super().summary, self.tabnet.summary):
            print_summary(*super_args, **super_kwargs)

In [597]:
class TabNetRegressor(tf.keras.Model):
    def __init__(
        self,
        feature_columns,
        num_regressors,
        num_features=None,
        feature_dim=64,
        output_dim=64,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=1,
        epsilon=1e-5,
        **kwargs
    ):
        """
        TabNet encoder followed by a bias-free, activation-free dense regression head.

        Tensorflow 2.0 implementation of [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442)

        # Hyper Parameter Tuning (summarized from the paper)
            - Nsteps ∈ [3, 10] works best for most datasets; larger datasets and more
            complex tasks tend to need more steps, while very large values may overfit.
            - Nd and Na are the most efficient knobs for trading performance against
            complexity. Nd = Na is a reasonable choice; very large values may overfit.
            - The choice of γ can matter a lot; a larger Nsteps typically favors a larger γ.
            - Large batches help: 1-10 % of the training set is suggested if memory
            permits. The virtual batch size is typically much smaller than the batch size.
            - Start with a large learning rate and decay it gradually until convergence.

        Args:
            feature_columns: The Tensorflow feature columns for the dataset.
            num_regressors: Number of regression variables, i.e. units of the head.
            num_features: The number of input features (i.e the number of columns for
                tabular data assuming each feature is represented with 1 dimension).
            feature_dim (N_a): Dimensionality of the hidden representation in feature
                transformation block. Each layer first maps the representation to a
                2*feature_dim-dimensional output; half of it gates the GLU activation,
                the other half is the GLU input, and a feature_dim-dimensional output
                is passed to the next layer.
            output_dim (N_d): Dimensionality of the outputs of each decision step, which is
                later mapped to the final regression output.
            num_decision_steps (N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Promotes the reuse of each feature at different
                decision steps. At 1 a feature is enforced to be used at only one
                step; larger values allow use at multiple steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity regularization.
            norm_type: Type of normalization to perform for the model. Can be either
                'batch' or 'group'. 'group' is the default.
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization. The
                overall batch size should be an integer multiple of virtual_batch_size.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy calculations.
            **kwargs: Forwarded both to ``tf.keras.Model`` and to the inner ``TabNet``.
        """
        super().__init__(**kwargs)

        self.num_regressors = num_regressors

        encoder_settings = {
            "feature_columns": feature_columns,
            "num_features": num_features,
            "feature_dim": feature_dim,
            "output_dim": output_dim,
            "num_decision_steps": num_decision_steps,
            "relaxation_factor": relaxation_factor,
            "sparsity_coefficient": sparsity_coefficient,
            "norm_type": norm_type,
            "batch_momentum": batch_momentum,
            "virtual_batch_size": virtual_batch_size,
            "num_groups": num_groups,
            "epsilon": epsilon,
        }
        self.tabnet = TabNet(**encoder_settings, **kwargs)

        self.regressor = L.Dense(num_regressors, use_bias=False, name="regressor")

    def call(self, inputs, training=None):
        """Encode ``inputs`` with TabNet and apply the regression head.

        The encoder output is kept on ``self.activations``.
        """
        encoded = self.tabnet(inputs, training=training)
        self.activations = encoded
        return self.regressor(encoded)

    def summary(self, *super_args, **super_kwargs):
        """Print this model's summary, then the inner TabNet's, with the same arguments."""
        super().summary(*super_args, **super_kwargs)
        inner = self.tabnet
        inner.summary(*super_args, **super_kwargs)

In [598]:
# Aliases: alternative names bound to the very same classes as TabNetClassifier
# and TabNetRegressor, so either spelling can be used interchangeably.
TabNetClassification = TabNetClassifier
TabNetRegression = TabNetRegressor

In [599]:
class StackedTabNet(tf.keras.Model):
    """Chain of ``TabNet`` models; each later layer consumes the previous layer's output."""

    def __init__(
        self,
        feature_columns,
        num_layers=1,
        feature_dim=64,
        output_dim=64,
        num_features=None,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=2,
        epsilon=1e-5,
        **kwargs
    ):
        """
        Tensorflow 2.0 implementation of [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442)
        Stacked variant of the TabNet model, which stacks multiple TabNets into a singular model.
        # Hyper Parameter Tuning (Excerpt from the paper)
        We consider datasets ranging from ∼10K to ∼10M training points, with varying degrees of fitting
        difficulty. TabNet obtains high performance for all with a few general principles on hyperparameter
        selection:
            - Most datasets yield the best results for Nsteps ∈ [3, 10]. Typically, larger datasets and
            more complex tasks require a larger Nsteps. A very high value of Nsteps may suffer from
            overfitting and yield poor generalization.
            - Adjustment of the values of Nd and Na is the most efficient way of obtaining a trade-off
            between performance and complexity. Nd = Na is a reasonable choice for most datasets. A
            very high value of Nd and Na may suffer from overfitting and yield poor generalization.
            - An optimal choice of γ can have a major role on the overall performance. Typically a larger
            Nsteps value favors for a larger γ.
            - A large batch size is beneficial for performance - if the memory constraints permit, as large
            as 1-10 % of the total training dataset size is suggested. The virtual batch size is typically
            much smaller than the batch size.
            - Initially large learning rate is important, which should be gradually decayed until convergence.
        Args:
            feature_columns: The Tensorflow feature columns for the dataset. Only the
                first layer receives them; later layers are built without feature columns.
            num_layers: Number of TabNets to stack together.
            feature_dim (N_a): Dimensionality of the hidden representation in feature
                transformation block. Each layer first maps the representation to a
                2*feature_dim-dimensional output and half of it is used to determine the
                nonlinearity of the GLU activation where the other half is used as an
                input to GLU, and eventually feature_dim-dimensional output is
                transferred to the next layer. Can be either a single int, or a list of
                integers. If a list, must be of same length as the number of layers.
            output_dim (N_d): Dimensionality of the outputs of each decision step, which is
                later mapped to the final classification or regression output.
                Can be either a single int, or a list of
                integers. If a list, must be of same length as the number of layers.
            num_features: The number of input features (i.e the number of columns for
                tabular data assuming each feature is represented with 1 dimension).
                Applies to the first layer; each later layer uses the previous
                layer's `output_dim` instead.
            num_decision_steps(N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Relaxation factor that promotes the reuse of each
                feature at different decision steps. When it is 1, a feature is enforced
                to be used only at one decision step and as it increases, more
                flexibility is provided to use a feature at multiple decision steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity regularization.
                Sparsity may provide a favorable inductive bias for convergence to
                higher accuracy for some datasets where most of the input features are redundant.
            norm_type: Type of normalization to perform for the model. Can be either
                'batch' or 'group'. 'group' is the default.
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization. The
                overall batch size should be an integer multiple of virtual_batch_size.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy calculations.
        Raises:
            ValueError: If `num_layers` is below 1, or a list-valued `feature_dim` /
                `output_dim` does not have exactly `num_layers` entries.
        """
        super().__init__(**kwargs)

        if num_layers < 1:
            raise ValueError("`num_layers` cannot be less than 1")

        # isinstance (rather than an exact type comparison) so that list/tuple
        # subclasses are recognized as per-layer sequences instead of being
        # replicated as if they were scalars.
        if not isinstance(feature_dim, (list, tuple)):
            feature_dim = [feature_dim] * num_layers

        if not isinstance(output_dim, (list, tuple)):
            output_dim = [output_dim] * num_layers

        if len(feature_dim) != num_layers:
            raise ValueError("`feature_dim` must be a list of length `num_layers`")

        if len(output_dim) != num_layers:
            raise ValueError("`output_dim` must be a list of length `num_layers`")

        self.num_layers = num_layers

        layers = []
        for layer_idx in range(num_layers):
            is_first = layer_idx == 0
            layers.append(
                TabNet(
                    # Only the first layer reads the raw inputs; every later
                    # layer takes the previous layer's output as its features.
                    feature_columns=feature_columns if is_first else None,
                    num_features=num_features if is_first else output_dim[layer_idx - 1],
                    feature_dim=feature_dim[layer_idx],
                    output_dim=output_dim[layer_idx],
                    num_decision_steps=num_decision_steps,
                    relaxation_factor=relaxation_factor,
                    sparsity_coefficient=sparsity_coefficient,
                    norm_type=norm_type,
                    batch_momentum=batch_momentum,
                    virtual_batch_size=virtual_batch_size,
                    num_groups=num_groups,
                    epsilon=epsilon,
                )
            )

        self.tabnet_layers = layers

    def call(self, inputs, training=None):
        """Feed ``inputs`` through every stacked TabNet in order and return the last output."""
        x = inputs
        for tabnet in self.tabnet_layers:
            x = tabnet(x, training=training)

        return x

    @property
    def tabnets(self):
        """The stacked ``TabNet`` layers, in application order."""
        return self.tabnet_layers

    @property
    def feature_selection_masks(self):
        """One entry per layer: that layer's ``feature_selection_masks``."""
        return [tabnet.feature_selection_masks for tabnet in self.tabnet_layers]

    @property
    def aggregate_feature_selection_mask(self):
        """One entry per layer: that layer's ``aggregate_feature_selection_mask``."""
        return [tabnet.aggregate_feature_selection_mask for tabnet in self.tabnet_layers]

In [600]:
class StackedTabNetClassifier(tf.keras.Model):
    def __init__(
        self,
        feature_columns,
        num_classes,
        num_layers=1,
        feature_dim=64,
        output_dim=64,
        num_features=None,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=2,
        epsilon=1e-5,
        multi_label=False,
        **kwargs
    ):
        """
        Classifier built from a stack of TabNets followed by a bias-free Dense head.

        Tensorflow 2.0 implementation of
        [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442),
        in its stacked variant: several TabNets are chained into a single model.

        # Hyper Parameter Tuning (Excerpt from the paper)
        The paper considers datasets from ∼10K to ∼10M training points of varying
        difficulty and reports high performance with these general principles:
            - Nsteps ∈ [3, 10] works best for most datasets. Larger datasets and more
              complex tasks typically need a larger Nsteps; a very high Nsteps may
              overfit and generalize poorly.
            - Tuning Nd and Na is the most efficient way to trade performance against
              complexity. Nd = Na is a reasonable choice for most datasets; very high
              values may overfit and generalize poorly.
            - The choice of γ can have a major role on overall performance. A larger
              Nsteps typically favors a larger γ.
            - A large batch size is beneficial - if memory permits, as large as 1-10 %
              of the training set is suggested. The virtual batch size is typically
              much smaller than the batch size.
            - An initially large learning rate is important, gradually decayed until
              convergence.

        Args:
            feature_columns: The Tensorflow feature columns for the dataset.
            num_classes: Number of classes (units of the output Dense layer).
            num_layers: Number of TabNets to stack together.
            feature_dim (N_a): Dimensionality of the hidden representation in the
                feature transformation block. Each layer first maps the
                representation to a 2*feature_dim-dimensional output; half of it
                determines the nonlinearity of the GLU activation and the other
                half is the GLU input, and a feature_dim-dimensional output is
                passed to the next layer. Either a single int or a list of ints;
                a list must have one entry per layer.
            output_dim (N_d): Dimensionality of the outputs of each decision step,
                later mapped to the final classification or regression output.
                Either a single int or a list of ints; a list must have one entry
                per layer.
            num_features: The number of input features (i.e. the number of columns
                for tabular data assuming each feature is 1-dimensional).
            num_decision_steps (N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Promotes the reuse of each feature at
                different decision steps. At 1 a feature is enforced to be used at
                only one decision step; larger values allow a feature to be used
                at multiple decision steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity
                regularization. Sparsity may provide a favorable inductive bias
                for datasets where most input features are redundant.
            norm_type: Normalization used by the model, 'batch' or 'group'
                ('group' is the default).
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization.
                The overall batch size should be an integer multiple of it.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy
                calculations.
            multi_label: When True the output layer uses a sigmoid activation;
                otherwise it uses softmax.
            **kwargs: Forwarded to the ``tf.keras.Model`` constructor.
        """
        super().__init__(**kwargs)

        self.num_classes = num_classes

        tabnet_config = dict(
            feature_columns=feature_columns,
            num_layers=num_layers,
            feature_dim=feature_dim,
            output_dim=output_dim,
            num_features=num_features,
            num_decision_steps=num_decision_steps,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            norm_type=norm_type,
            batch_momentum=batch_momentum,
            virtual_batch_size=virtual_batch_size,
            num_groups=num_groups,
            epsilon=epsilon,
        )
        self.stacked_tabnet = StackedTabNet(**tabnet_config)

        # Only the activation of the output layer depends on the task type.
        head_activation = "sigmoid" if multi_label else "softmax"
        self.clf = L.Dense(num_classes, activation=head_activation, use_bias=False)

    def call(self, inputs, training=None):
        """Return class scores for ``inputs``.

        The output of the TabNet stack is kept on ``self.activations`` before it is
        passed through the output layer.
        """
        self.activations = self.stacked_tabnet(inputs, training=training)
        return self.clf(self.activations)

In [601]:
class StackedTabNetRegressor(tf.keras.Model):
    def __init__(
        self,
        feature_columns,
        num_regressors,
        num_layers=1,
        feature_dim=64,
        output_dim=64,
        num_features=None,
        num_decision_steps=5,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-5,
        norm_type="group",
        batch_momentum=0.98,
        virtual_batch_size=None,
        num_groups=2,
        epsilon=1e-5,
        **kwargs
    ):
        """
        Tensorflow 2.0 implementation of [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442)
        Stacked variant of the TabNet model, which stacks multiple TabNets into a singular model,
        followed by a bias-free linear Dense layer with ``num_regressors`` outputs.
        # Hyper Parameter Tuning (Excerpt from the paper)
        We consider datasets ranging from ∼10K to ∼10M training points, with varying degrees of fitting
        difficulty. TabNet obtains high performance for all with a few general principles on hyperparameter
        selection:
            - Most datasets yield the best results for Nsteps ∈ [3, 10]. Typically, larger datasets and
            more complex tasks require a larger Nsteps. A very high value of Nsteps may suffer from
            overfitting and yield poor generalization.
            - Adjustment of the values of Nd and Na is the most efficient way of obtaining a trade-off
            between performance and complexity. Nd = Na is a reasonable choice for most datasets. A
            very high value of Nd and Na may suffer from overfitting and yield poor generalization.
            - An optimal choice of γ can have a major role on the overall performance. Typically a larger
            Nsteps value favors for a larger γ.
            - A large batch size is beneficial for performance - if the memory constraints permit, as large
            as 1-10 % of the total training dataset size is suggested. The virtual batch size is typically
            much smaller than the batch size.
            - Initially large learning rate is important, which should be gradually decayed until convergence.
        Args:
            feature_columns: The Tensorflow feature columns for the dataset.
            num_regressors: Number of regressors.
            num_layers: Number of TabNets to stack together.
            feature_dim (N_a): Dimensionality of the hidden representation in feature
                transformation block. Each layer first maps the representation to a
                2*feature_dim-dimensional output and half of it is used to determine the
                nonlinearity of the GLU activation where the other half is used as an
                input to GLU, and eventually feature_dim-dimensional output is
                transferred to the next layer. Can be either a single int, or a list of
                integers. If a list, must be of same length as the number of layers.
            output_dim (N_d): Dimensionality of the outputs of each decision step, which is
                later mapped to the final classification or regression output.
                Can be either a single int, or a list of
                integers. If a list, must be of same length as the number of layers.
            num_features: The number of input features (i.e the number of columns for
                tabular data assuming each feature is represented with 1 dimension).
            num_decision_steps(N_steps): Number of sequential decision steps.
            relaxation_factor (gamma): Relaxation factor that promotes the reuse of each
                feature at different decision steps. When it is 1, a feature is enforced
                to be used only at one decision step and as it increases, more
                flexibility is provided to use a feature at multiple decision steps.
            sparsity_coefficient (lambda_sparse): Strength of the sparsity regularization.
                Sparsity may provide a favorable inductive bias for convergence to
                higher accuracy for some datasets where most of the input features are redundant.
            norm_type: Type of normalization to perform for the model. Can be either
                'batch' or 'group'. 'group' is the default.
            batch_momentum: Momentum in ghost batch normalization.
            virtual_batch_size: Virtual batch size in ghost batch normalization. The
                overall batch size should be an integer multiple of virtual_batch_size.
            num_groups: Number of groups used for group normalization.
            epsilon: A small number for numerical stability of the entropy calculations.
            **kwargs: Forwarded to the ``tf.keras.Model`` constructor.
        """
        super(StackedTabNetRegressor, self).__init__(**kwargs)

        self.num_regressors = num_regressors

        self.stacked_tabnet = StackedTabNet(
            feature_columns=feature_columns,
            num_layers=num_layers,
            feature_dim=feature_dim,
            output_dim=output_dim,
            num_features=num_features,
            num_decision_steps=num_decision_steps,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            norm_type=norm_type,
            batch_momentum=batch_momentum,
            virtual_batch_size=virtual_batch_size,
            num_groups=num_groups,
            epsilon=epsilon,
        )

        self.regressor = L.Dense(num_regressors, use_bias=False)

    def call(self, inputs, training=None):
        """Return the regression outputs for ``inputs``.

        The output of the TabNet stack is kept on ``self.activations`` before it is
        passed through the linear output layer.
        """
        # The stack is stored as ``stacked_tabnet`` in __init__; no ``tabnet``
        # attribute is ever assigned, so it must not be read here.
        self.activations = self.stacked_tabnet(inputs, training=training)
        out = self.regressor(self.activations)
        return out

In [602]:
def create_model_tabnet(num_col, output_dim):
    """Build the multi-label stacked TabNet classifier used in this notebook.

    Args:
        num_col: Number of input features (passed as ``num_features``).
        output_dim: Number of output labels (passed as ``num_classes``).

    Returns:
        An uncompiled ``StackedTabNetClassifier`` with two stacked TabNets,
        1024-wide feature/output dimensions, one decision step and a sigmoid
        (``multi_label=True``) head.
    """
    settings = {
        "feature_columns": None,
        "num_classes": output_dim,
        "num_layers": 2,
        "feature_dim": 1024,
        "output_dim": 1024,
        "num_features": num_col,
        "num_decision_steps": 1,
        "relaxation_factor": 1.5,
        "sparsity_coefficient": 0,
        "batch_momentum": 0.98,
        "virtual_batch_size": None,
        "norm_type": "group",
        # NOTE(review): the meaning of -1 depends on the group-norm layer used by
        # TabNet, which is defined elsewhere - confirm.
        "num_groups": -1,
        "multi_label": True,
    }
    return StackedTabNetClassifier(**settings)

# Learning

In [603]:
# Pool of model names; learning() assigns models[seed % len(models)] to repeat `seed`.
# models = ["SimpleNN", "ResNet", "SplitNN", "TabNet"]
models = ["ResNet", "TabNet", "KernelRidge", "SVM"]

In [604]:
# N_STARTS: number of repeats (seeds) that learning() iterates over.
# N_SPLITS: number of folds used by every K-fold split in this notebook.
# N_STARTS = len(models) * 2
N_STARTS = 7
N_SPLITS = 10

In [605]:
# Model names that learning() trains when do_predict is False (the pre-training
# pass); repeats assigned any other model name are skipped in that pass.
pre_train_models = ["SimpleNN", "ResNet"]

In [606]:
def learning(target, N_STARTS, N_SPLITS, do_predict=False, do_transfer_learning=False):
    """Train one model per repeat with K-fold CV and collect OOF / test predictions.

    Repeat ``seed`` uses the model named ``models[seed % len(models)]``. Each
    repeat splits ``target`` with ``MultilabelStratifiedKFold``, fits a fresh
    model per fold, and accumulates the validation predictions into an
    out-of-fold frame. Relies on module-level data and helpers defined elsewhere
    in the notebook (``train``, ``train_pca``, ``test``, ``test_pca``, ``ss``,
    ``non_target_df``, ``random_seed``, ``fix_seed``, ``metric``, ``logloss`` and
    the ``create_model_*`` factories).

    Args:
        target: Label frame to fit (used via ``.copy()``, ``.loc``, ``.columns``
            and ``.astype(float).values``; presumably a pandas DataFrame with a
            default integer index - TODO confirm).
        N_STARTS: Number of repeats.
        N_SPLITS: Number of CV folds per repeat.
        do_predict: If True, every repeat is run, the fold seed is offset by
            ``random_seed`` and test predictions (averaged over folds) are
            produced. If False, only models in ``pre_train_models`` are trained
            and the test frame stays at zero.
        do_transfer_learning: If True, SimpleNN / ResNet models load the weights
            stored at the fold's checkpoint path into a base model (sized for
            ``non_target_df``) and copy all but its last layer into the new model
            before fitting.

    Returns:
        Tuple ``(oof, predictions)`` of dicts keyed by ``f"{model_name}_{seed}"``.
        ``oof`` holds the out-of-fold predictions shaped like ``target``;
        ``predictions`` holds the test predictions shaped like ``ss``. Repeats
        assigned "SVM" are not trained here, so their frames keep the zero
        initialisation.

    Raises:
        ValueError: If a repeat is assigned a model name that is not handled.
    """
    oof = {}
    predictions = {}

    # The float label matrix is identical for every repeat and fold: build it once.
    labels = target.astype(float).values

    for seed in range(N_STARTS):
        model_name = models[seed % len(models)]

        # Pre-training pass: skip models that are not pre-trained.
        if not do_predict and model_name not in pre_train_models:
            continue

        # Zero-initialised accumulators for this repeat.
        seed_result = target.copy()
        seed_result.loc[:, target.columns] = 0
        prediction = ss.copy()
        prediction.loc[:, ss.columns] = 0

        # for SplitNN
        # NOTE(review): drawn before fix_seed() below and with np.random.choice's
        # default sampling with replacement - confirm both are intended.
        split_cols = []
        for _ in range(3):  # len(hidden_units) - 1
            split_cols.append(np.random.choice(range(len(train.columns)), int(np.ceil(0.8 * len(train.columns)))))

        if do_predict:
            kfold_seed = random_seed + seed
        else:
            kfold_seed = seed

        fix_seed(kfold_seed)

        for n, (tr, te) in enumerate(
            MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True).split(target, target)
        ):
            start_time = time()

            # Build Model
            if model_name == "SimpleNN":
                model = create_model_simple_nn(len(train.columns), len(target.columns))

                if do_transfer_learning:
                    model_base = create_model_simple_nn(len(train.columns), len(non_target_df.columns))

            elif model_name == "ResNet":
                model = create_model_resnet(len(train.columns), len(train_pca.columns), len(target.columns))

                if do_transfer_learning:
                    model_base = create_model_resnet(
                        len(train.columns), len(train_pca.columns), len(non_target_df.columns)
                    )

            elif model_name == "SplitNN":
                model = create_model_split_nn(len(train.columns), len(target.columns))

                # if do_transfer_learning:
                #    model_base = create_model_split_nn(len(train.columns), len(non_target_df.columns))

            elif model_name == "NODE":
                model = create_model_node(len(target.columns))

                # if do_transfer_learning:
                #    model = create_model_node(len(non_target_df.columns))

            elif model_name == "KernelRidge":
                model = KernelRidge(alpha=80, kernel="rbf")

                # if do_transfer_learning:
                #    model = create_model_node(len(non_target_df.columns))

            elif model_name == "TabNet":
                model = create_model_tabnet(len(train.columns), len(target.columns))

                # if do_transfer_learning:
                #    model_base = create_model_tabnet(len(train.columns), len(non_target_df.columns))

            elif model_name == "SVM":
                # SVM is fitted per column in a later cell; nothing to train here.
                continue

            else:
                # Raising a plain string is itself a TypeError in Python 3.
                raise ValueError("Model name is invalid.")

            # Build Data Sets
            if model_name == "SplitNN":
                x_tr = [
                    train.values[tr][:, split_cols[0]],
                    train.values[tr][:, split_cols[1]],
                    train.values[tr][:, split_cols[2]],
                ]
                x_val = [
                    train.values[te][:, split_cols[0]],
                    train.values[te][:, split_cols[1]],
                    train.values[te][:, split_cols[2]],
                ]
                x_tt = [test.values[:, split_cols[0]], test.values[:, split_cols[1]], test.values[:, split_cols[2]]]

            elif model_name == "ResNet":
                # ResNet takes two inputs: the full features and the PCA features.
                x_tr = [
                    train.values[tr],
                    train_pca.values[tr],
                ]
                x_val = [
                    train.values[te],
                    train_pca.values[te],
                ]
                x_tt = [test.values, test_pca.values]

            else:
                x_tr, x_val = train.values[tr], train.values[te]
                x_tt = test.values

            # Labels are split the same way for every model type.
            y_tr, y_val = labels[tr], labels[te]

            if model_name == "KernelRidge":
                model.fit(x_tr, y_tr)
            else:
                model.compile(
                    optimizer=tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5, clipvalue=756),
                    loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001),
                    metrics=logloss,
                )

                checkpoint_path = f"{model_name}_repeat:{seed}_fold:{n}.hdf5"

                if do_transfer_learning and model_name not in ["SplitNN", "NODE", "TabNet"]:
                    # Load the weights stored at this path, then copy every layer
                    # except the output layer into the new model.
                    model_base.load_weights(checkpoint_path)
                    for dst_layer, src_layer in zip(model.layers, model_base.layers[:-1]):
                        dst_layer.set_weights(src_layer.get_weights())

                cb_checkpt = ModelCheckpoint(
                    checkpoint_path,
                    monitor="val_loss",
                    verbose=0,
                    save_best_only=True,
                    save_weights_only=True,
                    mode="min",
                )
                reduce_lr_loss = ReduceLROnPlateau(
                    monitor="val_loss", factor=0.1, patience=3, verbose=0, min_delta=1e-4, mode="min"
                )
                early_stopping = EarlyStopping(
                    monitor="val_loss",
                    patience=10,
                    mode="min",
                    verbose=0,
                    min_delta=1e-4,
                    restore_best_weights=True,
                )
                model.fit(
                    x_tr,
                    y_tr,
                    validation_data=(x_val, y_val),
                    epochs=100,
                    batch_size=128,
                    callbacks=[cb_checkpt, reduce_lr_loss, early_stopping],
                    verbose=0,
                )

            val_predict = model.predict(x_val)
            fold_score = metric(target.loc[te].values, val_predict)
            seed_result.loc[te, target.columns] += val_predict

            if do_predict:
                # Average the test predictions over the folds.
                test_predict = model.predict(x_tt)
                prediction.loc[:, target.columns] += test_predict / N_SPLITS

            print(
                f"[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {model_name}: Seed {seed}, Fold {n}:",
                fold_score,
            )

            # Free the model and the Keras graph state before the next fold.
            K.clear_session()
            del model
            gc.collect()

        oof[f"{model_name}_{seed}"] = seed_result
        predictions[f"{model_name}_{seed}"] = prediction

    return oof, predictions

In [607]:
# Pre train with non-scored labels
# do_predict defaults to False, so only models listed in pre_train_models are
# trained here. The returned dicts are discarded; the pass is run for the
# per-fold checkpoint files it writes.
_, _ = learning(non_target_df, N_STARTS, N_SPLITS)

In [608]:
# Main run on the scored targets. The two positional True values are
# do_predict and do_transfer_learning.
oof, predictions = learning(target_df, N_STARTS, N_SPLITS, True, True)

[00:32] KernelRidge: Seed 0, Fold 0: 0.019315936956815435
[00:31] KernelRidge: Seed 0, Fold 1: 0.01921261577516679
[00:31] KernelRidge: Seed 0, Fold 2: 0.01920328268904671
[00:31] KernelRidge: Seed 0, Fold 3: 0.018988456698320264
[00:30] KernelRidge: Seed 0, Fold 4: 0.0192509069302247
[00:31] KernelRidge: Seed 0, Fold 5: 0.01946980293337863
[00:31] KernelRidge: Seed 0, Fold 6: 0.019217878092141056
[00:31] KernelRidge: Seed 0, Fold 7: 0.019418115592322958
[00:32] KernelRidge: Seed 0, Fold 8: 0.019235729741670836
[00:31] KernelRidge: Seed 0, Fold 9: 0.01915927895220065


## Learning - by columns

In [609]:
# Second stage for the "SVM" entries only: for every target column fit an SVC on
# that column of the first-stage OOF values and replace the stored OOF / test
# frames with the SVC decision values.
# NOTE(review): learning() skips every fold for "SVM", so the frames read here
# appear to still hold their zero initialisation - confirm this is intended.
for key in oof.keys():
    if "SVM" not in key:
        continue

    start_time = time()

    # First-stage outputs used as the single input feature per column.
    x_new = oof[key].values
    x_tt_new = predictions[key].values

    # Fresh zero-initialised accumulators for the second-stage outputs.
    seed_result = target_df.copy()
    seed_result.loc[:, target_df.columns] = 0
    prediction = ss.copy()
    prediction.loc[:, ss.columns] = 0

    for col in range(target_df.shape[1]):
        target = target_df.values[:, col]

        # Only columns with at least N_SPLITS positives are refit (presumably so
        # each stratified fold can contain a positive - TODO confirm).
        if target.sum() >= N_SPLITS:
            # Seed built from the repeat index (key suffix) and the column index.
            kfold_seed = random_seed + N_STARTS + int(key.rsplit("_", 1)[-1]) * 300 + col
            skf = StratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)

            for n, (tr, te) in enumerate(skf.split(target, target)):
                # reshape(-1, 1): one-feature design matrix for this column.
                x_tr, x_val = x_new[tr, col].reshape(-1, 1), x_new[te, col].reshape(-1, 1)
                y_tr, y_val = target[tr], target[te]
                x_tt = x_tt_new[:, col].reshape(-1, 1)

                model = SVC(C=40, cache_size=2000)
                model.fit(x_tr, y_tr)

                # decision_function yields raw scores, not probabilities.
                val_predict = model.decision_function(x_val)
                seed_result.loc[te, target_df.columns[col]] += val_predict

                # Test scores are averaged over the folds.
                test_predict = model.decision_function(x_tt)
                prediction.loc[:, target_df.columns[col]] += test_predict / N_SPLITS
        else:
            # Too few positives: carry the first-stage values over unchanged.
            seed_result.loc[:, target_df.columns[col]] += x_new[:, col]
            prediction.loc[:, target_df.columns[col]] += x_tt_new[:, col]

    seed_score = metric(target_df.values, seed_result.values)
    print(
        f"[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {key}: ",
        seed_score,
    )

    # Overwrite values of existing keys only, so iterating oof.keys() stays valid.
    oof[key] = seed_result
    predictions[key] = prediction

[01:54] SVM_1:  0.06004785662606853


## Learning - Platt Scaling

https://www.kaggle.com/gogo827jz/kernel-logistic-regression-one-for-206-targets?scriptVersionId=43366198

In [610]:
# Platt scaling for the "KernelRidge" and "SVM" entries: for every target column
# fit a one-feature LogisticRegression on the stored OOF values and replace the
# stored OOF / test frames with the predicted probability of class 1.
for key in oof.keys():
    # Skip keys that contain neither "KernelRidge" nor "SVM".
    if all(col not in key for col in ["KernelRidge", "SVM"]):
        continue

    start_time = time()

    # Current stored outputs, used as the single input feature per column.
    x_new = oof[key].values
    x_tt_new = predictions[key].values

    # Fresh zero-initialised accumulators for the calibrated outputs.
    seed_result = target_df.copy()
    seed_result.loc[:, target_df.columns] = 0
    prediction = ss.copy()
    prediction.loc[:, ss.columns] = 0

    for col in range(target_df.shape[1]):
        target = target_df.values[:, col]

        # Only columns with at least N_SPLITS positives are calibrated (presumably
        # so each stratified fold can contain a positive - TODO confirm).
        if target.sum() >= N_SPLITS:
            # Seed built from the repeat index (key suffix) and the column index.
            kfold_seed = random_seed + (N_STARTS + 1) * 300 + int(key.rsplit("_", 1)[-1]) * 300 + col
            skf = StratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)

            for n, (tr, te) in enumerate(skf.split(target, target)):
                # reshape(-1, 1): one-feature design matrix for this column.
                x_tr, x_val = x_new[tr, col].reshape(-1, 1), x_new[te, col].reshape(-1, 1)
                y_tr, y_val = target[tr], target[te]
                x_tt = x_tt_new[:, col].reshape(-1, 1)

                model = LogisticRegression(C=35, max_iter=1000, random_state=int(key.rsplit("_", 1)[-1]))
                model.fit(x_tr, y_tr)

                # Column 1 of predict_proba is the probability of the positive class.
                val_predict = model.predict_proba(x_val)
                seed_result.loc[te, target_df.columns[col]] += val_predict[:, 1]

                # Test probabilities are averaged over the folds.
                test_predict = model.predict_proba(x_tt)
                prediction.loc[:, target_df.columns[col]] += test_predict[:, 1] / N_SPLITS
        else:
            # Too few positives: carry the stored values over unchanged.
            seed_result.loc[:, target_df.columns[col]] += x_new[:, col]
            prediction.loc[:, target_df.columns[col]] += x_tt_new[:, col]

    seed_score = metric(target_df.values, seed_result.values)
    print(
        f"[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {key}: ",
        seed_score,
    )

    # Overwrite values of existing keys only, so iterating oof.keys() stays valid.
    oof[key] = seed_result
    predictions[key] = prediction

[01:20] KernelRidge_0:  0.01895765781745826
[01:23] SVM_1:  0.022391963780521055


# Cross Validation

In [611]:
# Equal blend weight for each of the N_STARTS repeats, followed by one trailing
# 1.0 that serves as the starting value of the Lagrange multiplier.
initial_weights = [1.0 / N_STARTS for _ in range(N_STARTS)] + [1.0]

# https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0#Bonus-(Lagrange-Multiplier)


def lagrange_func(params):
    """Lagrangian of the blend score under the constraint that the weights sum to 1.

    ``params`` holds the blend weights followed by the Lagrange multiplier as its
    last entry. Returns the metric of the blended OOF predictions minus the
    multiplier times ``(sum(weights) - 1)``.
    """
    weights = params[:-1]
    multiplier = params[-1]
    blended = blend(target_df.values.shape, weights, oof)
    return metric(target_df.values, blended) - multiplier * (sum(weights) - 1)


# autograd gradient function of lagrange_func with respect to its params argument.
grad_l = grad(lagrange_func)


def lagrange_obj(params):
    """System of equations handed to the root finder.

    ``params`` holds the blend weights followed by the Lagrange multiplier.
    Returns the gradient of the Lagrangian with respect to each weight, followed
    by the constraint residual ``sum(weights) - 1``.
    """
    partials = grad_l(params).tolist()
    constraint_residual = sum(params[:-1]) - 1
    # The multiplier's own partial derivative is replaced by the constraint residual.
    return partials[:-1] + [constraint_residual]


# Baseline: score the equal-weight blend of the out-of-fold predictions.
# The last entry of the weight vector is the Lagrange multiplier, hence [:-1].
blend_ = blend(target_df.values.shape, initial_weights[:-1], oof)
print(f"Initial blend CV: {metric(target_df.values, blend_)}")

# With optimize = False the solver is skipped and the initial weights are reused.
optimize = False
if optimize:
    # Find a root of lagrange_obj, starting from initial_weights.
    optimized_weights = fsolve(lagrange_obj, initial_weights)
else:
    optimized_weights = initial_weights

# Score the blend with the final weights (multiplier dropped again).
blend_ = blend(target_df.values.shape, optimized_weights[:-1], oof)
print(f"Optimized blend CV: {metric(target_df.values, blend_)}")

print(f"Optimized weights: {optimized_weights[:-1]}")
print(f"Check the sum of all weights: {sum(optimized_weights[:-1])}")

Initial blend CV: 0.019694349991266173
Optimized blend CV: 0.019694349991266173
Optimized weights: [0.5, 0.5]
Check the sum of all weights: 1.0


# Postprocessing

In [612]:
# Weighted blend
# Blend the test predictions with the final weights; the trailing Lagrange
# multiplier is dropped with [:-1].
submit_df.loc[:, target_df.columns] = blend(ss.shape, optimized_weights[:-1], predictions)

In [613]:
# Clipping
# Keep every predicted value inside [1e-7, 1 - 1e-7].
submit_df.loc[:, target_df.columns] = submit_df.loc[:, target_df.columns].clip(1e-7, 1 - 1e-7)

In [614]:
# Rows whose cp_type is "ctl_vehicle" get 0 for every target. This runs after
# the clipping step, so these rows end up exactly 0 rather than 1e-7.
submit_df.loc[test_df["cp_type"] == "ctl_vehicle", target_df.columns] = 0

# Output

In [615]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submit_df.to_csv("submission.csv", index=False)