# Strategy

- Preprocessing
    - Include ctrl_vehicle
    - RankGauss
    - PCA + Existing Features
    - KMeans
    - Basic stats
- Model
    - Multi head ResNet (tensorflow)
    - TabNet (pytorch)
- Training
    - Pre-train with non-scored target.
    - Train with public test pseudo label
    - Optimizer: Adam/AdamW with weight_decay
    - Loss: BCE with Label smoothing + Logits
- Prediction
    - Ensemble above with average.

# Change Log

- v65
    - Remove clipping.
    - Disable Variance Encoding.
- v66
    - Add AUC.
    - CV only with original training data.
- v67
    - Add `train_drug.csv` .
    - Add Drug and MultiLabel Stratification.
- v68
    - Remove public test pseudo label.
    - Enable pseudo labeling.
    - Disable pre-training with non-scored target.
- v69
    - Disable pseudo labeling.
    - Re-enable pre-training with non-scored target.
    - Re-add public test pseudo label.
    - Add correlation.
    - Update label smoothing parameter.
- v70 - **LB: 0.01840**
    - Amend num of seed.
- v71
    - Update model parameters.
        - ResNet network
        - TabNet dimension
- v72
    - Add KMeans and basic stats.
    - Add NODE model.
- v73
    - Update split condition of group multilabel stratified kfold.
    - Update NODE parameters.
- v74
    - Disable pre-training with non-scored target to reduce execution time.
- v75
    - Fold 5 to 7.
- v76
    - Remove ResNet for execution time reduction.
- v77
    - Use 3 models. ["ResNet", "TabNet", "NODE"]
    - Enable pre-train for ResNet.
- v78 - **LB: 0.01841**
    - Reset folds for each seed.
- v79
    - Add simple NN model again.
    - Fold 7 to 5.
- v80
    - Remove simple NN and NODE model.
    - Increase num of seed x2 to x3.
- v81
    - Use ctrl_vehicle

# Setup

## for Google Colab

In [None]:
import sys
# True when the `google.colab` module is already loaded, i.e. the notebook runs on Google Colab.
IN_COLAB = 'google.colab' in sys.modules

In [None]:
# Kaggle competition slug; used to build download commands and paths in the Colab setup cell.
COMPETE = "lish-moa"
# Kaggle datasets ("owner/name") that the Colab setup cell downloads and unzips.
DATASETS = [
    "imokuri/pytorchtabnet",
    "imokuri/moablendblendblend",
    "imokuri/adabelief010",
    "tolgadincer/autograd",
    "yasufuminakama/iterative-stratification",
    "rahulsd91/moapublictest",
]
# Extra pip packages that the Colab setup cell installs one by one (currently none).
PACKAGES = []

In [None]:
# Colab-only bootstrap: fetch the competition files and the datasets listed in
# DATASETS into /content/<COMPETE>/input, then move into the output directory.
if IN_COLAB:
    !pip install -q -U git+https://github.com/IMOKURI/kaggle_on_google_colab.git

    from kaggle_on_google_colab import setup
    kaggle = setup.Setup()
    kaggle.dirs(COMPETE)

    !kaggle competitions download -p /content/zip {COMPETE}
    # List the competition files as CSV, drop the API warning and the header row,
    # keep the first field (the file name) and unzip the matching archive.
    for line in setup.exec_get_lines(cmd=f"kaggle competitions files --csv {COMPETE} | egrep -v \"Warning: Looks like you're using an outdated API Version|name,size,creationDate\" | cut -d , -f 1"):
        !unzip -q -n /content/zip/{line.decode().strip()}.zip -d /content/{COMPETE}/input/{COMPETE}

    for dataset in DATASETS:
        # Only the part after "owner/" is used for the archive and directory names.
        dataset_name = dataset.split("/")[-1]

        !kaggle datasets download -p /content/zip {dataset}
        !unzip -q -n /content/zip/{dataset_name}.zip -d /content/{COMPETE}/input/{dataset_name}

    for package_ in PACKAGES:
        !pip install {package_}

    !pip install -U tensorflow-addons
    # train_drug.csv is moved (not unzipped) from the download directory into the competition input directory.
    !mv /content/zip/train_drug.csv /content/{COMPETE}/input/{COMPETE}/

    %cd /content/{COMPETE}/output

## Library

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import sys

sys.path.append("../input/iterative-stratification/iterative-stratification-master")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

sys.path.append("../input/autograd")
import autograd.numpy as np
from autograd import grad

sys.path.append("../input/pytorchtabnet")
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

#sys.path.append("../input/adabelief010")
#from AdaBelief import AdaBelief
#from AdaBelief_tf import AdaBeliefOptimizer

In [None]:
import datetime
import gc
import itertools
import os
import random
from collections import defaultdict
from time import time
from typing import Optional

import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow_addons as tfa
import torch
import torch.nn.functional as F
import torch.optim as optim
from scipy.optimize import fsolve, minimize
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow_probability import distributions as tfp_distributions
from tensorflow_probability import stats as tfp_stats
from torch import nn
from torch.nn.modules.loss import _WeightedLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau as torch_ReduceLROnPlateau

In [None]:
# import numpy as np
# import optuna

In [None]:
# Optional TensorFlow speed-ups.
# MIXED_PRECISION: switch Keras to a 16-bit mixed precision policy.
# XLA_ACCELERATE: enable XLA JIT compilation in the TensorFlow optimizer.
MIXED_PRECISION = False
XLA_ACCELERATE = True

if MIXED_PRECISION:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision

    # No earlier cell defines `tpu`, so referencing it directly would raise a
    # NameError as soon as MIXED_PRECISION is turned on.  Look it up
    # defensively: a truthy global `tpu` still selects bfloat16, anything else
    # (including "not defined") selects float16.
    if globals().get("tpu"):
        policy = mixed_precision.Policy("mixed_bfloat16")
    else:
        policy = mixed_precision.Policy("mixed_float16")
    mixed_precision.set_policy(policy)
    print("Mixed precision enabled")

if XLA_ACCELERATE:
    tf.config.optimizer.set_jit(True)
    print("Accelerated Linear Algebra enabled")

In [None]:
# NOTE(review): presumably set so CUDA kernel launches run synchronously and
# errors surface at the failing call — confirm it is still wanted, as it may slow GPU work.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Functions

In [None]:
def fix_seed(seed=2020):
    """Seed every random number generator used by this notebook.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, TensorFlow and PyTorch.  When CUDA is available the CUDA generators
    are seeded too and cuDNN is put into deterministic, non-benchmark mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    torch.manual_seed(seed)

    # Nothing GPU-specific to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Base seed for the notebook; also passed as `random_state` to the
# QuantileTransformer, PCA and KMeans preprocessing steps.
random_seed = 22
fix_seed(random_seed)

## Metrics

In [None]:
# Evaluation Metric with sigmoid applied and clipping

## for tensorflow
def logloss(y_true, y_pred):
    """Mean binary log loss for raw (pre-sigmoid) model outputs.

    ``y_pred`` is squashed with a sigmoid first.  A constant 1e-15 is added
    inside each logarithm so saturated probabilities never hit ``log(0)``.
    """
    probs = 1 / (1 + K.exp(-y_pred))
    negative_term = (1 - y_true) * K.log(1 - probs + 1e-15)
    positive_term = y_true * K.log(probs + 1e-15)
    return K.mean(-(negative_term + positive_term))

## for pytorch
class LogitsLogLoss(Metric):
    """TabNet evaluation metric: mean binary log loss computed from logits.

    Registered under the name ``logits_ll``; lower values are better.
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Return the mean log loss of sigmoid(``y_pred``) against ``y_true``."""
        probs = 1 / (1 + np.exp(-y_pred))
        # 1e-15 keeps both logarithms finite when a probability saturates.
        negative_term = (1 - y_true) * np.log(1 - probs + 1e-15)
        positive_term = y_true * np.log(probs + 1e-15)
        return np.mean(-(negative_term + positive_term))

## for overall
## [Fast Numpy Log Loss] https://www.kaggle.com/gogo827jz/optimise-blending-weights-4-5x-faster-log-loss
def metric(y_true, y_pred):
    """Column-wise mean log loss, averaged over all target columns.

    Args:
        y_true: 2-D array of ground-truth labels.
        y_pred: 2-D array of predicted probabilities; its column count
            decides how many columns are scored.

    Returns:
        The average of the per-column log losses.
    """
    n_targets = y_pred.shape[1]

    def column_loss(col):
        truth = y_true[:, col]
        pred = y_pred[:, col]
        # 1e-15 guards against log(0) for predictions of exactly 0 or 1.
        return -np.mean(truth * np.log(pred + 1e-15) + (1 - truth) * np.log(1 - pred + 1e-15))

    return sum(column_loss(col) for col in range(n_targets)) / n_targets

## Loss functions

In [None]:
# https://www.kaggle.com/felipebihaiek/torch-continued-from-auxiliary-targets-smoothing
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are moved towards 0.5 before the loss is computed:
    ``target * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``"mean"`` (default), ``"sum"``, or any other value to get
            the unreduced element-wise loss.
        smoothing: Smoothing factor in ``[0, 1)``; 0 disables smoothing.
    """

    def __init__(self, weight=None, reduction="mean", smoothing=0.0):
        # The parent constructor already records ``weight`` and ``reduction``.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return the smoothed targets.

        ``n_labels`` is accepted for call compatibility but is not used.

        Raises:
            ValueError: If ``smoothing`` is outside ``[0, 1)``.
        """
        if not 0 <= smoothing < 1:
            raise ValueError(f"smoothing must be in [0, 1), got {smoothing}")
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss with the configured reduction."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing)
        # Ask for the element-wise loss explicitly.  The functional API
        # averages by default, which previously made the reduction below a
        # no-op: "sum" and "none" silently returned the mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight, reduction="none")

        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "mean":
            return loss.mean()
        return loss

## Cross Validation

In [None]:
# Blend oof predictions
def blend(size, weights, oof):
    """Weighted sum of out-of-fold prediction frames.

    Args:
        size: Shape ``(rows, cols)`` of the blended result.
        weights: Sequence of blend weights, indexed in the iteration order of
            ``oof``.
        oof: Mapping of model key to a DataFrame of predictions; each frame is
            cropped to ``size`` before being added.

    Returns:
        Array of shape ``size`` holding the weighted sum.
    """
    total = np.zeros(size)
    for position, frame in enumerate(oof.values()):
        cropped = frame.values[: total.shape[0], : total.shape[1]]
        total += weights[position] * cropped
    return total

In [None]:
def cross_validation(size, weight, y_true, oof):
    """Score a weighted blend of out-of-fold predictions.

    Only the first ``size[0]`` rows of ``y_true`` are evaluated.  Prints and
    returns the blended log loss together with the mean per-column ROC AUC.

    Args:
        size: Shape-like value; only its first entry (row count) is used.
        weight: Blend weights passed to ``blend``.
        y_true: 2-D array of ground-truth labels.
        oof: Mapping of model key to out-of-fold prediction frame.

    Returns:
        Tuple ``(CV, AUC)``.
    """
    n_rows = size[0]
    truth = y_true[:n_rows]
    blended = blend(truth.shape, weight, oof)

    aucs = [
        roc_auc_score(y_true=truth[:, task_id], y_score=blended[:, task_id])
        for task_id in range(blended.shape[1])
    ]

    CV = metric(truth, blended)
    AUC = np.mean(aucs)
    print(f"Blended CV: {CV}, AUC : {AUC}")

    return CV, AUC

# Load Data

In [None]:
# Competition files.
train_df = pd.read_csv("../input/lish-moa/train_features.csv")
test_df = pd.read_csv("../input/lish-moa/test_features.csv")
target_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
non_target_df = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
submit_df = pd.read_csv("../input/lish-moa/sample_submission.csv")
drug_df = pd.read_csv('../input/lish-moa/train_drug.csv')

# Public test features and an earlier submission, used further down as pseudo labels for them.
pub_test_df = pd.read_csv("../input/moapublictest/test_features.csv")
pub_submit_df = pd.read_csv("../input/moablendblendblend/submission.csv")

In [None]:
# Work on copies so the frames loaded from disk stay untouched
# (train_df is consulted again later for row counts and column names).
train = train_df.copy()
test = test_df.copy()
target = target_df.copy()
non_target = non_target_df.copy()
ss = submit_df.copy()
drug = drug_df.copy()

pub_test = pub_test_df.copy()
pub_ss = pub_submit_df.copy()

## Use public test data for training

In [None]:
# Merge public test data (and pseudo label) into train data
# Append the public test rows after the original training rows, and the
# matching submission rows after the original targets, with a fresh 0..n-1 index.
# NOTE(review): assumes pub_test and pub_ss list the same samples in the same order — confirm.
train = pd.concat([train, pub_test]).reset_index(drop=True)
target = pd.concat([target, pub_ss]).reset_index(drop=True)

In [None]:
target

# Preprocessing

In [None]:
# Encode the dose label as an integer flag (D1 -> 0, D2 -> 1).
# The column is replaced by plain assignment instead of `.loc[:, col] = ...`:
# with pandas >= 2.0 the `.loc` form writes into the existing object-dtype
# column in place, so the result would stay `object` instead of becoming numeric.
train["cp_dose"] = train["cp_dose"].map({"D1": 0, "D2": 1})
test["cp_dose"] = test["cp_dose"].map({"D1": 0, "D2": 1})

In [None]:
# Encode the treatment duration as an ordinal value (24 -> 0, 48 -> 1, 72 -> 2).
train["cp_time"] = train["cp_time"].map({24: 0, 48: 1, 72: 2})
test["cp_time"] = test["cp_time"].map({24: 0, 48: 1, 72: 2})

## Keep or remove ctrl_vehicle (kept and encoded when `USE_CTRL_VEHICLE` is True)



In [None]:
# True: keep control-vehicle rows and encode cp_type as 0/1.
# False: drop control-vehicle rows from train/targets and remove the cp_type column.
USE_CTRL_VEHICLE = True

if USE_CTRL_VEHICLE:
    # Replace the column by plain assignment rather than `.loc[:, col] = ...`:
    # with pandas >= 2.0 the `.loc` form writes into the existing object-dtype
    # column in place, leaving it as `object` instead of a numeric dtype.
    train["cp_type"] = train["cp_type"].map({"ctl_vehicle": 0, "trt_cp": 1})
    test["cp_type"] = test["cp_type"].map({"ctl_vehicle": 0, "trt_cp": 1})

else:
    # Filter the label frames first, while `train` still holds every row.
    target = target.loc[train["cp_type"] != "ctl_vehicle"].reset_index(drop=True)
    # non_target has no public-test rows, so only the original training rows of `train` form the mask.
    non_target = non_target.loc[train[: train_df.shape[0]]["cp_type"] != "ctl_vehicle"].reset_index(drop=True)

    train = train.loc[train["cp_type"] != "ctl_vehicle"].reset_index(drop=True)

    train = train.drop("cp_type", axis=1)
    test = test.drop("cp_type", axis=1)

## Merge drug_id into training data

https://www.kaggle.com/c/lish-moa/discussion/195195

In [None]:
# Look up drug_id for every label row via sig_id; the left join keeps rows that have no entry in `drug`.
target_drug = target[["sig_id"]].merge(drug, how="left", on="sig_id")
non_target_drug = non_target[["sig_id"]].merge(drug, how="left", on="sig_id")

In [None]:
# Rows without a match in `drug` (NaN after the left join) get a shared placeholder value.
target_drug = target_drug.fillna("xxxxxxxxx")
non_target_drug = non_target_drug.fillna("xxxxxxxxx")

In [None]:
target_drug

## Remove sig_id

In [None]:
# Drop the identifier column from every frame in place.
del train["sig_id"]
del target["sig_id"]
del non_target["sig_id"]
del test["sig_id"]
del ss["sig_id"]

In [None]:
train

In [None]:
# Sanity check of the frame shapes after merging and dropping sig_id.
print(train.shape)
print(target.shape)
print(non_target.shape)

print(test.shape)
print(ss.shape)

## Rank Gauss

https://www.kaggle.com/nayuts/moa-pytorch-nn-pca-rankgauss



In [None]:
# Column groups are taken from the raw training frame: "g-" and "c-" prefixed columns.
g_cols = [col for col in train_df.columns if col.startswith("g-")]
c_cols = [col for col in train_df.columns if col.startswith("c-")]

# Map each g-/c- column onto a normal distribution, one column at a time.
for col in g_cols + c_cols:
    transformer = QuantileTransformer(n_quantiles=100, random_state=random_seed, output_distribution="normal")

    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)

    # The transformer expects a 2-D (n_samples, 1) array; it is fitted on `train` only.
    raw_vec = train[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    # Transform both frames with the train-fitted quantiles and flatten back to 1-D.
    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [None]:
train

## PCA features (+ Existing features)



In [None]:
# PCA on the "g-" columns: fitted on train and test rows together, then the
# components are appended to (not substituted for) the existing columns.
n_comp = 50

data = pd.concat([pd.DataFrame(train[g_cols]), pd.DataFrame(test[g_cols])])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data[g_cols])
# Train rows come first in `data`, test rows last.
train2 = data2[: train.shape[0]]
test2 = data2[-test.shape[0] :]

train2 = pd.DataFrame(train2, columns=[f"pca_G-{i}" for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f"pca_G-{i}" for i in range(n_comp)])

train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [None]:
# PCA on the "c-" columns: fitted on train and test rows together, then the
# components are appended to (not substituted for) the existing columns.
n_comp = 15

data = pd.concat([pd.DataFrame(train[c_cols]), pd.DataFrame(test[c_cols])])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data[c_cols])
# Train rows come first in `data`, test rows last.
train2 = data2[: train.shape[0]]
test2 = data2[-test.shape[0] :]

train2 = pd.DataFrame(train2, columns=[f"pca_C-{i}" for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f"pca_C-{i}" for i in range(n_comp)])

train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [None]:
train

In [None]:
# Reduced copies of the feature frames: everything except the raw g-/c- columns
# (the PCA components and the remaining columns stay).
train_pca = train.copy()
test_pca = test.copy()

train_pca.drop(g_cols, axis=1, inplace=True)
test_pca.drop(g_cols, axis=1, inplace=True)

train_pca.drop(c_cols, axis=1, inplace=True)
test_pca.drop(c_cols, axis=1, inplace=True)

In [None]:
train_pca

## Feature Selection using Variance Encoding



In [None]:
# https://www.kaggle.com/c/lish-moa/discussion/194973#1067941
# Variance-based feature selection. Disabled since v65 (see change log); kept for reference.
if False:

    var_threshold = 0.5

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    data = pd.concat([train, test])
    # NOTE(review): iloc[:, 2:] assumes exactly two leading non-feature columns
    # (cp_time, cp_dose); confirm this still holds when cp_type is kept.
    ve_columns = (data.iloc[:, 2:].var() >= var_threshold).values
    ve_data = data.iloc[:, 2:].loc[:, ve_columns]

    # Train rows come first in `data`, test rows last.
    ve_train = ve_data[: train.shape[0]]
    ve_test = ve_data[-test.shape[0] :]


    train = pd.DataFrame(train[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
    train = pd.concat([train, ve_train], axis=1)


    test = pd.DataFrame(test[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
    test = pd.concat([test, ve_test], axis=1)

In [None]:
# train

## KMeans

In [None]:
%%time

# Raw gene ("g-") and cell ("c-") columns of the current training frame;
# used by the KMeans and basic-stats steps.
features_g = [col for col in train.columns if col.startswith("g-")]
features_c = [col for col in train.columns if col.startswith("c-")]

def fe_cluster(train_, test_, n_clusters_g = 35, n_clusters_c = 5):
    """Add one-hot KMeans cluster membership features to train and test.

    KMeans is fitted on the stacked train+test rows, once for the ``features_g``
    columns and once for the ``features_c`` columns.  The resulting labels are
    one-hot encoded into ``clusters_g_<k>`` / ``clusters_c_<k>`` columns.

    Both returned frames always carry one column per cluster id, even when a
    cluster has no member in one of the frames, so train and test keep the
    same feature layout.  The input frames are not modified.

    Args:
        train_: Training feature frame.
        test_: Test feature frame.
        n_clusters_g: Number of clusters for the ``g-`` columns.
        n_clusters_c: Number of clusters for the ``c-`` columns.

    Returns:
        Tuple ``(train, test)`` of new frames with the dummy columns appended.
    """

    def create_cluster(tr, te, features, kind = 'g', n_clusters = n_clusters_g):
        # Cluster train and test rows together so both share the same centroids.
        data = pd.concat([tr[features], te[features]], axis = 0)
        kmeans = KMeans(n_clusters = n_clusters, random_state = random_seed).fit(data)

        column = f'clusters_{kind}'
        n_train = tr.shape[0]
        # A categorical with the full id range makes get_dummies emit every
        # cluster column for both frames, including clusters without members.
        categories = list(range(n_clusters))
        tr_labels = pd.Categorical(kmeans.labels_[:n_train], categories = categories)
        te_labels = pd.Categorical(kmeans.labels_[n_train:], categories = categories)

        # assign() returns new frames, leaving the caller's objects untouched.
        tr = pd.get_dummies(tr.assign(**{column: tr_labels}), columns = [column])
        te = pd.get_dummies(te.assign(**{column: te_labels}), columns = [column])
        return tr, te

    train_, test_ = create_cluster(train_, test_, features_g, kind = 'g', n_clusters = n_clusters_g)
    train_, test_ = create_cluster(train_, test_, features_c, kind = 'c', n_clusters = n_clusters_c)
    return train_, test_

# Append the one-hot cluster columns, using the default cluster counts.
train, test = fe_cluster(train, test)

In [None]:
train

## Basic stats

In [None]:
# Row-wise summary statistics over the g- columns, the c- columns and both
# together, stored as g_<stat>, c_<stat> and gc_<stat> in train and test.
for stats in ["sum", "mean", "std", "kurt", "skew"]:
    # getattr picks the DataFrame method named by `stats`.
    train["g_" + stats] = getattr(train[features_g], stats)(axis = 1)
    train["c_" + stats] = getattr(train[features_c], stats)(axis = 1)
    train["gc_" + stats] = getattr(train[features_g + features_c], stats)(axis = 1)
    
    test["g_" + stats] = getattr(test[features_g], stats)(axis = 1)
    test["c_" + stats] = getattr(test[features_c], stats)(axis = 1)
    test["gc_" + stats] = getattr(test[features_g + features_c], stats)(axis = 1)

In [None]:
train

# Model - Simple NN

In [None]:
def create_model_simple_nn(num_col, output_dim):
    """Build a plain feed-forward network that outputs logits.

    Three weight-normalised dense layers (256 elu, 256 swish, 128 selu), each
    preceded by batch normalisation and dropout, followed by a final batch
    normalisation and a linear output layer.

    Args:
        num_col: Input shape passed to ``L.Input``.
        output_dim: Number of output units (no activation is applied).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """

    def wn_dense(units, activation):
        return tfa.layers.WeightNormalization(L.Dense(units, activation=activation))

    stack = [
        L.Input(num_col),
        L.BatchNormalization(),
        L.Dropout(0.4),
        wn_dense(256, "elu"),
        L.BatchNormalization(),
        L.Dropout(0.3),
        wn_dense(256, "swish"),
        L.BatchNormalization(),
        L.Dropout(0.3),
        wn_dense(128, "selu"),
        L.BatchNormalization(),
        L.Dense(output_dim),
    ]

    return tf.keras.Sequential(stack)

# Model - Multi input ResNet

https://www.kaggle.com/rahulsd91/moa-multi-input-resnet-model

In [None]:
def create_model_resnet(n_features, n_features_2, n_labels):
    """Build the two-input residual-style network that outputs logits.

    Data flow:
      1. ``Input1`` goes through ``Head1`` and becomes a 1024-wide representation.
      2. ``Input2`` is concatenated with that representation and fed to ``Head2``
         (also 1024 wide).
      3. The ``Head1`` and ``Head2`` outputs are averaged element-wise.
      4. ``Head3`` maps the average to ``n_labels`` raw logits (no activation).

    Args:
        n_features: Width of the first input.
        n_features_2: Width of the second input.
        n_labels: Number of output logits.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_1, input_2]``.
    """
    input_1 = L.Input(shape=(n_features,), name="Input1")
    input_2 = L.Input(shape=(n_features_2,), name="Input2")

    head_1 = tf.keras.Sequential(
        [
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(256, activation="selu")),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(1024, activation="swish")),
        ],
        name="Head1",
    )

    input_3 = head_1(input_1)
    input_3_concat = L.Concatenate()([input_2, input_3])

    head_2 = tf.keras.Sequential(
        [
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(256, activation="swish")),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(512, activation="relu")),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(1024, activation="relu")),
        ],
        name="Head2",
    )

    input_4 = head_2(input_3_concat)
    # Head1 and Head2 both end in 1024 units, so their outputs can be averaged.
    input_4_avg = L.Average()([input_3, input_4])

    head_3 = tf.keras.Sequential(
        [
            L.BatchNormalization(),
            tfa.layers.WeightNormalization(L.Dense(128, activation="relu")),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(128, activation="swish")),
            L.BatchNormalization(),
            # L.Dense(n_labels, activation="sigmoid"),
            L.Dense(n_labels),  # from_logits=True
        ],
        name="Head3",
    )

    output = head_3(input_4_avg)

    model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=output)

    return model

# Model - TabNet

In [None]:
def create_model_tabnet(seed):
    """Create an unfitted ``TabNetRegressor`` with this notebook's settings.

    Uses Adam (lr 2e-2, weight decay 1e-5), entmax masks and a
    ReduceLROnPlateau scheduler.

    Args:
        seed: Random seed forwarded to TabNet.

    Returns:
        The configured ``TabNetRegressor`` instance.
    """
    optimizer_settings = dict(lr=2e-2, weight_decay=1e-5)
    scheduler_settings = dict(mode="min", patience=5, min_lr=1e-5, threshold=1e-5, factor=0.1)

    return TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=1,
        n_independent=1,
        n_shared=1,
        gamma=1.3,
        lambda_sparse=0,
        optimizer_fn=optim.Adam,
        optimizer_params=optimizer_settings,
        mask_type="entmax",
        scheduler_params=scheduler_settings,
        scheduler_fn=torch_ReduceLROnPlateau,
        seed=seed,
        verbose=0,
    )

# Model - NODE

Neural Oblivious Decision Ensembles

https://www.kaggle.com/gogo827jz/moa-neural-oblivious-decision-ensembles-tf-keras

In [None]:
@tf.function
def sparsemoid(inputs: tf.Tensor):
    """Piecewise-linear sigmoid substitute: ``0.5 * x + 0.5`` clipped to [0, 1]."""
    shifted = 0.5 * inputs + 0.5
    return tf.clip_by_value(shifted, 0.0, 1.0)

@tf.function
def identity(x: tf.Tensor):
    """Return the input unchanged; used as the pass-through feature layer of NODE."""
    return x

In [None]:
class ODST(L.Layer):
    """Oblivious differentiable decision tree layer.

    Holds ``n_trees`` trees of depth ``depth``.  Every tree level picks a
    feature through a sparsemax over learnable logits and compares it with a
    learnable threshold; the soft routing weights of the ``2 ** depth`` leaves
    are combined with a learnable response tensor.  The per-tree responses are
    summed, giving an output of shape ``[batch_size, units]``.
    """

    def __init__(self, n_trees: int = 3, depth: int = 4, units: int = 1, threshold_init_beta: float = 1.0):
        """Store the hyper-parameters; variables are created in ``build``."""
        super(ODST, self).__init__()
        # Set to True after the data-dependent initialisation ran on the first call.
        self.initialized = False
        self.n_trees = n_trees
        self.depth = depth
        self.units = units
        self.threshold_init_beta = threshold_init_beta

    def build(self, input_shape: tf.TensorShape):
        """Create the layer variables for the given input shape."""
        # Feature selection logits, zero-initialised: [in_features, n_trees, depth].
        feature_selection_logits_init = tf.zeros_initializer()
        self.feature_selection_logits = tf.Variable(
            initial_value=feature_selection_logits_init(
                shape=(input_shape[-1], self.n_trees, self.depth), dtype="float32"
            ),
            trainable=True,
            name="feature_selection_logits",
        )

        # Split thresholds: [n_trees, depth]; overwritten by ``initialize``.
        feature_thresholds_init = tf.zeros_initializer()
        self.feature_thresholds = tf.Variable(
            initial_value=feature_thresholds_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="feature_thresholds",
        )

        # Log temperatures: [n_trees, depth]; overwritten by ``initialize``.
        log_temperatures_init = tf.ones_initializer()
        self.log_temperatures = tf.Variable(
            initial_value=log_temperatures_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="log_temperatures",
        )

        # Constant lookup table: for every leaf index, the bit taken at each
        # depth level, stored as (bit, 1 - bit) pairs.
        indices = K.arange(0, 2 ** self.depth, 1)
        offsets = 2 ** K.arange(0, self.depth, 1)
        bin_codes = tf.reshape(indices, (1, -1)) // tf.reshape(offsets, (-1, 1)) % 2
        bin_codes_1hot = tf.stack([bin_codes, 1 - bin_codes], axis=-1)
        self.bin_codes_1hot = tf.Variable(
            initial_value=tf.cast(bin_codes_1hot, "float32"), trainable=False, name="bin_codes_1hot"
        )

        # Leaf responses, one-initialised: [n_trees, units, 2 ** depth].
        response_init = tf.ones_initializer()
        self.response = tf.Variable(
            initial_value=response_init(shape=(self.n_trees, self.units, 2 ** self.depth), dtype="float32"),
            trainable=True,
            name="response",
        )

    def initialize(self, inputs):
        """Data-dependent initialisation of thresholds and temperatures from ``inputs``."""
        feature_values = self.feature_values(inputs)

        # initialize feature_thresholds
        # Percentile levels are drawn from Beta(threshold_init_beta, threshold_init_beta) and scaled to 0..100.
        percentiles_q = 100 * tfp_distributions.Beta(self.threshold_init_beta, self.threshold_init_beta).sample(
            [self.n_trees * self.depth]
        )
        flattened_feature_values = tf.map_fn(K.flatten, feature_values)
        # The diagonal pairs the k-th sampled percentile level with the k-th (tree, depth) slot.
        init_feature_thresholds = tf.linalg.diag_part(
            tfp_stats.percentile(flattened_feature_values, percentiles_q, axis=0)
        )

        self.feature_thresholds.assign(tf.reshape(init_feature_thresholds, self.feature_thresholds.shape))

        # initialize log_temperatures
        # Set to the median (50th percentile) absolute distance between feature values and thresholds.
        self.log_temperatures.assign(
            tfp_stats.percentile(tf.math.abs(feature_values - self.feature_thresholds), 50, axis=0)
        )

    def feature_values(self, inputs: tf.Tensor, training: bool = None):
        """Return the sparsemax-selected feature value for every (tree, depth) slot."""
        feature_selectors = tfa.activations.sparsemax(self.feature_selection_logits)
        # ^--[in_features, n_trees, depth]

        feature_values = tf.einsum("bi,ind->bnd", inputs, feature_selectors)
        # ^--[batch_size, n_trees, depth]

        return feature_values

    def call(self, inputs: tf.Tensor, training: bool = None):
        """Run the trees on ``inputs`` and return the summed responses."""
        # The first call also performs the data-dependent initialisation.
        if not self.initialized:
            self.initialize(inputs)
            self.initialized = True

        feature_values = self.feature_values(inputs)

        threshold_logits_a = (feature_values - self.feature_thresholds) * tf.math.exp(-self.log_temperatures)

        threshold_logits_b = tf.stack([-threshold_logits_a, threshold_logits_a], axis=-1)
        # ^--[batch_size, n_trees, depth, 2]

        bins = sparsemoid(threshold_logits_b)
        # ^--[batch_size, n_trees, depth, 2], approximately binary

        bin_matches = tf.einsum("btds,dcs->btdc", bins, self.bin_codes_1hot)
        # ^--[batch_size, n_trees, depth, 2 ** depth]

        response_weights = tf.math.reduce_prod(bin_matches, axis=-2)
        # ^-- [batch_size, n_trees, 2 ** depth]

        response = tf.einsum("bnd,ncd->bnc", response_weights, self.response)
        # ^-- [batch_size, n_trees, units]

        return tf.reduce_sum(response, axis=1)

In [None]:
class NODE(tf.keras.Model):
    """Stack of ODST layers with dense (concatenating) connections.

    The input passes through an optional feature layer, batch normalisation
    and dropout.  Each ODST layer then receives the running representation,
    and its output is concatenated onto that representation before another
    batch normalisation + dropout.  A final dense layer followed by ``link``
    produces the output.

    Args:
        units: Output width of every ODST layer.
        n_layers: Number of ODST layers; 0 reduces the model to
            feature -> batch norm -> dropout -> dense -> link.
        output_dim: Width of the final dense layer.
        dropout_rate: Dropout rate used after every batch normalisation.
        link: Callable applied to the final dense output.
        n_trees: Trees per ODST layer.
        depth: Depth of every tree.
        threshold_init_beta: Beta parameter forwarded to the ODST layers.
        feature_column: Optional layer applied to the raw inputs; the inputs
            are passed through unchanged when it is None.
    """

    def __init__(
        self,
        units: int = 1,
        n_layers: int = 1,
        output_dim=1,
        dropout_rate=0.1,
        link: tf.function = tf.identity,
        n_trees: int = 3,
        depth: int = 4,
        threshold_init_beta: float = 1.0,
        feature_column: Optional[L.DenseFeatures] = None,
    ):
        super(NODE, self).__init__()
        self.units = units
        self.n_layers = n_layers
        self.n_trees = n_trees
        self.depth = depth
        self.threshold_init_beta = threshold_init_beta
        self.feature_column = feature_column
        self.dropout_rate = dropout_rate
        self.output_dim = output_dim

        if feature_column is None:
            self.feature = L.Lambda(identity)
        else:
            self.feature = feature_column

        # One batch norm / dropout pair for the input plus one per ODST layer.
        self.bn = [L.BatchNormalization() for _ in range(n_layers + 1)]
        self.dropout = [L.Dropout(self.dropout_rate) for _ in range(n_layers + 1)]
        self.ensemble = [
            ODST(n_trees=n_trees, depth=depth, units=units, threshold_init_beta=threshold_init_beta)
            for _ in range(n_layers)
        ]

        self.last_layer = L.Dense(self.output_dim)

        self.link = link

    def call(self, inputs, training=None):
        """Forward pass; ``training`` is forwarded to batch norm and dropout."""
        x = self.feature(inputs)
        x = self.bn[0](x, training=training)
        x = self.dropout[0](x, training=training)

        # A single running tensor replaces the previous dict bookkeeping that
        # read the loop variable after the loop (NameError when n_layers == 0).
        for position, tree in enumerate(self.ensemble, start=1):
            x = tf.concat([x, tree(x)], axis=1)
            x = self.bn[position](x, training=training)
            x = self.dropout[position](x, training=training)

        return self.link(self.last_layer(x))

In [None]:
def create_model_node(output_dim):
    """Build the NODE-based model that outputs logits.

    A two-layer NODE block (128 units, 128 outputs) is followed by two
    weight-normalised dense layers (128 elu, 128 swish), each preceded by
    batch normalisation and dropout, then a final batch normalisation and a
    linear output layer.

    Args:
        output_dim: Number of output units (no activation is applied).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    node_block = NODE(
        n_layers=2,
        units=128,
        output_dim=128,
        dropout_rate=0.2,
        depth=3,
        n_trees=2,
    )

    def wn_dense(units, activation):
        return tfa.layers.WeightNormalization(L.Dense(units, activation=activation))

    stack = [
        node_block,
        L.BatchNormalization(),
        L.Dropout(0.2),
        wn_dense(128, "elu"),
        L.BatchNormalization(),
        L.Dropout(0.2),
        wn_dense(128, "swish"),
        L.BatchNormalization(),
        L.Dense(output_dim),  # from_logits=True
    ]

    return tf.keras.Sequential(stack)

# Training

In [None]:
# Model names that `learning` cycles through (it picks models[seed % len(models)]).
models = ["ResNet", "TabNet"]
# Three runs per model.
# NOTE(review): presumably passed to `learning` as N_STARTS / N_SPLITS — the call site is not in this part of the file.
N_STARTS = len(models) * 3
N_SPLITS = 5

# On Colab all four models are used, with a single run each.
if IN_COLAB:
    models = ["SimpleNN", "ResNet", "TabNet", "NODE"]
    N_STARTS = len(models) * 1
    N_SPLITS = 5

In [None]:
# When `learning` runs with do_predict=False, models not listed here are skipped.
pre_train_models = ["ResNet", "SimpleNN"]

In [None]:
SAVE_MODEL = True

def learning(train_, train_pca_, target_, drug_, N_STARTS=6, N_SPLITS=5, do_predict=False, transfer_learning_base=None, pseudo_labeling=False):
    """Train the configured models over repeated, drug-aware stratified folds.

    For each repeat ``seed`` the model type is ``models[seed % len(models)]``.
    Drugs that occur 6, 12 or 18 times are assigned to a fold as a whole
    (stratified on their per-drug mean targets); rows of every other drug are
    assigned to folds individually. Each fold trains a fresh model, stores
    sigmoid-transformed out-of-fold predictions and, when ``do_predict`` is
    set, accumulates fold-averaged predictions for the global test set.

    Reads the module-level names ``models``, ``pre_train_models``, ``ss``,
    ``random_seed``, ``test``, ``test_pca`` and ``SAVE_MODEL``.

    Args:
        train_: Feature DataFrame (one row per sample).
        train_pca_: Second feature DataFrame, used only as the extra ResNet
            input.
        target_: Label DataFrame, positionally aligned with ``train_``.
        drug_: DataFrame with ``sig_id`` and ``drug_id`` columns. It is
            modified in place: an existing ``fold`` column is dropped and a
            new ``fold`` column is written on every repeat.
            Assumes it shares its index with ``target_`` -- TODO confirm.
        N_STARTS: Number of repeats (seeds).
        N_SPLITS: Number of folds.
        do_predict: When True, also predict the test set. When False, models
            that are not in ``pre_train_models`` are skipped entirely.
        transfer_learning_base: Optional DataFrame whose column count sets the
            output width of the base model; for models in
            ``pre_train_models`` the weights saved under the same
            repeat/fold checkpoint name are loaded before training.
        pseudo_labeling: Only affects which seed is used for fold creation.

    Returns:
        Tuple ``(oof, predictions)`` of dicts keyed by ``"{model_name}_{seed}"``
        holding the out-of-fold and test prediction DataFrames. The test
        frame stays all-zero when ``do_predict`` is False.

    Raises:
        ValueError: If a name in ``models`` is not a supported model type.
    """
    oof = {}
    predictions = {}

    # The float label matrix does not change between repeats or folds.
    target_values = target_.astype(float).values

    for seed in range(N_STARTS):
        model_name = models[seed % len(models)]

        if not do_predict and model_name not in pre_train_models:
            continue

        # Saved weights are only reused for models that were pre-trained.
        load_base = transfer_learning_base is not None and model_name in pre_train_models

        seed_result = pd.DataFrame(np.zeros(target_.shape))
        prediction = pd.DataFrame(np.zeros(ss.shape))

        # Each training phase derives its fold seed differently.
        if pseudo_labeling:
            kfold_seed = random_seed * 10 + seed
        elif do_predict:
            kfold_seed = random_seed + seed
        else:
            kfold_seed = seed

        fix_seed(kfold_seed)

        # Drop the fold assignment left over from a previous repeat or call.
        if "fold" in drug_.columns:
            drug_.drop(["fold"], axis=1, inplace=True)

        # LOCATE DRUGS
        vc = drug_.drug_id.value_counts()
        vc1 = vc.loc[(vc==6)|(vc==12)|(vc==18)].index.sort_values()
        vc2 = vc.loc[(vc!=6)&(vc!=12)&(vc!=18)].index.sort_values()

        dct1 = {}
        dct2 = {}

        # STRATIFY DRUGS 18X OR LESS
        # NOTE(review): presumably `sig_id` is non-numeric and this relies on
        # `mean()` dropping it -- confirm for the pandas version in use.
        skf = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)
        tmp = pd.concat([drug_, target_], axis=1).groupby('drug_id').mean().loc[vc1]
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp)):
            dd = {k:fold for k in tmp.index[idxV].values}
            dct1.update(dd)

        # STRATIFY DRUGS MORE THAN 18X
        # NOTE(review): the labels passed to split() here are the `drug_`
        # columns themselves, not the targets -- confirm this is intended.
        skf = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)
        tmp = drug_.loc[drug_.drug_id.isin(vc2)].reset_index(drop=True)
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp)):
            dd = {k:fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)

        # ASSIGN FOLDS
        drug_['fold'] = drug_.drug_id.map(dct1)
        drug_.loc[drug_.fold.isna(), 'fold'] = drug_.loc[drug_.fold.isna(), 'sig_id'].map(dct2)
        drug_.fold = drug_.fold.astype('int8')

        for n in range(N_SPLITS):
            tr = drug_[drug_["fold"] != n].index
            te = drug_[drug_["fold"] == n].index

            start_time = time()

            # Build Model
            if model_name == "ResNet":
                model = create_model_resnet(len(train_.columns), len(train_pca_.columns), len(target_.columns))

                if transfer_learning_base is not None:
                    model_base = create_model_resnet(
                        len(train_.columns), len(train_pca_.columns), len(transfer_learning_base.columns)
                    )

            elif model_name == "SimpleNN":
                model = create_model_simple_nn(len(train_.columns), len(target_.columns))

                if transfer_learning_base is not None:
                    model_base = create_model_simple_nn(
                        len(train_.columns), len(transfer_learning_base.columns)
                    )

            elif model_name == "TabNet":
                model = create_model_tabnet(kfold_seed)

            elif model_name == "NODE":
                model = create_model_node(len(target_.columns))

                if transfer_learning_base is not None:
                    model_base = create_model_node(
                        len(transfer_learning_base.columns)
                    )

            else:
                # Raising a bare string is itself a TypeError in Python 3.
                raise ValueError(f"Model name is invalid: {model_name}")

            # Build Data Sets
            y_tr, y_val = target_values[tr], target_values[te]

            if model_name == "ResNet":
                # ResNet takes two inputs: the features and the PCA frame.
                x_tr = [
                    train_.values[tr],
                    train_pca_.values[tr],
                ]
                x_val = [
                    train_.values[te],
                    train_pca_.values[te],
                ]
                x_tt = [test.values, test_pca.values]

            else:
                x_tr, x_val = train_.values[tr], train_.values[te]
                x_tt = test.values

            if model_name == "TabNet":
                checkpoint_path = f"{model_name}_repeat:{seed}_fold:{n}"
                # The archive is loaded from "<path>.zip", so that is the file
                # to clean up before saving again.
                checkpoint_file = checkpoint_path + ".zip"

                if load_base:
                    model.load_model(checkpoint_file)

                model.fit(
                    X_train=x_tr,
                    y_train=y_tr,
                    eval_set=[(x_val, y_val)],
                    eval_name=["val"],
                    eval_metric=["logits_ll"],
                    max_epochs=200,
                    patience=10,
                    batch_size=1024,
                    virtual_batch_size=32,
                    num_workers=1,
                    drop_last=False,
                    #loss_fn=F.binary_cross_entropy_with_logits,
                    loss_fn=SmoothBCEwLogits(smoothing=1e-6),
                )

                if SAVE_MODEL:
                    # Best effort: a missing stale archive is not an error.
                    try:
                        os.remove(checkpoint_file)
                    except OSError:
                        pass
                    model.save_model(checkpoint_path)

            else:
                model.compile(
                    optimizer=tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5),
                    #optimizer=AdaBeliefOptimizer(lr=1e-3, weight_decay=1e-5),
                    #loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True, label_smoothing=1e-6),
                    metrics=logloss,
                )

                checkpoint_path = f"{model_name}_repeat:{seed}_fold:{n}.hdf5"

                if load_base:
                    # Copy every layer except the output head, whose width
                    # differs between the base and the current target set.
                    model_base.load_weights(checkpoint_path)
                    for layer, base_layer in zip(model.layers, model_base.layers[:-1]):
                        layer.set_weights(base_layer.get_weights())

                callbacks = []
                if SAVE_MODEL:
                    callbacks.append(
                        ModelCheckpoint(
                            checkpoint_path,
                            monitor="val_loss",
                            verbose=0,
                            save_best_only=True,
                            save_weights_only=True,
                            mode="min",
                        )
                    )
                callbacks.append(
                    ReduceLROnPlateau(
                        monitor="val_loss", factor=0.1, patience=5, verbose=0, min_delta=1e-5, min_lr=1e-5, mode="min"
                    )
                )
                callbacks.append(
                    EarlyStopping(
                        monitor="val_loss",
                        patience=10,
                        mode="min",
                        verbose=0,
                        min_delta=1e-5,
                        restore_best_weights=True,
                    )
                )
                model.fit(
                    x_tr,
                    y_tr,
                    validation_data=(x_val, y_val),
                    epochs=200,
                    batch_size=128,
                    callbacks=callbacks,
                    verbose=0,
                )

            # Models emit logits; apply the sigmoid to get probabilities.
            val_predict = model.predict(x_val)
            val_predict = 1 / (1 + np.exp(-val_predict))
            seed_result.loc[te, :] += val_predict

            if do_predict:
                test_predict = model.predict(x_tt)
                test_predict = 1 / (1 + np.exp(-test_predict))
                prediction += test_predict / N_SPLITS

            if model_name == "TabNet":
                fold_score = np.min(model.history["val_logits_ll"])
            else:
                fold_score = metric(target_.loc[te].values, val_predict)

            print(
                f"[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {model_name}: Seed {seed}, Fold {n}:",
                fold_score,
            )

            # Release the model before the next fold builds a new one.
            K.clear_session()
            del model
            gc.collect()

        oof[f"{model_name}_{seed}"] = seed_result
        predictions[f"{model_name}_{seed}"] = prediction

    return oof, predictions

In [None]:
%%time

# Pre-train on `non_target`, using only the first non_target.shape[0] feature
# rows. The returned predictions are discarded: with the default
# do_predict=False this call matters only for the checkpoints `learning`
# saves.
PRE_TRAIN = True

if PRE_TRAIN:
    _, _ = learning(
        train_=train[: non_target.shape[0]],
        train_pca_=train_pca[: non_target.shape[0]],
        target_=non_target,
        drug_=non_target_drug,
        N_STARTS=N_STARTS,
        N_SPLITS=N_SPLITS,
    )

In [None]:
%%time

# Main training run on the scored targets: collects out-of-fold and test
# predictions, passing `non_target` as the transfer-learning base.
oof, predictions = learning(
    train_=train,
    train_pca_=train_pca,
    target_=target,
    drug_=target_drug,
    N_STARTS=N_STARTS,
    N_SPLITS=N_SPLITS,
    do_predict=True,
    transfer_learning_base=non_target,
    pseudo_labeling=False,
)

## Cross Validation

In [None]:
# One equal blending weight per repeat. The trailing 1.0 is the starting value
# of the Lagrange multiplier, used only by the optional optimization below.
initial_weights = [1.0 / N_STARTS] * N_STARTS + [1.0]
y_true = target.values[: non_target.shape[0]]

print("===== OOF CV =====")
for key, val in oof.items():
    print(f"OOF Key: {key}, CV: {metric(y_true, val.values[:len(y_true)])}")

# Group the out-of-fold frames by the model name their key starts with.
oof_by_model = {
    model: {name: frame for name, frame in oof.items() if name.startswith(model)}
    for model in models
}
# NOTE(review): `initial_weights[:-1]` has N_STARTS entries while each
# per-model dict holds only that model's repeats -- confirm that
# `cross_validation`/`blend` handle this as intended.
for model, oof_ in oof_by_model.items():
    print(f"\n===== Model {model} CV =====")
    cross_validation(y_true.shape, initial_weights[:-1], y_true, oof_)

print("\n===== Overall CV =====")
cross_validation(y_true.shape, initial_weights[:-1], y_true, oof)

optimize = False

if not optimize:
    optimized_weights = initial_weights

else:
    # https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0#Bonus-(Lagrange-Multiplier)

    def lagrange_func(params):
        """Return the blend metric minus the multiplier-weighted sum-to-one constraint."""
        weights, multiplier = params[:-1], params[-1]
        blended = blend(y_true.shape, weights, oof)
        return metric(y_true, blended) - multiplier * (sum(weights) - 1)

    grad_l = grad(lagrange_func)

    def lagrange_obj(params):
        """Return the system for fsolve: weight gradients plus the constraint residual."""
        gradient = grad_l(params).tolist()
        constraint = sum(params[:-1]) - 1
        return [*gradient[:-1], constraint]

    optimized_weights = fsolve(lagrange_obj, initial_weights)
    cross_validation(y_true.shape, optimized_weights[:-1], y_true, oof)

    print(f"Optimized weights: {optimized_weights[:-1]}")
    print(f"Check the sum of all weights: {sum(optimized_weights[:-1])}")

In [None]:
# Group the test predictions by the model name their key starts with.
predictions_by_model = {
    model: {name: frame for name, frame in predictions.items() if name.startswith(model)}
    for model in models
}

# One blended prediction frame per model.
blend_by_model = {
    model: pd.DataFrame(blend(ss.shape, initial_weights[:-1], predictions_by_model[model]))
    for model in models
}

if IN_COLAB:
    # Add the public-test pseudo labels as one more "model" to compare
    # against; its columns are renumbered 0..205.
    pub_test_pseudo_label = pub_ss.drop(columns="sig_id")
    pub_test_pseudo_label.columns = range(206)
    blend_by_model["pub_test"] = pub_test_pseudo_label

# Mean row-wise correlation for every pair of blended predictions.
for a, b in itertools.combinations(blend_by_model, 2):
    corr = blend_by_model[a].corrwith(blend_by_model[b], axis=1)
    print(f"Prediction correlation between {a} and {b}: {corr.mean()}")

# Pseudo Label

## Preparation

In [None]:
PESEUDO_LABELING = False

if PESEUDO_LABELING:
    # Start from the submission frame and fill the target columns with the
    # weighted blend of the test predictions.
    pseudo_label_df = submit_df.copy()
    pseudo_label_df.loc[:, target.columns] = blend(ss.shape, optimized_weights[:-1], predictions)

    # Keep only rows whose cp_type is not "ctl_vehicle", renumbered from 0.
    pseudo_label_df = pseudo_label_df.loc[test_df["cp_type"] != "ctl_vehicle"].reset_index(drop=True)

    # Attach drug information by sig_id; unmatched rows get a placeholder.
    pseudo_label_drug = (
        pseudo_label_df[["sig_id"]]
        .merge(drug, on="sig_id", how="left")
        .fillna("yyyyyyyyy")
    )

    # Remove the fold column left behind by the previous `learning` call.
    target_drug = target_drug.drop(columns=["fold"])

    del pseudo_label_df["sig_id"]

    # Print the shape of each frame, one per line, as a sanity check.
    print(
        *(
            frame.shape
            for frame in (
                train,
                test,
                train_pca,
                test_pca,
                target,
                pseudo_label_df,
                target_drug,
                pseudo_label_drug,
            )
        ),
        sep="\n",
    )

    # NOTE(review): a bare expression nested inside `if` is presumably not
    # echoed by the notebook -- confirm whether display() was intended.
    pseudo_label_df

## Training

In [None]:
if PESEUDO_LABELING:
    # Retrain on train + test rows, using the blended test predictions as
    # labels for the test part. `target` serves as the transfer-learning base.
    oof, predictions = learning(
        train_=pd.concat([train, test], ignore_index=True),
        train_pca_=pd.concat([train_pca, test_pca], ignore_index=True),
        target_=pd.concat([target, pseudo_label_df], ignore_index=True),
        drug_=pd.concat([target_drug, pseudo_label_drug], ignore_index=True),
        N_STARTS=N_STARTS,
        N_SPLITS=N_SPLITS,
        do_predict=True,
        transfer_learning_base=target,
        pseudo_labeling=True,
    )

## Cross Validation

In [None]:
if PESEUDO_LABELING:
    # Score only the leading len(y_true) rows of each out-of-fold frame, i.e.
    # the rows that `y_true` provides labels for.
    for key, val in oof.items():
        print(f"OOF Key: {key}, CV: {metric(y_true, val.values[:len(y_true)])}")

    # Equal-weight blend of all repeats.
    cross_validation(y_true.shape, initial_weights[:-1], y_true, oof)

# Postprocessing

In [None]:
# Weighted blend: combine all test predictions using `optimized_weights`
# (its last entry, the Lagrange-multiplier slot, is excluded) and write the
# result into the target columns of the submission frame.
submit_df.loc[:, target.columns] = blend(ss.shape, optimized_weights[:-1], predictions)

In [None]:
# Clipping
# submit_df.loc[:, target.columns] = submit_df.loc[:, target.columns].clip(1e-7, 1 - 1e-7)

In [None]:
# Force every target to 0 for test rows whose cp_type is "ctl_vehicle".
submit_df.loc[test_df["cp_type"] == "ctl_vehicle", target.columns] = 0

# Output

In [None]:
# Write the submission file without the DataFrame index column.
submit_df.to_csv("submission.csv", index=False)