# Strategy

- Preprocessing
    - Remove ctrl_vehicle
    - RankGauss
    - PCA + Existing Features
    - KMeans
    - Basic stats
- Model
    - Multi head ResNet (tensorflow)
    - TabNet (pytorch)
    - NODE - Neural Oblivious Decision Ensembles (tensorflow)
- Training
    - Pre-train with scored and non-scored target
    - Train with public test pseudo label
    - Add swap noise for train data
    - Optimizer: Adam/AdamW with weight_decay
    - Loss: BCE with Label smoothing + Logits
- Prediction
    - Ensemble above with weight optimization

# Setup

## Variables

In [None]:
import sys

# True when the `google.colab` module is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

### Datasets

In [None]:
# How rows whose cp_type is "ctl_vehicle" are treated during preprocessing.
CTRL_VEHICLE = "remove"  # "use", "keep", "remove"
# use: Use cp_type column for training.
# keep: Remove cp_type column, but keep ctl_vehicle rows.
# remove: Remove cp_type column and ctl_vehicle rows.

In [None]:
# When True, the public test features and their pseudo labels are appended to train/target.
USE_PUBLIC_TEST_PSEUDO_LABEL = True

### Pre training

In [None]:
# Source of the pre-trained models.
PRE_TRAIN_MODEL = "load-others"  # "in-notebook", "load-others", "no"
# in-notebook: Pre-train in this notebook.
# load-others: Load pre-train model that is trained in other notebook.
# no: Disable pre-training.

# Input directories for externally pre-trained models, one per model family.
# NOTE(review): how each directory is consumed is not visible in this section;
# the *_NO_FIT directory is presumably for entries with "fit": False — confirm.
PRE_TRAIN_MODEL_DIR_RESNET = "../input/pretrain-with-non-scored-target-baseline"
PRE_TRAIN_MODEL_DIR_TABNET = "../input/pretrain-tabnet"
PRE_TRAIN_MODEL_DIR_NODE = "../input/pretrain-node"
PRE_TRAIN_MODEL_DIR_NO_FIT = "../input/ensemble-baseline-pre-training"

### Hyper parameter tuning

In [None]:
# Master switch for the tuning mode (optuna is imported only when this is True).
HYPER_PARAMETER_TUNING = False

if HYPER_PARAMETER_TUNING:
    # Which model family is tuned.
    TUNING_RESNET = False
    TUNING_TABNET = False
    TUNING_NODE = True

    # Tuning overrides the settings above: no pseudo labels, pre-training done in this notebook.
    USE_PUBLIC_TEST_PSEUDO_LABEL = False
    PRE_TRAIN_MODEL = "in-notebook"

### Model & KFold

In [None]:
# Fold-assignment rule used when drug ids take part in the split.
# NOTE(review): `vc` is not defined in this section — presumably a per-drug value count;
# confirm where the folds are built.
DRUG_KFOLD = "hard"  # "hard", "soft"
# hard: [vc <= 19]
# soft: [(vc == 6) | (vc == 12) | (vc == 18)]

In [None]:
# Ensemble members. Each entry names the model family, the CV scheme and a "fit" flag.
# NOTE(review): "fit": False presumably means "load saved weights instead of training" — confirm.
models = [
    #{"model_name": "ResNet", "cv": "with_drug_id", "fit": True},
    {"model_name": "ResNet", "cv": "without_drug_id", "fit": True},
    #{"model_name": "TabNet", "cv": "with_drug_id", "fit": True},
    {"model_name": "TabNet", "cv": "without_drug_id", "fit": True},
    #{"model_name": "NODE", "cv": "with_drug_id", "fit": True},
    {"model_name": "NODE", "cv": "without_drug_id", "fit": True},

    #{"model_name": "ResNet", "cv": "with_drug_id", "fit": False},
    #{"model_name": "ResNet", "cv": "without_drug_id", "fit": False},
    #{"model_name": "TabNet", "cv": "with_drug_id", "fit": False},
    #{"model_name": "TabNet", "cv": "without_drug_id", "fit": False},
]

N_SEED = 2
# One start per (model entry, seed) pair.
N_STARTS = len(models) * N_SEED
N_SPLITS = 7
pre_train_models = ["ResNet", "TabNet", "NODE"]

# On Colab the drug-id CV variants are used instead, with more seeds and fewer splits.
if IN_COLAB:
    models = [
        {"model_name": "ResNet", "cv": "with_drug_id", "fit": True},
        #{"model_name": "ResNet", "cv": "without_drug_id", "fit": True},
        {"model_name": "TabNet", "cv": "with_drug_id", "fit": True},
        #{"model_name": "TabNet", "cv": "without_drug_id", "fit": True},
        {"model_name": "NODE", "cv": "with_drug_id", "fit": True},
        #{"model_name": "NODE", "cv": "without_drug_id", "fit": True},

        #{"model_name": "ResNet", "cv": "with_drug_id", "fit": False},
        #{"model_name": "ResNet", "cv": "without_drug_id", "fit": False},
        #{"model_name": "TabNet", "cv": "with_drug_id", "fit": False},
        #{"model_name": "TabNet", "cv": "without_drug_id", "fit": False},
    ]

    N_SEED = 3
    N_STARTS = len(models) * N_SEED
    N_SPLITS = 5

### Training

In [None]:
# NOTE(review): presumably controls whether trained weights are written to disk — the
# consumer of this flag is outside this section.
SAVE_WEIGHT = False

### Cross Validation

In [None]:
# Strategy for choosing the ensemble blend weights (options listed on the Colab override below).
optimize = "average"
# NOTE(review): four weights are listed while `models` has three active entries —
# check the length before switching `optimize` to "fixed".
fixed_weight = [0.3, 0.2, 0.3, 0.2]  # if fixed

if IN_COLAB:
    optimize = "lagrange"  # "lagrange", "fixed", "average"

## for Google Colab

In [None]:
# Kaggle identifiers consumed by the Colab setup cell below.
# Competition slug; also used as the working directory name under /content.
COMPETE = "lish-moa"
# Datasets downloaded with `kaggle datasets download`.
DATASETS = [
    "imokuri/moapublictestpredictions",
    "optimo/pytorchtabnetpretraining",
    "rahulsd91/moapublictest",
    "tolgadincer/autograd",
    "yasufuminakama/iterative-stratification",
]
# Notebooks whose outputs are fetched with `kaggle kernels output`.
KERNEL_OUTPUTS = [
    "imokuri/pretrain-with-non-scored-target-baseline",
    "imokuri/pretrain-tabnet",
    "imokuri/pretrain-node",
]
# Extra pip packages installed on Colab.
PACKAGES = ["optuna"]

In [None]:
# Colab-only bootstrap: reinstall the Kaggle CLI, download the competition files, the extra
# datasets and the kernel outputs under /content/{COMPETE}/input, install packages, and
# finally change into /content/{COMPETE}/output so the relative "../input/..." paths resolve.
if IN_COLAB:
    !python2 -m pip uninstall kaggle -y
    !python3 -m pip uninstall kaggle -y
    !python3 -m pip install -U -q kaggle

    !pip install -q -U git+https://github.com/IMOKURI/kaggle_on_google_colab.git

    from kaggle_on_google_colab import setup
    kaggle = setup.Setup()
    kaggle.dirs(COMPETE)

    # Competition archive -> input/{COMPETE}
    !kaggle competitions download -p /content/zip {COMPETE}
    !unzip -q -n /content/zip/{COMPETE}.zip -d /content/{COMPETE}/input/{COMPETE}
    #for line in setup.exec_get_lines(cmd=f"kaggle competitions files --csv {COMPETE} | egrep -v \"Warning: Looks like you're using an outdated API Version|name,size,creationDate\" | cut -d , -f 1"):
    #    !unzip -q -n /content/zip/{line.decode().strip()}.zip -d /content/{COMPETE}/input/{COMPETE}

    # Each dataset is unpacked into input/<dataset name> (the part after the owner slash).
    for dataset in DATASETS:
        dataset_name = dataset.split("/")[-1]

        !kaggle datasets download -p /content/zip {dataset}
        !unzip -q -n /content/zip/{dataset_name}.zip -d /content/{COMPETE}/input/{dataset_name}

    # Kernel outputs go to input/<kernel name>.
    for kernel in KERNEL_OUTPUTS:
        kernel_name = kernel.split("/")[-1]

        !kaggle kernels output -p /content/{COMPETE}/input/{kernel_name} {kernel}

    for package_ in PACKAGES:
        !pip install -q {package_}

    !pip install -q -U tensorflow-addons
    #!mv /content/zip/train_drug.csv /content/{COMPETE}/input/{COMPETE}/

    %cd /content/{COMPETE}/output


## Library

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import sys

sys.path.append("../input/iterative-stratification/iterative-stratification-master")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

sys.path.append("../input/autograd")
import autograd.numpy as np
from autograd import grad

#sys.path.append("../input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1/pytorch_tabnet-2.0.1")
!pip install -q ../input/pytorchtabnetpretraining/pytorch_tabnet-2.0.1-py3-none-any.whl
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
import datetime
import gc
import json
import io
import itertools
import os
import random
import zipfile
from collections import defaultdict
from time import time
from typing import Optional

# import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.regularizers as R
import tensorflow_addons as tfa
import torch
import torch.nn.functional as F
import torch.optim as optim
from scipy.optimize import fsolve, minimize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow_probability import distributions as tfp_distributions
from tensorflow_probability import stats as tfp_stats
from torch import nn
from torch.nn.modules.loss import _WeightedLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau as torch_ReduceLROnPlateau

In [None]:
if HYPER_PARAMETER_TUNING:
    import optuna

In [None]:
if IN_COLAB:
    from IPython.display import SVG, display_svg
    from tensorflow.keras.utils import model_to_dot

In [None]:
# Optional TensorFlow speed-ups: mixed-precision policy and XLA JIT compilation.
MIXED_PRECISION = False
XLA_ACCELERATE = True

if MIXED_PRECISION:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision

    # No earlier cell assigns `tpu`, so referencing it directly raised NameError as soon as
    # MIXED_PRECISION was switched on. Look it up defensively: a truthy `tpu` global (if some
    # cell defines one) selects bfloat16, otherwise the float16 policy is used.
    if globals().get("tpu"):
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_bfloat16")
    else:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
    mixed_precision.set_policy(policy)
    print("Mixed precision enabled")

if XLA_ACCELERATE:
    tf.config.optimizer.set_jit(True)
    print("Accelerated Linear Algebra enabled")

Accelerated Linear Algebra enabled


In [None]:
# Set CUDA_LAUNCH_BLOCKING=1 for this process (CUDA documents this as forcing synchronous
# kernel launches, which makes a failing kernel surface at the call that issued it).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Functions

In [None]:
def fix_seed(seed=2020):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED and seeds Python's `random`, NumPy, TensorFlow and
    PyTorch. When CUDA is available the CUDA generators are seeded as well and
    cuDNN is switched to deterministic, non-benchmark mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (random.seed, np.random.seed, tf.random.set_seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Notebook-wide seed; also passed as `random_state` to the preprocessing estimators below.
random_seed = 2222
fix_seed(random_seed)

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of `df` in place to the narrowest dtype that holds them.

    Object columns become ``category``. Columns whose dtype name starts with
    ``int`` are cast to the first of int8/int16/int32/int64 whose limits
    strictly contain the column's min and max. Every other column is cast to
    float16 or float32 under the same strict-bounds rule, else to float64.
    Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    bytes_per_mb = 1024 ** 2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith("int"):
            # An integer column that fits none of the candidates is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this also covers NaN min/max): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

## Data

In [None]:
# https://www.kaggle.com/markpeng/deepinsight-efficientnet-b3-noisystudent/comments#1075013

def add_swap_noise(index, X, train_, swap_prob=0.15, swap_portion=0.1):
    """Apply swap noise: overwrite parts of some rows of `X` with values from `train_`.

    For each of the first ``len(index)`` rows of `X`, with probability
    `swap_prob` a random donor row of `train_` is drawn and a random set of
    columns is copied from the donor into that row. Columns 0 and 1 are never
    swapped.

    Args:
        index: Only its length is used; it sets how many leading rows of `X`
            are candidates for noise.
        X: 2-D array to perturb. It is modified in place.
        train_: 2-D array that supplies donor rows; column positions are shared with `X`.
        swap_prob: Probability that a given row receives noise.
        swap_portion: Fraction of ``train_.shape[1]`` columns swapped in a noisy
            row, capped at the number of swappable columns.

    Returns:
        `X`, the same object that was passed in.
    """
    n_columns = train_.shape[1]
    # Every column from position 2 onward is eligible, whatever it holds.
    # NOTE(review): the first two are presumably cp_time / cp_dose — confirm against the caller.
    candidates = np.arange(2, n_columns)
    # Sampling without replacement fails when asked for more columns than exist, so clamp.
    n_swap = min(int(n_columns * swap_portion), len(candidates))

    for row in range(len(index)):
        if np.random.rand() < swap_prob:
            donor = np.random.randint(train_.shape[0], size=1)[0]
            swap_features = np.random.choice(candidates, size=n_swap, replace=False)
            X[row, swap_features] = train_[donor, swap_features]

    return X

## Metrics

In [None]:
# Evaluation metrics: log loss with a 1e-15 epsilon inside each log (the TensorFlow and PyTorch versions apply a sigmoid to the predictions first)

## for tensorflow
def logloss(y_true, y_pred):
    """Mean binary cross-entropy on raw outputs, written with Keras backend ops.

    `y_pred` is pushed through a sigmoid first; 1e-15 is added inside each log
    so that a probability of exactly 0 or 1 does not produce log(0).
    """
    eps = 1e-15
    prob = 1 / (1 + K.exp(-y_pred))
    pos_term = y_true * K.log(prob + eps)
    neg_term = (1 - y_true) * K.log(1 - prob + eps)
    return K.mean(-(neg_term + pos_term))


## for pytorch
class LogitsLogLoss(Metric):
    """Log loss on raw scores, packaged as a pytorch_tabnet `Metric` subclass.

    The metric is registered under the name "logits_ll" and flagged with
    `_maximize = False`.
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Return the mean binary cross-entropy of sigmoid(`y_pred`) against `y_true`."""
        eps = 1e-15
        prob = 1 / (1 + np.exp(-y_pred))
        pos_term = y_true * np.log(prob + eps)
        neg_term = (1 - y_true) * np.log(1 - prob + eps)
        return np.mean(-(neg_term + pos_term))


## for overall
## [Fast Numpy Log Loss] https://www.kaggle.com/gogo827jz/optimise-blending-weights-4-5x-faster-log-loss
def metric(y_true, y_pred):
    """Log loss averaged over target columns.

    Both arguments are 2-D arrays indexed as ``[:, column]``; `y_pred` holds
    probabilities (no sigmoid is applied here). Each column's mean log loss is
    computed with a 1e-15 epsilon inside the logs, and the column losses are
    then averaged.
    """
    eps = 1e-15
    n_targets = y_pred.shape[1]

    def column_loss(j):
        truth, prob = y_true[:, j], y_pred[:, j]
        return -np.mean(truth * np.log(prob + eps) + (1 - truth) * np.log(1 - prob + eps))

    total = 0
    for j in range(n_targets):
        total += column_loss(j)
    return total / n_targets

## Loss functions

In [None]:
# https://www.kaggle.com/felipebihaiek/torch-continued-from-auxiliary-targets-smoothing
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: Optional rescaling weight passed to
            `binary_cross_entropy_with_logits`.
        reduction: "mean", "sum" or "none"; passed through to
            `binary_cross_entropy_with_logits`.
        smoothing: Smoothing strength in ``[0, 1)``; 0 disables smoothing.
    """

    def __init__(self, weight=None, reduction="mean", smoothing=0.0):
        # The parent constructor already stores `weight` and `reduction` on the module.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return smoothed targets; `n_labels` is accepted but not used."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE between logits `inputs` and `targets`."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing)
        # Hand the reduction to the functional call. Previously the call used its default
        # mean reduction, so "sum" and "none" both silently returned the mean.
        return F.binary_cross_entropy_with_logits(
            inputs, targets, self.weight, reduction=self.reduction
        )

## Cross Validation

In [None]:
# Blend oof predictions
def blend(size, weights, oof):
    """Return the weighted sum of the prediction frames in `oof`.

    Args:
        size: Shape ``(rows, cols)`` of the result.
        weights: Sequence of weights, matched to the entries of `oof` by
            iteration order.
        oof: Mapping of name -> DataFrame of predictions. Each frame is cropped
            to the first `rows` rows and `cols` columns before it is added.

    Returns:
        Array of shape `size` holding the blended predictions.
    """
    blend_ = np.zeros(size)
    n_rows, n_cols = blend_.shape[0], blend_.shape[1]
    for position, key in enumerate(oof):
        cropped = oof[key].values[:n_rows, :n_cols]
        blend_ += weights[position] * cropped
    return blend_

In [None]:
def cross_validation(size, weight, y_true, oof):
    """Score a weighted blend of out-of-fold predictions.

    Args:
        size: Shape-like value; only ``size[0]`` (the number of rows to score) is used.
        weight: Blend weights, forwarded to `blend`.
        y_true: 2-D array of true labels; the first ``size[0]`` rows are scored.
        oof: Mapping of name -> DataFrame of predictions, forwarded to `blend`.

    Returns:
        Tuple ``(CV, AUC, blended)``: the log loss from `metric`, the mean
        per-column ROC AUC, and the blended predictions as a DataFrame.
    """
    n_rows = size[0]
    truth = y_true[:n_rows]
    blend_ = blend(truth.shape, weight, oof)

    aucs = [
        roc_auc_score(y_true=truth[:, task_id], y_score=blend_[:, task_id])
        for task_id in range(blend_.shape[1])
    ]

    CV = metric(truth, blend_)
    AUC = np.mean(aucs)
    print(f"Blended CV: {CV}, AUC : {AUC}")

    return CV, AUC, pd.DataFrame(blend_)

# Load Data

In [None]:
# Competition inputs: features, scored / non-scored targets, sample submission and drug ids.
train_df = pd.read_csv("../input/lish-moa/train_features.csv")
test_df = pd.read_csv("../input/lish-moa/test_features.csv")
target_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
non_target_df = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
submit_df = pd.read_csv("../input/lish-moa/sample_submission.csv")
drug_df = pd.read_csv("../input/lish-moa/train_drug.csv")

# Public test features and a saved submission; the submission serves as pseudo labels
# when USE_PUBLIC_TEST_PSEUDO_LABEL is enabled.
pub_test_df = pd.read_csv("../input/moapublictest/test_features.csv")
# pub_submit_df = pd.read_csv("../input/moapublictestpredictions/submission-blendblendblend.csv")
pub_submit_df = pd.read_csv("../input/moapublictestpredictions/submission-v65-old_best_lb.csv")

In [None]:
# Work on copies so the *_df frames stay exactly as loaded.
train = train_df.copy()
test = test_df.copy()
target = target_df.copy()
non_target = non_target_df.copy()
ss = submit_df.copy()
drug = drug_df.copy()

pub_test = pub_test_df.copy()
pub_ss = pub_submit_df.copy()

## Use public test data for training

In [None]:
# Merge public test data (and pseudo label) into train data
# Merge public test data (and pseudo label) into train data.
# The pseudo-labelled rows are appended after the original rows, so the first
# train_df.shape[0] rows of `train` / `target` are still the real training set.
if USE_PUBLIC_TEST_PSEUDO_LABEL:
    train = pd.concat([train, pub_test]).reset_index(drop=True)
    target = pd.concat([target, pub_ss]).reset_index(drop=True)

# This is used for CV with ctrl_vehicle.
train_pub_test = train.copy()

In [None]:
target

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,id_000779bfc,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,id_000a6266a,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_0015fd391,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,id_001626bd3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27791,id_ff7004b87,0.001187,0.001471,0.001790,0.003865,0.006655,0.003512,0.001677,0.003756,0.001046,...,0.001112,0.004794,0.002926,0.211817,0.011809,0.001845,0.006745,0.002509,0.001243,0.003638
27792,id_ff925dd0d,0.006804,0.004367,0.001244,0.007929,0.025647,0.006798,0.004036,0.004191,0.001031,...,0.000985,0.001218,0.002999,0.002630,0.001974,0.001638,0.001790,0.002058,0.000697,0.001785
27793,id_ffb710450,0.002167,0.001374,0.001716,0.014372,0.040319,0.005958,0.004267,0.003298,0.000597,...,0.001039,0.001140,0.003067,0.001776,0.001588,0.001091,0.001032,0.001740,0.000927,0.001551
27794,id_ffbb869f2,0.002340,0.001462,0.001797,0.022218,0.021918,0.005618,0.008658,0.002461,0.001359,...,0.000921,0.000901,0.002803,0.001641,0.001624,0.000881,0.001040,0.002210,0.000746,0.003842


# Preprocessing

In [None]:
# Encode the dose label as an integer code: D1 -> 0, D2 -> 1.
train.loc[:, "cp_dose"] = train.loc[:, "cp_dose"].map({"D1": 0, "D2": 1})
test.loc[:, "cp_dose"] = test.loc[:, "cp_dose"].map({"D1": 0, "D2": 1})

In [None]:
# Encode the treatment time as an ordinal code: 24 -> 0, 48 -> 1, 72 -> 2.
train.loc[:, "cp_time"] = train.loc[:, "cp_time"].map({24: 0, 48: 1, 72: 2})
test.loc[:, "cp_time"] = test.loc[:, "cp_time"].map({24: 0, 48: 1, 72: 2})

## Remove ctrl_vehicle



In [None]:
# Apply the CTRL_VEHICLE policy.
if CTRL_VEHICLE == "use":
    # Keep cp_type as a binary feature.
    train.loc[:, "cp_type"] = train.loc[:, "cp_type"].map({"ctl_vehicle": 0, "trt_cp": 1})
    test.loc[:, "cp_type"] = test.loc[:, "cp_type"].map({"ctl_vehicle": 0, "trt_cp": 1})

else:
    if CTRL_VEHICLE == "remove":
        # Filter the targets first, while `train` still has its cp_type column and original
        # row order. `non_target` has no pseudo-label rows, so its mask only uses the first
        # train_df.shape[0] rows of `train`.
        target = target.loc[train["cp_type"] != "ctl_vehicle"].reset_index(drop=True)
        non_target = non_target.loc[train[: train_df.shape[0]]["cp_type"] != "ctl_vehicle"].reset_index(drop=True)

        train = train.loc[train["cp_type"] != "ctl_vehicle"].reset_index(drop=True)

    # Test rows are never dropped; only the column is removed.
    train = train.drop("cp_type", axis=1)
    test = test.drop("cp_type", axis=1)

## Merge drug_id into training data

https://www.kaggle.com/c/lish-moa/discussion/195195

In [None]:
# Attach drug_id to every row via a left join on sig_id; the row order of target / non_target is kept.
target_drug = pd.DataFrame(target.loc[:, "sig_id"]).merge(drug, on="sig_id", how="left")
non_target_drug = pd.DataFrame(non_target.loc[:, "sig_id"]).merge(drug, on="sig_id", how="left")

In [None]:
# Rows with no match in `drug` (e.g. the pseudo-labelled public test rows) get a placeholder id.
target_drug = target_drug.fillna("xxxxxxxxx")
non_target_drug = non_target_drug.fillna("xxxxxxxxx")

In [None]:
target_drug

Unnamed: 0,sig_id,drug_id
0,id_000644bb2,b68db1d53
1,id_000779bfc,df89a8e5a
2,id_000a6266a,18bb41b2c
3,id_0015fd391,8c7f86626
4,id_001626bd3,7cbed3131
...,...,...
25567,id_ff7004b87,xxxxxxxxx
25568,id_ff925dd0d,xxxxxxxxx
25569,id_ffb710450,xxxxxxxxx
25570,id_ffbb869f2,xxxxxxxxx


## Remove sig_id

In [None]:
# Drop the identifier column from every frame.
del train["sig_id"]
del target["sig_id"]
del non_target["sig_id"]
del test["sig_id"]
del ss["sig_id"]

In [None]:
train

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,-0.0326,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,2,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,0.3372,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,1,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,0.2155,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,1,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,0.1792,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,2,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,-0.1498,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,0.4571,-0.5743,3.3930,-0.6202,0.8557,1.6240,0.0640,-0.6316,...,-1.1790,-0.6422,-0.4367,0.0159,-0.6539,-0.4791,-1.2680,-1.1280,-0.4167,-0.6600
25568,0,0,-0.5885,-0.2548,2.5850,0.3456,0.4401,0.3107,-0.7437,-0.0143,...,0.0210,0.5780,-0.5888,0.8057,0.9312,1.2730,0.2614,-0.2790,-0.0131,-0.0934
25569,2,0,-0.3985,-0.1554,0.2677,-0.6813,0.0152,0.4791,-0.0166,0.7501,...,0.4418,0.9153,-0.1862,0.4049,0.9568,0.4666,0.0461,0.5888,-0.4205,-0.1504
25570,1,1,-1.0960,-1.7750,-0.3977,1.0160,-1.3350,-0.2207,-0.3611,-1.3020,...,0.3079,-0.4473,-0.8192,0.7785,0.3133,0.1286,-0.2618,0.5074,0.7430,-0.0484


In [None]:
# Sanity check: train and target must have the same number of rows, as must test and ss.
print(train.shape)
print(target.shape)
print(non_target.shape)

print(test.shape)
print(ss.shape)

(25572, 874)
(25572, 206)
(21948, 402)
(3982, 874)
(3982, 206)


## Rank Gauss

https://www.kaggle.com/nayuts/moa-pytorch-nn-pca-rankgauss



In [None]:
# Column groups are taken from the raw training frame: "g-*" and "c-*" columns.
g_cols = [col for col in train_df.columns if col.startswith("g-")]
c_cols = [col for col in train_df.columns if col.startswith("c-")]

# RankGauss: map each feature to a normal distribution with a per-column quantile transform.
for col in g_cols + c_cols:
    transformer = QuantileTransformer(n_quantiles=100, random_state=random_seed, output_distribution="normal")

    vec_len = len(train[col].values)
    vec_len_test = len(test[col].values)

    # The transformer is fitted on the training column only, then applied to both frames.
    raw_vec = train[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    # transform() returns an (n, 1) array; reshape(1, n)[0] flattens it back to a 1-D column.
    train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test[col] = transformer.transform(test[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

In [None]:
train

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0,0,1.124260,0.896698,-0.436214,-0.965311,-0.287443,-1.016437,-1.360774,-0.045876,...,0.428869,0.384250,1.300482,0.879422,-0.206096,1.046155,-0.479268,0.339234,0.583214,0.696712
1,2,0,0.117451,0.667759,0.260124,0.097531,1.204172,0.692876,0.356691,0.559630,...,-0.499745,1.147297,0.728062,0.089253,0.453665,0.770909,0.226300,0.202945,0.955497,1.219730
2,1,0,0.777229,0.935347,1.414044,-0.113563,-0.025489,1.494313,0.277364,0.357917,...,-0.800373,-0.721883,0.960080,0.088259,-1.182700,-0.358059,-0.732238,-0.253014,-1.085791,1.140342
3,1,0,-0.749489,-0.299404,-0.459100,0.774708,2.344556,-0.856449,-2.323390,0.298781,...,-1.391931,-0.736149,-1.612415,-1.219207,-0.912980,-1.194806,-1.288428,-0.950502,-0.445204,-0.884754
4,2,1,-0.460555,-0.508226,0.959313,0.984009,1.451890,-0.867329,-0.342599,-0.234770,...,0.038727,0.021330,1.056779,1.734597,0.843756,-0.341198,0.169668,0.451146,-0.434772,1.174162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,0.599819,-0.723658,2.333805,-0.964385,1.075072,1.786462,0.131113,-0.812649,...,-1.126325,-0.734273,-0.505298,0.079622,-0.730794,-0.547424,-1.172338,-1.131847,-0.476173,-0.740089
25568,0,0,-0.865375,-0.307222,1.988666,0.523322,0.644383,0.437283,-0.989775,-0.015526,...,0.062985,0.868007,-0.660578,1.300272,1.418983,2.041320,0.385704,-0.325379,0.004373,-0.059129
25569,2,0,-0.575003,-0.170529,0.221620,-1.053665,0.049099,0.641676,0.016354,1.208632,...,0.670012,1.400519,-0.220545,0.641140,1.461002,0.734826,0.081096,0.924382,-0.479982,-0.136455
25570,1,1,-1.635710,-1.976022,-0.636890,1.330359,-1.719197,-0.259467,-0.455103,-1.308468,...,0.463639,-0.527331,-0.857951,1.256633,0.481844,0.236350,-0.320390,0.796356,1.191530,-0.001347


## PCA features (+ Existing features)



In [None]:
# g-
# Number of principal components kept for the g- features.
n_comp = 50

# PCA is fitted on train and test stacked together; the result is split back by position
# (train rows first, test rows last).
data = pd.concat([pd.DataFrame(train[g_cols]), pd.DataFrame(test[g_cols])])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data[g_cols])
train2 = data2[: train.shape[0]]
test2 = data2[-test.shape[0] :]

train2 = pd.DataFrame(train2, columns=[f"pca_G-{i}" for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f"pca_G-{i}" for i in range(n_comp)])

# Append the components to the existing features.
train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [None]:
# c-
# Number of principal components kept for the c- features.
n_comp = 15

# Same procedure as for the g- features: fit on train + test stacked, split by position.
data = pd.concat([pd.DataFrame(train[c_cols]), pd.DataFrame(test[c_cols])])
data2 = PCA(n_components=n_comp, random_state=random_seed).fit_transform(data[c_cols])
train2 = data2[: train.shape[0]]
test2 = data2[-test.shape[0] :]

train2 = pd.DataFrame(train2, columns=[f"pca_C-{i}" for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f"pca_C-{i}" for i in range(n_comp)])

# Append the components to the existing features.
train = pd.concat((train, train2), axis=1)
test = pd.concat((test, test2), axis=1)

In [None]:
train

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,pca_C-5,pca_C-6,pca_C-7,pca_C-8,pca_C-9,pca_C-10,pca_C-11,pca_C-12,pca_C-13,pca_C-14
0,0,0,1.124260,0.896698,-0.436214,-0.965311,-0.287443,-1.016437,-1.360774,-0.045876,...,0.984003,-0.450318,0.336426,0.015646,0.244094,-0.336008,0.062116,-0.398600,-0.044412,-1.430930
1,2,0,0.117451,0.667759,0.260124,0.097531,1.204172,0.692876,0.356691,0.559630,...,-0.505982,0.099885,0.527028,1.249065,-0.497161,-1.017310,0.180813,-0.877796,1.173441,-0.533822
2,1,0,0.777229,0.935347,1.414044,-0.113563,-0.025489,1.494313,0.277364,0.357917,...,0.972868,0.623104,0.151891,0.986884,-0.530927,-0.623886,-0.024449,-0.122968,-0.843806,0.147631
3,1,0,-0.749489,-0.299404,-0.459100,0.774708,2.344556,-0.856449,-2.323390,0.298781,...,0.968908,-0.512865,0.940177,0.797558,0.696072,-2.188802,-0.587456,-1.290193,0.755513,0.681749
4,2,1,-0.460555,-0.508226,0.959313,0.984009,1.451890,-0.867329,-0.342599,-0.234770,...,-0.408597,0.222136,-0.328591,0.115461,-0.613891,-0.266831,1.858980,0.385555,0.229644,0.888441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,0.599819,-0.723658,2.333805,-0.964385,1.075072,1.786462,0.131113,-0.812649,...,1.074865,-0.015002,0.154258,-0.425031,0.060053,-0.950791,0.272566,-0.709891,0.183526,-0.035548
25568,0,0,-0.865375,-0.307222,1.988666,0.523322,0.644383,0.437283,-0.989775,-0.015526,...,-1.420880,-0.435511,1.148459,0.378976,-1.053755,-1.137182,-0.378525,0.875301,2.879233,-2.048089
25569,2,0,-0.575003,-0.170529,0.221620,-1.053665,0.049099,0.641676,0.016354,1.208632,...,0.902644,-1.134607,0.028057,-0.469159,-1.043663,0.210421,-0.909664,-0.352517,-0.612421,-0.247028
25570,1,1,-1.635710,-1.976022,-0.636890,1.330359,-1.719197,-0.259467,-0.455103,-1.308468,...,0.193346,-0.775846,-1.107324,0.405084,0.886731,-0.560762,-0.039118,0.349216,-0.017659,-0.805854


In [None]:
# PCA-only views: copies of train/test with the raw g- and c- columns removed, leaving
# cp_time, cp_dose and the pca_* components.
train_pca = train.copy()
test_pca = test.copy()

train_pca.drop(g_cols, axis=1, inplace=True)
test_pca.drop(g_cols, axis=1, inplace=True)

train_pca.drop(c_cols, axis=1, inplace=True)
test_pca.drop(c_cols, axis=1, inplace=True)

In [None]:
train_pca

Unnamed: 0,cp_time,cp_dose,pca_G-0,pca_G-1,pca_G-2,pca_G-3,pca_G-4,pca_G-5,pca_G-6,pca_G-7,...,pca_C-5,pca_C-6,pca_C-7,pca_C-8,pca_C-9,pca_C-10,pca_C-11,pca_C-12,pca_C-13,pca_C-14
0,0,0,-5.778899,6.154613,8.561315,-7.442511,4.386002,1.258147,3.520685,1.828526,...,0.984003,-0.450318,0.336426,0.015646,0.244094,-0.336008,0.062116,-0.398600,-0.044412,-1.430930
1,2,0,-5.035246,1.003536,-12.642795,4.682019,0.934481,0.017921,0.817860,-1.085419,...,-0.505982,0.099885,0.527028,1.249065,-0.497161,-1.017310,0.180813,-0.877796,1.173441,-0.533822
2,1,0,0.849837,-8.534120,-2.961085,0.234691,0.712903,3.226471,-1.540530,3.543482,...,0.972868,0.623104,0.151891,0.986884,-0.530927,-0.623886,-0.024449,-0.122968,-0.843806,0.147631
3,1,0,11.053726,-10.088315,-0.812731,-4.941979,-7.323094,-2.490876,-2.273711,6.357738,...,0.968908,-0.512865,0.940177,0.797558,0.696072,-2.188802,-0.587456,-1.290193,0.755513,0.681749
4,2,1,-6.813030,-5.481174,-9.282727,-4.827295,-7.899419,-8.227711,-3.362620,-3.581454,...,-0.408597,0.222136,-0.328591,0.115461,-0.613891,-0.266831,1.858980,0.385555,0.229644,0.888441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,3.034423,-2.604996,0.378983,1.080481,4.262611,2.492815,3.584577,-0.012864,...,1.074865,-0.015002,0.154258,-0.425031,0.060053,-0.950791,0.272566,-0.709891,0.183526,-0.035548
25568,0,0,-7.989301,-0.778425,-4.860383,0.376690,-1.113370,-2.287973,-5.796794,1.580867,...,-1.420880,-0.435511,1.148459,0.378976,-1.053755,-1.137182,-0.378525,0.875301,2.879233,-2.048089
25569,2,0,-6.872688,6.782780,1.654480,-7.876308,1.163434,2.100182,4.330693,-0.996571,...,0.902644,-1.134607,0.028057,-0.469159,-1.043663,0.210421,-0.909664,-0.352517,-0.612421,-0.247028
25570,1,1,-1.134083,-9.890526,11.790893,7.032540,2.695275,-2.669481,2.486435,-0.267855,...,0.193346,-0.775846,-1.107324,0.405084,0.886731,-0.560762,-0.039118,0.349216,-0.017659,-0.805854


## Feature selection using a variance threshold



In [None]:
# https://www.kaggle.com/c/lish-moa/discussion/194973#1067941
# Disabled variance filter: would keep only the columns after the first two whose variance
# over train + test is at least `var_threshold`.
if False:

    var_threshold = 0.5

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the result is
    # the same row-wise stack with the original indexes kept.
    data = pd.concat([train, test])
    ve_columns = (data.iloc[:, 2:].var() >= var_threshold).values
    ve_data = data.iloc[:, 2:].loc[:, ve_columns]

    ve_train = ve_data[: train.shape[0]]
    ve_test = ve_data[-test.shape[0] :]

    # Rebuild each frame as cp_time, cp_dose + the surviving columns.
    train = pd.DataFrame(train[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
    train = pd.concat([train, ve_train], axis=1)

    test = pd.DataFrame(test[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
    test = pd.concat([test, ve_test], axis=1)

In [None]:
# train

## KMeans

In [None]:
%%time

# Raw gene ("g-") and cell ("c-") columns of the current training frame; the "pca_G-"/"pca_C-"
# columns do not match these lowercase prefixes.
features_g = [col for col in train.columns if col.startswith("g-")]
features_c = [col for col in train.columns if col.startswith("c-")]


def fe_cluster(train_, test_, n_clusters_g=35, n_clusters_c=5):
    """Add one-hot KMeans cluster memberships for the g- and c- feature groups.

    KMeans is fitted on train and test stacked together, once on the module-level
    `features_g` columns and once on `features_c`. Each frame gains the columns
    ``clusters_<kind>_0 .. clusters_<kind>_<n_clusters - 1>``.

    Args:
        train_: Training features; not modified.
        test_: Test features; not modified.
        n_clusters_g: Number of clusters for the g- features.
        n_clusters_c: Number of clusters for the c- features.

    Returns:
        Tuple ``(train, test)`` of new DataFrames with the cluster columns appended.
    """

    def create_cluster(tr, te, features, kind="g", n_clusters=n_clusters_g):
        """Cluster `features` over tr + te and append the one-hot labels to both frames."""
        data = pd.concat([tr[features], te[features]], axis=0)
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_seed).fit(data)

        # Labels follow the stacking order: train rows first, then test rows. Slicing from
        # n_train (rather than -len(te)) stays correct even when `te` is empty.
        n_train = tr.shape[0]
        categories = list(range(n_clusters))

        def one_hot(labels, index):
            # A fixed category list guarantees one column per cluster in BOTH frames, even
            # for clusters that have no member in one of them; encoding each frame on its
            # own observed labels could leave train and test with different columns.
            encoded = pd.Series(pd.Categorical(labels, categories=categories), index=index)
            return pd.get_dummies(encoded, prefix=f"clusters_{kind}")

        tr = pd.concat([tr, one_hot(kmeans.labels_[:n_train], tr.index)], axis=1)
        te = pd.concat([te, one_hot(kmeans.labels_[n_train:], te.index)], axis=1)
        return tr, te

    train_, test_ = create_cluster(train_, test_, features_g, kind="g", n_clusters=n_clusters_g)
    train_, test_ = create_cluster(train_, test_, features_c, kind="c", n_clusters=n_clusters_c)
    return train_, test_


# Append the cluster one-hot columns using the default cluster counts (35 for g-, 5 for c-).
train, test = fe_cluster(train, test)

CPU times: user 1min 26s, sys: 449 ms, total: 1min 26s
Wall time: 1min 27s


In [None]:
train

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,clusters_g_30,clusters_g_31,clusters_g_32,clusters_g_33,clusters_g_34,clusters_c_0,clusters_c_1,clusters_c_2,clusters_c_3,clusters_c_4
0,0,0,1.124260,0.896698,-0.436214,-0.965311,-0.287443,-1.016437,-1.360774,-0.045876,...,0,0,0,0,0,1,0,0,0,0
1,2,0,0.117451,0.667759,0.260124,0.097531,1.204172,0.692876,0.356691,0.559630,...,0,0,1,0,0,1,0,0,0,0
2,1,0,0.777229,0.935347,1.414044,-0.113563,-0.025489,1.494313,0.277364,0.357917,...,0,0,0,0,0,0,1,0,0,0
3,1,0,-0.749489,-0.299404,-0.459100,0.774708,2.344556,-0.856449,-2.323390,0.298781,...,0,0,0,0,0,0,0,0,1,0
4,2,1,-0.460555,-0.508226,0.959313,0.984009,1.451890,-0.867329,-0.342599,-0.234770,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,0.599819,-0.723658,2.333805,-0.964385,1.075072,1.786462,0.131113,-0.812649,...,0,0,0,0,0,0,1,0,0,0
25568,0,0,-0.865375,-0.307222,1.988666,0.523322,0.644383,0.437283,-0.989775,-0.015526,...,0,0,0,0,0,1,0,0,0,0
25569,2,0,-0.575003,-0.170529,0.221620,-1.053665,0.049099,0.641676,0.016354,1.208632,...,0,0,0,0,0,1,0,0,0,0
25570,1,1,-1.635710,-1.976022,-0.636890,1.330359,-1.719197,-0.259467,-0.455103,-1.308468,...,0,1,0,0,0,0,0,0,0,1


## Basic stats

In [None]:
# Row-wise summary statistics over the gene (g-), cell (c-) and combined feature columns,
# added to both frames as "<prefix><stat>" columns (e.g. g_sum, c_kurt, gc_skew).
stat_groups = {"g_": features_g, "c_": features_c, "gc_": features_g + features_c}

for stats in ["sum", "mean", "std", "kurt", "skew"]:
    for frame in (train, test):
        for prefix, cols in stat_groups.items():
            # Look up the DataFrame method by name and apply it across columns (axis=1).
            frame[prefix + stats] = getattr(frame[cols], stats)(axis=1)

In [None]:
train

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,gc_mean,g_std,c_std,gc_std,g_kurt,c_kurt,gc_kurt,g_skew,c_skew,gc_skew
0,0,0,1.124260,0.896698,-0.436214,-0.965311,-0.287443,-1.016437,-1.360774,-0.045876,...,0.050884,0.868307,0.731294,0.869209,-0.270006,-0.321285,-0.270608,0.019115,0.073814,-0.015508
1,2,0,0.117451,0.667759,0.260124,0.097531,1.204172,0.692876,0.356691,0.559630,...,0.062270,0.850889,0.608372,0.842821,-0.217545,0.088938,-0.233240,0.045890,-0.163448,-0.041249
2,1,0,0.777229,0.935347,1.414044,-0.113563,-0.025489,1.494313,0.277364,0.357917,...,-0.038900,0.941310,0.665178,0.914129,-0.356922,-0.182024,-0.286903,-0.044156,0.385872,-0.008376
3,1,0,-0.749489,-0.299404,-0.459100,0.774708,2.344556,-0.856449,-2.323390,0.298781,...,-0.136704,1.080671,0.576449,1.088267,-0.918764,3.952398,-0.959980,0.086528,1.953350,0.245358
4,2,1,-0.460555,-0.508226,0.959313,0.984009,1.451890,-0.867329,-0.342599,-0.234770,...,0.020415,1.103348,0.677183,1.070526,-0.214614,-0.723722,-0.102022,-0.187344,0.076016,-0.251234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25567,0,0,0.599819,-0.723658,2.333805,-0.964385,1.075072,1.786462,0.131113,-0.812649,...,-0.105162,0.778264,0.472869,0.783517,-0.299977,0.946507,-0.386378,0.085385,0.903288,0.210787
25568,0,0,-0.865375,-0.307222,1.988666,0.523322,0.644383,0.437283,-0.989775,-0.015526,...,0.084645,0.737451,0.794342,0.776206,-0.242835,-0.665963,-0.291044,0.075617,-0.237591,0.122949
25569,2,0,-0.575003,-0.170529,0.221620,-1.053665,0.049099,0.641676,0.016354,1.208632,...,-0.007073,0.817300,0.636256,0.822376,-0.345356,-0.251386,-0.375475,0.012237,-0.120099,-0.052690
25570,1,1,-1.635710,-1.976022,-0.636890,1.330359,-1.719197,-0.259467,-0.455103,-1.308468,...,0.139976,0.988002,0.631845,0.953971,-0.607105,-0.670234,-0.498840,-0.313186,0.111856,-0.319974


## Pick up categorical features

In [None]:
# Disabled: would collect the categorical columns (cp_* and the cluster one-hots), their
# column positions, and int8 copies of their values.
if False:
    features_cat = [col for col in train.columns if col.startswith("cp_") or col.startswith("clusters_")]
    features_cat_index = [train.columns.get_loc(col) for col in features_cat]

    train_cat = train[features_cat].values.astype("int8")
    test_cat = test[features_cat].values.astype("int8")

In [None]:
#features_cat

## Reduce memory usage

In [None]:
# Pass every feature/target frame through `reduce_mem_usage` (defined elsewhere
# in this notebook) and rebind the name to the returned frame.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# PCA feature frames (fed to the second ResNet input in `learning`).
train_pca = reduce_mem_usage(train_pca)
test_pca = reduce_mem_usage(test_pca)

# Scored and non-scored target frames.
target = reduce_mem_usage(target)
non_target = reduce_mem_usage(non_target)

Memory usage of dataframe is 187.10 MB
Memory usage after optimization is: 48.43 MB
Decreased by 74.1%
Memory usage of dataframe is 29.13 MB
Memory usage after optimization is: 7.54 MB
Decreased by 74.1%
Memory usage of dataframe is 13.07 MB
Memory usage after optimization is: 3.22 MB
Decreased by 75.4%
Memory usage of dataframe is 2.04 MB
Memory usage after optimization is: 0.50 MB
Decreased by 75.4%
Memory usage of dataframe is 40.19 MB
Memory usage after optimization is: 10.05 MB
Decreased by 75.0%
Memory usage of dataframe is 67.32 MB
Memory usage after optimization is: 8.41 MB
Decreased by 87.5%


# Model

## Multi input ResNet

[MoA: Multi Input ResNet Model](https://www.kaggle.com/rahulsd91/moa-multi-input-resnet-model)

In [None]:
def create_model_resnet(n_features, n_features_2, n_labels):
    """Build the two-input, three-block ResNet-style MLP.

    Args:
        n_features: Width of the first input ("Input1").
        n_features_2: Width of the second input ("Input2").
        n_labels: Number of output units.

    Returns:
        An uncompiled ``tf.keras`` functional model taking ``[Input1, Input2]``.
        The final ``Dense`` layer has no activation, so the model emits logits.
    """
    drop_rate = 0.2

    def dense_unit(width, act):
        # BatchNorm -> Dropout -> weight-normalised Dense, instantiated in that order.
        return [
            L.BatchNormalization(),
            L.Dropout(drop_rate),
            tfa.layers.WeightNormalization(L.Dense(width, activation=act)),
        ]

    main_in = L.Input(shape=(n_features,), name="Input1")
    side_in = L.Input(shape=(n_features_2,), name="Input2")

    # Block1 consumes only the first input.
    head = tf.keras.Sequential(
        dense_unit(128, "elu") + dense_unit(512, "selu"),
        name="Block1",
    )
    head_out = head(main_in)

    # The second input joins here, concatenated in front of Block1's output.
    merged = L.Concatenate(name="Connection1")([side_in, head_out])

    trunk_layers = []
    for width, act in [(256, "swish"), (256, "elu"), (256, "relu"), (1024, "relu"), (512, "relu")]:
        trunk_layers.extend(dense_unit(width, act))
    trunk = tf.keras.Sequential(trunk_layers, name="Block2")
    trunk_out = trunk(merged)

    # Skip connection: element-wise mean of Block1 and Block2 outputs (both 512 wide).
    averaged = L.Average(name="Connection2")([head_out, trunk_out])

    # Block3 has no dropout before its first Dense and ends with a BatchNorm.
    tail = tf.keras.Sequential(
        [
            L.BatchNormalization(),
            tfa.layers.WeightNormalization(L.Dense(128, activation="selu")),
            *dense_unit(1024, "elu"),
            L.BatchNormalization(),
        ],
        name="Block3",
    )
    tail_out = tail(averaged)

    # No sigmoid here: the output is left as logits.
    logits = L.Dense(n_labels, name="Output")(tail_out)

    return tf.keras.models.Model(inputs=[main_in, side_in], outputs=logits)

In [None]:
# Colab-only sanity check: build a throwaway ResNet sized from the current
# frames, print its summary, and render its graph as an SVG.
if IN_COLAB:
    model_test = create_model_resnet(len(train.columns), len(train_pca.columns), len(target.columns))
    model_test.summary()
    display_svg(SVG(model_to_dot(model_test, show_shapes=True, dpi=72).create(prog="dot", format="svg")))

In [None]:
def create_model_resnet_tuning(n_features, n_features_2, n_labels, params):
    """Build the two-input ResNet-style MLP from a hyperparameter dict.

    Args:
        n_features: Width of the first input ("Input1").
        n_features_2: Width of the second input ("Input2").
        n_labels: Number of output units.
        params: Mapping with keys ``"n_layers"`` (number of Dense units in
            Block2), ``"units"`` and ``"activations"`` (sequences indexed as
            described below).

    Index layout used by this function:
        * Block2 layer ``i`` uses ``units[i]`` / ``activations[i]`` for
          ``i in range(n_layers)``.
        * Block1 uses ``units[-3]`` / ``activations[-4]`` and then
          ``units[n_layers - 1]`` / ``activations[-3]``; the second width is the
          same entry as Block2's last layer, so the two can be averaged.
        * Block3 uses ``units[-2]`` / ``activations[-2]`` and
          ``units[-1]`` / ``activations[-1]``.

    Returns:
        An uncompiled ``tf.keras`` functional model that emits logits.
    """
    depth = params["n_layers"]
    widths = params["units"]
    acts = params["activations"]

    def dense_unit(width, act):
        # BatchNorm -> Dropout(0.2) -> weight-normalised Dense, instantiated in that order.
        return [
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(width, activation=act)),
        ]

    main_in = L.Input(shape=(n_features,), name="Input1")
    side_in = L.Input(shape=(n_features_2,), name="Input2")

    head = tf.keras.Sequential(
        dense_unit(widths[-3], acts[-4]) + dense_unit(widths[depth - 1], acts[-3]),
        name="Block1",
    )
    head_out = head(main_in)
    merged = L.Concatenate(name="Connection1")([side_in, head_out])

    trunk_layers = []
    for idx in range(depth):
        trunk_layers.extend(dense_unit(widths[idx], acts[idx]))
    trunk = tf.keras.Sequential(trunk_layers, name="Block2")
    trunk_out = trunk(merged)

    # Skip connection: element-wise mean of Block1 and Block2 outputs.
    averaged = L.Average(name="Connection2")([head_out, trunk_out])

    tail = tf.keras.Sequential(
        [
            L.BatchNormalization(),
            tfa.layers.WeightNormalization(L.Dense(widths[-2], activation=acts[-2])),
            *dense_unit(widths[-1], acts[-1]),
            L.BatchNormalization(),
        ],
        name="Block3",
    )
    tail_out = tail(averaged)

    # No sigmoid here: the output is left as logits.
    logits = L.Dense(n_labels, name="Output")(tail_out)

    return tf.keras.models.Model(inputs=[main_in, side_in], outputs=logits)

## TabNet

[TabNet : Attentive Interpretable Tabular Learning](https://github.com/dreamquark-ai/tabnet)

In [None]:
def create_model_tabnet(seed, pre_train=False):
    """Create a TabNet model with the notebook's fixed baseline settings.

    Args:
        seed: Forwarded to TabNet as its ``seed`` argument.
        pre_train: When truthy a ``TabNetPretrainer`` is returned, otherwise a
            ``TabNetRegressor``. Both receive the same keyword arguments.

    Returns:
        The unfitted TabNet model instance.
    """
    optimizer_settings = dict(lr=2e-2, weight_decay=1e-5)
    plateau_settings = dict(mode="min", patience=5, min_lr=1e-5, threshold=1e-5, factor=0.1)

    config = {
        "n_d": 32,
        "n_a": 32,
        "n_steps": 1,
        # Original author's note: 2 gives a better CV than 1 for
        # n_independent / n_shared, but needs more time.
        "n_independent": 1,
        "n_shared": 1,
        "gamma": 1.3,
        "lambda_sparse": 0,
        # Categorical embedding options (cat_dims / cat_emb_dim / cat_idxs)
        # are intentionally not passed.
        "optimizer_fn": optim.Adam,
        "optimizer_params": optimizer_settings,
        "mask_type": "entmax",
        "scheduler_params": plateau_settings,
        "scheduler_fn": torch_ReduceLROnPlateau,
        "seed": seed,
        "verbose": 0,
    }

    model_cls = TabNetPretrainer if pre_train else TabNetRegressor
    return model_cls(**config)

In [None]:
def create_model_tabnet_tuning(seed, params=None):
    """Create a ``TabNetRegressor`` whose ``n_d`` / ``n_a`` come from ``params``.

    Args:
        seed: Forwarded to TabNet as its ``seed`` argument.
        params: Mapping with keys ``"n_d"`` and ``"n_a"``. When ``None`` (the
            declared default, and what the tuning objective passes for a model
            that is not currently being tuned) the baseline widths used by
            ``create_model_tabnet`` (32 / 32) are applied instead of failing
            with ``TypeError`` on ``None["n_d"]``.

    Returns:
        The unfitted ``TabNetRegressor`` instance.

    Raises:
        KeyError: If ``params`` is given but lacks ``"n_d"`` or ``"n_a"``.
    """
    if params is None:
        # Baseline values, identical to create_model_tabnet.
        n_d, n_a = 32, 32
    else:
        n_d, n_a = params["n_d"], params["n_a"]

    tabnet_params = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=1,
        n_independent=1,
        n_shared=1,
        gamma=1.3,
        lambda_sparse=0,
        optimizer_fn=optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type="entmax",
        scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, threshold=1e-5, factor=0.1),
        scheduler_fn=torch_ReduceLROnPlateau,
        seed=seed,
        verbose=0,
    )

    return TabNetRegressor(**tabnet_params)

## NODE

Neural Oblivious Decision Ensembles

[MoA Neural Oblivious Decision Ensembles (TF Keras)](https://www.kaggle.com/gogo827jz/moa-neural-oblivious-decision-ensembles-tf-keras)

[NODE for Tensorflow](https://www.kaggle.com/marcusgawronsky/differentiable-catboost-node-in-tensorflow-2-0)


In [None]:
@tf.function
def sparsemoid(inputs: tf.Tensor):
    """Return ``0.5 * inputs + 0.5`` clipped element-wise to ``[0, 1]``."""
    lower, upper = 0.0, 1.0
    shifted = 0.5 * inputs + 0.5
    return tf.clip_by_value(shifted, lower, upper)

@tf.function
def identity(x: tf.Tensor):
    """Return ``x`` unchanged (used as the pass-through feature layer in NODE)."""
    return x

In [None]:
class ODST(L.Layer):
    """Oblivious differentiable decision-tree layer (one NODE ensemble layer).

    Holds ``n_trees`` trees of depth ``depth``; each tree has ``2 ** depth``
    leaves carrying a ``units``-wide response. ``call`` returns the sum of the
    tree responses, shaped ``[batch_size, units]``.

    The thresholds and temperatures are re-initialised from the data the first
    time ``call`` runs (see ``initialize``).
    """

    def __init__(self, n_trees: int = 3, depth: int = 4, units: int = 1, threshold_init_beta: float = 1.0):
        """Store the hyperparameters; variables are created later in ``build``.

        Args:
            n_trees: Number of trees in the layer.
            depth: Number of split levels per tree.
            units: Width of each tree's response vector.
            threshold_init_beta: Both parameters of the Beta distribution used
                in ``initialize`` to sample the threshold percentiles.
        """
        super(ODST, self).__init__()
        # Python-side flag: flipped to True after the first `call` has run `initialize`.
        self.initialized = False
        self.n_trees = n_trees
        self.depth = depth
        self.units = units
        self.threshold_init_beta = threshold_init_beta

    def build(self, input_shape: tf.TensorShape):
        """Create the layer's variables for inputs whose last dimension is ``input_shape[-1]``."""
        # Per (input feature, tree, level) selection logits, initialised to zeros.
        feature_selection_logits_init = tf.zeros_initializer()
        self.feature_selection_logits = tf.Variable(
            initial_value=feature_selection_logits_init(
                shape=(input_shape[-1], self.n_trees, self.depth), dtype="float32"
            ),
            trainable=True,
            name="feature_selection_logits",
        )

        # Per (tree, level) split thresholds; zeros here, overwritten by `initialize`.
        feature_thresholds_init = tf.zeros_initializer()
        self.feature_thresholds = tf.Variable(
            initial_value=feature_thresholds_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="feature_thresholds",
        )

        # Per (tree, level) log-temperatures; ones here, overwritten by `initialize`.
        log_temperatures_init = tf.ones_initializer()
        self.log_temperatures = tf.Variable(
            initial_value=log_temperatures_init(shape=(self.n_trees, self.depth), dtype="float32"),
            trainable=True,
            name="log_temperatures",
        )

        # Constant lookup table: bin_codes[d, c] is bit d of leaf index c, and the
        # stacked tensor [depth, 2 ** depth, 2] holds (bit, 1 - bit) in its last axis.
        indices = K.arange(0, 2 ** self.depth, 1)
        offsets = 2 ** K.arange(0, self.depth, 1)
        bin_codes = tf.reshape(indices, (1, -1)) // tf.reshape(offsets, (-1, 1)) % 2
        bin_codes_1hot = tf.stack([bin_codes, 1 - bin_codes], axis=-1)
        self.bin_codes_1hot = tf.Variable(
            initial_value=tf.cast(bin_codes_1hot, "float32"), trainable=False, name="bin_codes_1hot"
        )

        # Per (tree, unit, leaf) response values, initialised to ones.
        response_init = tf.ones_initializer()
        self.response = tf.Variable(
            initial_value=response_init(shape=(self.n_trees, self.units, 2 ** self.depth), dtype="float32"),
            trainable=True,
            name="response",
        )

    def initialize(self, inputs):
        """Set thresholds and temperatures from the statistics of ``inputs``.

        Thresholds become percentiles of the selected feature values, with one
        percentile level per (tree, level) drawn from
        ``Beta(threshold_init_beta, threshold_init_beta)``. Log-temperatures
        become the median (50th percentile over the batch axis) of
        ``|feature_values - thresholds|``.
        """
        feature_values = self.feature_values(inputs)

        # initialize feature_thresholds
        # One percentile level in [0, 100] per (tree, level) pair.
        percentiles_q = 100 * tfp_distributions.Beta(self.threshold_init_beta, self.threshold_init_beta).sample(
            [self.n_trees * self.depth]
        )
        # Flatten each sample's [n_trees, depth] values into one row.
        flattened_feature_values = tf.map_fn(K.flatten, feature_values)
        # The percentile call is evaluated for every sampled level over the batch
        # axis; diag_part keeps the main diagonal of that result, which is then
        # reshaped below to the [n_trees, depth] threshold shape.
        init_feature_thresholds = tf.linalg.diag_part(
            tfp_stats.percentile(flattened_feature_values, percentiles_q, axis=0)
        )

        self.feature_thresholds.assign(tf.reshape(init_feature_thresholds, self.feature_thresholds.shape))

        # initialize log_temperatures
        self.log_temperatures.assign(
            tfp_stats.percentile(tf.math.abs(feature_values - self.feature_thresholds), 50, axis=0)
        )

    def feature_values(self, inputs: tf.Tensor, training: bool = None):
        """Project ``inputs`` onto the per-(tree, level) feature selectors.

        ``training`` is accepted but not used.
        """
        # NOTE(review): sparsemax is called without an `axis` argument, so it runs
        # over tfa's default axis of this [in_features, n_trees, depth] tensor —
        # confirm that is the intended normalisation dimension.
        feature_selectors = tfa.activations.sparsemax(self.feature_selection_logits)
        # ^--[in_features, n_trees, depth]

        feature_values = tf.einsum("bi,ind->bnd", inputs, feature_selectors)
        # ^--[batch_size, n_trees, depth]

        return feature_values

    def call(self, inputs: tf.Tensor, training: bool = None):
        """Evaluate all trees on ``inputs`` and return their summed responses.

        ``training`` is accepted but not used. On the first invocation the
        thresholds/temperatures are initialised from ``inputs``, whatever the
        value of ``training``.
        """
        if not self.initialized:
            self.initialize(inputs)
            self.initialized = True

        feature_values = self.feature_values(inputs)

        # Signed distance to the threshold, scaled by exp(-log_temperature).
        threshold_logits_a = (feature_values - self.feature_thresholds) * tf.math.exp(-self.log_temperatures)

        threshold_logits_b = tf.stack([-threshold_logits_a, threshold_logits_a], axis=-1)
        # ^--[batch_size, n_trees, depth, 2]

        bins = sparsemoid(threshold_logits_b)
        # ^--[batch_size, n_trees, depth, 2], approximately binary

        bin_matches = tf.einsum("btds,dcs->btdc", bins, self.bin_codes_1hot)
        # ^--[batch_size, n_trees, depth, 2 ** depth]

        # A leaf's weight is the product of its per-level match scores.
        response_weights = tf.math.reduce_prod(bin_matches, axis=-2)
        # ^-- [batch_size, n_trees, 2 ** depth]

        response = tf.einsum("bnd,ncd->bnc", response_weights, self.response)
        # ^-- [batch_size, n_trees, units]

        # Sum over the tree axis -> [batch_size, units]
        return tf.reduce_sum(response, axis=1)

In [None]:
class NODE(tf.keras.Model):
    """Stack of ODST layers with dense (concatenating) connections.

    The input goes through an optional feature layer, BatchNorm and Dropout.
    Each ODST layer's output is then concatenated to its own input, followed by
    BatchNorm and Dropout. A final ``Dense(output_dim)`` and the ``link``
    function produce the output.
    """

    def __init__(
        self,
        units: int = 1,
        n_layers: int = 1,
        output_dim=1,
        dropout_rate=0.1,
        link: tf.function = tf.identity,
        n_trees: int = 3,
        depth: int = 4,
        threshold_init_beta: float = 1.0,
        feature_column: Optional[L.DenseFeatures] = None,
    ):
        """Create the sub-layers.

        Args:
            units: Response width of every ODST layer.
            n_layers: Number of ODST layers in the stack.
            output_dim: Width of the final Dense layer.
            dropout_rate: Rate for every Dropout layer in the stack.
            link: Callable applied to the final Dense output.
            n_trees: Trees per ODST layer.
            depth: Depth of every tree.
            threshold_init_beta: Passed through to every ODST layer.
            feature_column: Optional layer applied to the raw inputs first;
                when ``None`` the inputs are passed through unchanged.
        """
        super().__init__()
        self.units = units
        self.n_layers = n_layers
        self.n_trees = n_trees
        self.depth = depth
        self.threshold_init_beta = threshold_init_beta
        self.feature_column = feature_column
        self.dropout_rate = dropout_rate
        self.output_dim = output_dim

        if feature_column is None:
            self.feature = L.Lambda(identity)
        else:
            self.feature = feature_column

        # Index 0 is applied to the raw features; index k (k >= 1) follows ODST layer k - 1.
        self.bn = [L.BatchNormalization() for _ in range(n_layers + 1)]
        self.dropout = [L.Dropout(self.dropout_rate) for _ in range(n_layers + 1)]
        self.ensemble = [
            ODST(n_trees=n_trees, depth=depth, units=units, threshold_init_beta=threshold_init_beta)
            for _ in range(n_layers)
        ]

        self.last_layer = L.Dense(self.output_dim)

        self.link = link

    def call(self, inputs, training=None):
        """Run the stack and return ``link(Dense(hidden))``.

        Works for ``n_layers == 0`` as well, in which case only the input
        BatchNorm/Dropout and the final Dense are applied.
        """
        hidden = self.feature(inputs)
        hidden = self.bn[0](hidden, training=training)
        hidden = self.dropout[0](hidden, training=training)

        for idx, tree in enumerate(self.ensemble, start=1):
            # Dense connection: the tree output is appended to the tree's own input.
            hidden = tf.concat([hidden, tree(hidden)], axis=1)
            hidden = self.bn[idx](hidden, training=training)
            hidden = self.dropout[idx](hidden, training=training)

        return self.link(self.last_layer(hidden))

In [None]:
def create_model_node(input_dim, output_dim):
    """Build the baseline NODE network as a ``tf.keras.Sequential`` model.

    Layout: BatchNorm -> NODE -> two (BatchNorm, Dropout, weight-normalised
    Dense) units -> BatchNorm -> Dense(output_dim).

    Args:
        input_dim: Number of input features.
        output_dim: Number of output units.

    Returns:
        The uncompiled model. The final Dense has no activation, so it emits
        logits (the training loss uses ``from_logits=True``).
    """
    stack = [L.Input(shape=(input_dim,)), L.BatchNormalization()]
    stack.append(
        NODE(
            n_layers=2,
            units=256,
            output_dim=1024,
            dropout_rate=0.2,
            depth=2,
            n_trees=3,
        )
    )

    for width, act in ((512, "elu"), (256, "swish")):
        stack.append(L.BatchNormalization())
        stack.append(L.Dropout(0.2))
        stack.append(tfa.layers.WeightNormalization(L.Dense(width, activation=act)))

    stack.append(L.BatchNormalization())
    stack.append(L.Dense(output_dim))  # from_logits=True

    return tf.keras.Sequential(stack)

In [None]:
# Colab-only sanity check: build a throwaway NODE model sized from the current
# frames and print its summary (the SVG rendering is left disabled).
if IN_COLAB:
    model_test = create_model_node(len(train.columns), len(target.columns))
    model_test.summary()
    # display_svg(SVG(model_to_dot(model_test, show_shapes=True, dpi=72).create(prog="dot", format="svg")))

In [None]:
def create_model_node_tuning(input_dim, output_dim, params=None):
    """Build the NODE network from a hyperparameter dict.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output units.
        params: Mapping with keys ``"n_layers"``, ``"units"``,
            ``"output_dim"``, ``"depth"``, ``"n_trees"`` (forwarded to
            ``NODE``) and ``"d_1"``, ``"d_2"`` (widths of the two Dense layers
            after it). When ``None`` (the declared default, and what the tuning
            objective passes for a model that is not currently being tuned)
            the baseline values of ``create_model_node`` are used instead of
            failing with ``TypeError`` on ``None["n_layers"]``.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model; its final Dense emits logits.

    Raises:
        KeyError: If ``params`` is given but lacks one of the keys above.
    """
    if params is None:
        # Baseline configuration, identical to create_model_node.
        params = {
            "n_layers": 2,
            "units": 256,
            "output_dim": 1024,
            "depth": 2,
            "n_trees": 3,
            "d_1": 512,
            "d_2": 256,
        }

    model = tf.keras.Sequential(
        [
            L.Input(shape=(input_dim,)),
            L.BatchNormalization(),
            NODE(
                n_layers=params["n_layers"],
                units=params["units"],
                output_dim=params["output_dim"],
                dropout_rate=0.2,
                depth=params["depth"],
                n_trees=params["n_trees"],
            ),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(params["d_1"], activation="elu")),
            L.BatchNormalization(),
            L.Dropout(0.2),
            tfa.layers.WeightNormalization(L.Dense(params["d_2"], activation="swish")),
            L.BatchNormalization(),
            L.Dense(output_dim),  # from_logits=True
        ]
    )

    return model

# Training

In [None]:
def learning(
    train_,
    train_pca_,
    target_,
    drug_,
    test_,
    test_pca_,
    N_STARTS=6,
    N_SPLITS=5,
    train_flags=("normal",),
    transfer_learning_base=None,
    params=None,
):
    """Run repeated K-fold training and return out-of-fold and test predictions.

    For each ``seed in range(N_STARTS)`` one entry of the notebook-level
    ``models`` list is selected round-robin (``models[seed % len(models)]``),
    trained or loaded on every fold, and evaluated on that fold's validation
    rows. Model outputs are treated as logits and passed through a sigmoid.

    Besides its arguments, the function relies on notebook-level names defined
    elsewhere (``models``, ``pre_train_models``, ``ss``, ``random_seed``,
    ``SAVE_WEIGHT``, ``PRE_TRAIN_MODEL`` and the ``PRE_TRAIN_MODEL_DIR_*``
    constants, ``fix_seed``, ``add_swap_noise``, ``metric``, ``logloss``,
    ``SmoothBCEwLogits``).

    Args:
        train_: Training feature frame.
        train_pca_: Training frame for the second ResNet input.
        target_: Training target frame.
        drug_: Frame with ``drug_id`` and ``sig_id`` columns. It is modified in
            place: a ``fold`` column is dropped and re-created for every seed.
        test_: Test feature frame.
        test_pca_: Test frame for the second ResNet input.
        N_STARTS: Number of seeds (training repetitions).
        N_SPLITS: Number of folds.
        train_flags: Collection of mode flags. Recognised values are
            ``"normal"``, ``"pre_train"``, ``"fine_tuning"`` and
            ``"hyperparameter_tuning"``.
        transfer_learning_base: Target frame of the pre-training task; its
            column count sizes the output of the pre-trained base model when
            fine-tuning.
        params: Mapping ``model_name -> hyperparameter dict`` used when
            ``"hyperparameter_tuning"`` is set.

    Returns:
        ``(oof, predictions)``: two dicts keyed by
        ``f"{model_name}_{cv}_fit({fit})_{seed}"``. ``oof`` holds the
        out-of-fold prediction frame per seed. ``predictions`` holds the test
        prediction frame averaged over folds; it stays all zeros unless
        ``"normal"`` or ``"fine_tuning"`` is in ``train_flags``.

    Raises:
        ValueError: If a configured model name is not ``"ResNet"``,
            ``"TabNet"`` or ``"NODE"``.
    """
    oof = {}
    predictions = {}

    for seed in range(N_STARTS):
        model_cfg = models[seed % len(models)]
        model_name = model_cfg["model_name"]
        cv = model_cfg["cv"]
        fit = model_cfg["fit"]

        # During pre-training only the models listed in pre_train_models are run.
        if "pre_train" in train_flags and model_name not in pre_train_models:
            continue

        seed_result = pd.DataFrame(np.zeros(target_.shape))
        prediction = pd.DataFrame(np.zeros(ss.shape))

        # Pre-training uses a shifted seed so its folds differ from the main run.
        kfold_seed = random_seed + seed
        if "pre_train" in train_flags:
            kfold_seed += random_seed

        fix_seed(kfold_seed)

        if "fold" in drug_.columns:
            drug_.drop(["fold"], axis=1, inplace=True)

        # LOCATE DRUGS
        # vc1: drugs assigned to a fold as a whole; vc2: drugs split by sample.
        vc = drug_.drug_id.value_counts()
        if DRUG_KFOLD == "soft":
            vc1 = vc.loc[(vc == 6) | (vc == 12) | (vc == 18)].index.sort_values()
            vc2 = vc.loc[(vc != 6) & (vc != 12) & (vc != 18)].index.sort_values()
        else:
            vc1 = vc.loc[vc <= 19].index.sort_values()
            vc2 = vc.loc[vc > 19].index.sort_values()

        dct1 = {}
        dct2 = {}

        # STRATIFY THE vc1 DRUGS: drug_id -> fold, stratified on the per-drug mean targets
        skf = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)
        tmp = pd.concat([drug_, target_], axis=1).groupby("drug_id").mean().loc[vc1]
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp)):
            dd = {k: fold for k in tmp.index[idxV].values}
            dct1.update(dd)

        # STRATIFY THE vc2 DRUGS: sig_id -> fold, one assignment per sample
        skf = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True)
        tmp = drug_.loc[drug_.drug_id.isin(vc2)].reset_index(drop=True)
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp)):
            dd = {k: fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)

        # ASSIGN FOLDS
        drug_["fold"] = drug_.drug_id.map(dct1)
        drug_.loc[drug_.fold.isna(), "fold"] = drug_.loc[drug_.fold.isna(), "sig_id"].map(dct2)
        drug_.fold = drug_.fold.astype("int8")

        for n, (tr, te) in enumerate(
            MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=kfold_seed, shuffle=True).split(target_, target_)
        ):
            # The plain target-stratified split is replaced by the drug-aware folds when requested.
            if cv == "with_drug_id":
                tr = drug_[drug_["fold"] != n].index
                te = drug_[drug_["fold"] == n].index

            start_time = time()

            # Always bound, so that the checks and the cleanup below never hit
            # an unbound or stale model from a previous fold.
            model_base = None

            # Build Data Sets
            if model_name == "ResNet":
                x_tr = [
                    add_swap_noise(tr, train_.values[tr], train_.values[tr]),
                    train_pca_.values[tr],
                ]
                x_val = [
                    train_.values[te],
                    train_pca_.values[te],
                ]
                y_tr, y_val = target_.astype(float).values[tr], target_.astype(float).values[te]
                x_tt = [test_.values, test_pca_.values]

            else:
                x_tr, x_val = add_swap_noise(tr, train_.values[tr], train_.values[tr]), train_.values[te]
                y_tr, y_val = target_.astype(float).values[tr], target_.astype(float).values[te]
                x_tt = test_.values

            # Build Model
            if model_name == "ResNet":
                if "hyperparameter_tuning" in train_flags:
                    model = create_model_resnet_tuning(
                        len(train_.columns), len(train_pca_.columns), len(target_.columns), params[model_name]
                    )

                    if "fine_tuning" in train_flags:
                        model_base = create_model_resnet_tuning(
                            len(train_.columns),
                            len(train_pca_.columns),
                            len(transfer_learning_base.columns),
                            params[model_name],
                        )

                else:
                    model = create_model_resnet(len(train_.columns), len(train_pca_.columns), len(target_.columns))

                    if "fine_tuning" in train_flags:
                        if not fit:
                            # fit=False loads a fully trained model, so the base has the final output width.
                            model_base = create_model_resnet(
                                len(train_.columns), len(train_pca_.columns), len(target_.columns)
                            )
                        else:
                            model_base = create_model_resnet(
                                len(train_.columns), len(train_pca_.columns), len(transfer_learning_base.columns)
                            )

            elif model_name == "TabNet":
                if "hyperparameter_tuning" in train_flags:
                    model = create_model_tabnet_tuning(kfold_seed, params[model_name])

                elif "pre_train" in train_flags:
                    # Pre-training currently uses the supervised regressor, not TabNetPretrainer.
                    model = create_model_tabnet(kfold_seed, pre_train=False)

                else:
                    model = create_model_tabnet(kfold_seed)

                    if "fine_tuning" in train_flags:
                        model_base = create_model_tabnet(kfold_seed, pre_train=True)

            elif model_name == "NODE":
                if "hyperparameter_tuning" in train_flags:
                    model = create_model_node_tuning(len(train_.columns), len(target_.columns), params[model_name])

                    if "fine_tuning" in train_flags:
                        model_base = create_model_node_tuning(
                            len(train_.columns), len(transfer_learning_base.columns), params[model_name]
                        )

                else:
                    model = create_model_node(len(train_.columns), len(target_.columns))

                    if "fine_tuning" in train_flags:
                        model_base = create_model_node(len(train_.columns), len(transfer_learning_base.columns))

            else:
                # Raising a bare string is itself a TypeError in Python 3.
                raise ValueError(f"Model name is invalid: {model_name!r}")

            if model_name == "TabNet":
                checkpoint_path = f"{model_name}_repeat:{seed // len(models)}_fold:{n}"

                if fit:
                    MODEL_DIR = PRE_TRAIN_MODEL_DIR_TABNET
                else:
                    MODEL_DIR = PRE_TRAIN_MODEL_DIR_NO_FIT

                if PRE_TRAIN_MODEL == "load-others":
                    checkpoint_path = os.path.join(MODEL_DIR, checkpoint_path)

                # NOTE(review): existence is checked on the path without ".zip" while
                # loading appends ".zip" — confirm both forms exist in the input dataset.
                if (
                    "fine_tuning" in train_flags
                    and model_base is not None
                    and os.path.exists(checkpoint_path)
                    and fit
                ):
                    model_base.load_model(checkpoint_path + ".zip")

                    model.fit(
                        X_train=x_tr,
                        y_train=y_tr,
                        eval_set=[(x_val, y_val)],
                        eval_name=["val"],
                        eval_metric=["logits_ll"],
                        max_epochs=200,
                        patience=10,
                        batch_size=1024,
                        virtual_batch_size=32,
                        num_workers=1,
                        drop_last=True,
                        loss_fn=SmoothBCEwLogits(smoothing=1e-6),
                        from_unsupervised=model_base,
                    )

                elif fit:
                    model.fit(
                        X_train=x_tr,
                        y_train=y_tr,
                        eval_set=[(x_val, y_val)],
                        eval_name=["val"],
                        eval_metric=["logits_ll"],
                        max_epochs=200,
                        patience=10,
                        batch_size=1024,
                        virtual_batch_size=32,
                        num_workers=1,
                        drop_last=True,
                        loss_fn=SmoothBCEwLogits(smoothing=1e-6),
                    )

                else:
                    model.load_model(checkpoint_path + ".zip")

                if "pre_train" in train_flags or SAVE_WEIGHT:
                    # Best-effort removal of a previous file with the same name
                    # in the working directory before saving.
                    try:
                        os.remove(os.path.basename(checkpoint_path))
                    except OSError:
                        pass
                    model.save_model(os.path.basename(checkpoint_path))

            else:
                model.compile(
                    optimizer=tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True, label_smoothing=1e-6),
                    metrics=logloss,
                )

                checkpoint_path = f"{model_name}_repeat:{seed // len(models)}_fold:{n}.hdf5"

                if fit:
                    if model_name == "ResNet":
                        MODEL_DIR = PRE_TRAIN_MODEL_DIR_RESNET
                    else:
                        MODEL_DIR = PRE_TRAIN_MODEL_DIR_NODE
                else:
                    MODEL_DIR = PRE_TRAIN_MODEL_DIR_NO_FIT

                if PRE_TRAIN_MODEL == "load-others":
                    checkpoint_path = os.path.join(MODEL_DIR, checkpoint_path)

                if "fine_tuning" in train_flags and model_base is not None and os.path.exists(checkpoint_path):
                    model_base.load_weights(checkpoint_path)
                    # Copy every layer except the output layer, whose width differs
                    # between the pre-training task and this one.
                    for dst_layer, src_layer in zip(model.layers, model_base.layers[:-1]):
                        dst_layer.set_weights(src_layer.get_weights())
                    if not fit:
                        # The base is a complete model here, so the output layer is copied too.
                        model.layers[-1].set_weights(model_base.layers[-1].get_weights())

                save_weights = "pre_train" in train_flags or SAVE_WEIGHT

                reduce_lr_loss = ReduceLROnPlateau(
                    monitor="val_loss", factor=0.1, patience=5, verbose=0, min_delta=1e-5, min_lr=1e-5, mode="min"
                )
                early_stopping = EarlyStopping(
                    monitor="val_loss",
                    patience=10,
                    mode="min",
                    verbose=0,
                    min_delta=1e-5,
                    restore_best_weights=True,
                )
                if save_weights:
                    # The checkpoint is written to the working directory.
                    cb_checkpt = ModelCheckpoint(
                        os.path.basename(checkpoint_path),
                        monitor="val_loss",
                        verbose=0,
                        save_best_only=True,
                        save_weights_only=True,
                        mode="min",
                    )
                    callbacks = [cb_checkpt, reduce_lr_loss, early_stopping]
                else:
                    callbacks = [reduce_lr_loss, early_stopping]

                if fit:
                    model.fit(
                        x_tr,
                        y_tr,
                        validation_data=(x_val, y_val),
                        epochs=200,
                        batch_size=128,
                        callbacks=callbacks,
                        verbose=0,
                    )

            # Models output logits; apply the sigmoid before scoring/aggregating.
            val_predict = model.predict(x_val)
            val_predict = 1 / (1 + np.exp(-val_predict))
            seed_result.loc[te, :] += val_predict
            fold_score = metric(target_.loc[te].values, val_predict)

            if any(flag in train_flags for flag in ["normal", "fine_tuning"]):
                test_predict = model.predict(x_tt)
                test_predict = 1 / (1 + np.exp(-test_predict))
                prediction += test_predict / N_SPLITS

            print(
                f"[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {model_name} {cv}: Seed {seed}, Fold {n}:",
                fold_score,
            )

            # Release this fold's models before building the next ones.
            K.clear_session()
            del model, model_base
            gc.collect()

        oof[f"{model_name}_{cv}_fit({str(fit)})_{seed}"] = seed_result
        predictions[f"{model_name}_{cv}_fit({str(fit)})_{seed}"] = prediction

    return oof, predictions

## Hyper parameter tuning

In [None]:
def objective(trial):
    """Optuna objective: sample hyper-parameters, train, and score the OOF blend.

    For each model family whose ``TUNING_*`` flag is set, a parameter dict is
    sampled from ``trial`` and stored under that family's name; every other
    model listed in ``models`` keeps ``None``.  The sampled ``params`` are then
    passed to ``learning`` (an optional pre-training pass followed by the
    fine-tuning pass) and the resulting out-of-fold predictions are blended
    with uniform weights via ``cross_validation``.

    Args:
        trial: Optuna trial object used for ``suggest_int`` /
            ``suggest_categorical``.

    Returns:
        ``cv * 1000 / auc`` where ``cv`` and ``auc`` are the first two values
        returned by ``cross_validation``.
    """
    # One slot per configured model; untuned models stay None.
    params = dict.fromkeys(entry["model_name"] for entry in models)

    if TUNING_RESNET:
        resnet_depth = trial.suggest_int("n_layers", 2, 5)
        # Units are sampled first, then activations (dict values are evaluated
        # in order), matching the original sampling sequence.
        params["ResNet"] = {
            "n_layers": resnet_depth,
            "units": [
                trial.suggest_categorical(f"units_{idx}", [128, 256, 512, 1024])
                for idx in range(resnet_depth + 3)
            ],
            "activations": [
                trial.suggest_categorical(f"activations_{idx}", ["relu", "elu", "selu", "swish"])
                for idx in range(resnet_depth + 4)
            ],
        }

    if TUNING_TABNET:
        tabnet_widths = [16, 24, 32, 48, 64, 96, 128, 160]
        params["TabNet"] = {
            "n_d": trial.suggest_categorical("n_d", tabnet_widths),
            "n_a": trial.suggest_categorical("n_a", tabnet_widths),
        }

    if TUNING_NODE:
        node_widths = [128, 256, 512, 1024]
        params["NODE"] = {
            # The three structural values are pinned rather than sampled
            # (candidate ranges were suggest_int(..., 2, 3) for each).
            "n_layers": 2,
            "depth": 2,
            "n_trees": 3,
            "units": trial.suggest_categorical("units", node_widths),
            "output_dim": trial.suggest_categorical("output_dim", node_widths),
            "d_1": trial.suggest_categorical("d_1", node_widths),
            "d_2": trial.suggest_categorical("d_2", node_widths),
        }

    # Rows beyond this count are excluded from pre-training and from scoring.
    n_rows = non_target.shape[0]

    # Optional pre-training on non-scored + scored targets; outputs are discarded.
    if PRE_TRAIN_MODEL == "in-notebook":
        _, _ = learning(
            train[:n_rows],
            train_pca[:n_rows],
            pd.concat([non_target, target[:n_rows]], axis=1),
            non_target_drug,
            test,
            test_pca,
            N_STARTS,
            N_SPLITS,
            train_flags=["pre_train", "hyperparameter_tuning"],
            params=params,
        )

    # Fine-tuning pass whose out-of-fold predictions are scored below.
    oof, predictions = learning(
        train,
        train_pca,
        target,
        target_drug,
        test,
        test_pca,
        N_STARTS,
        N_SPLITS,
        train_flags=["fine_tuning", "hyperparameter_tuning"],  # "normal", "fine_tuning"
        transfer_learning_base=pd.concat([non_target, target[:n_rows]], axis=1),
        params=params,
    )

    # Uniform blend over the N_STARTS out-of-fold frames.
    initial_weights = [1.0 / N_STARTS for _ in range(N_STARTS)]
    y_true = target.values[:n_rows]

    cv, auc, _ = cross_validation(y_true.shape, initial_weights, y_true, oof)

    return cv * 1000 / auc

In [None]:
# Run the Optuna search only when tuning is enabled: create a study that
# minimises the value returned by objective() and evaluate 20 trials,
# asking Optuna to run garbage collection after each trial.
if HYPER_PARAMETER_TUNING:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20, gc_after_trial=True)

In [None]:
# Report the outcome of the search: best objective value, the parameters that
# produced it, and Optuna's per-parameter importance estimate.
if HYPER_PARAMETER_TUNING:
    print("Best trial:")
    trial = study.best_trial

    print(f"  CV:  {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    print(optuna.importance.get_param_importances(study))

In [None]:
# Stop the notebook here when it was run for hyper-parameter tuning, so the
# normal training / submission cells below are not executed.
if HYPER_PARAMETER_TUNING:
    # Python 3 only allows raising BaseException subclasses/instances; raising
    # a bare string fails with an unrelated TypeError instead of this message.
    raise RuntimeError("Finished parameter tuning.")

## Normal training

In [None]:
%%time

# Optional pre-training pass, executed only when PRE_TRAIN_MODEL is "in-notebook".
# Features are truncated to the first non_target.shape[0] rows, and the label
# frame is the non-scored targets concatenated column-wise with the matching
# rows of the scored targets. Both return values of learning() are discarded.
# NOTE(review): presumably the useful output is whatever learning() persists
# for the "pre_train" flag (e.g. checkpoints) — confirm against learning().
if PRE_TRAIN_MODEL == "in-notebook":
    _, _ = learning(
        train[: non_target.shape[0]],
        train_pca[: non_target.shape[0]],
        pd.concat([non_target, target[: non_target.shape[0]]], axis=1),
        non_target_drug,
        test,
        test_pca,
        N_STARTS,
        N_SPLITS,
        train_flags=["pre_train"],
    )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [None]:
%%time

# Main training run on the full train/target frames with the "fine_tuning" flag.
# `oof` and `predictions` are the two values returned by learning(); later cells
# read them as dicts keyed "<model_name>_<cv>_fit(<fit>)_<seed>".
# transfer_learning_base is the same non-scored + scored label frame that the
# pre-training cell builds.
oof, predictions = learning(
    train,
    train_pca,
    target,
    target_drug,
    test,
    test_pca,
    N_STARTS,
    N_SPLITS,
    train_flags=["fine_tuning"],  # "normal", "fine_tuning"
    transfer_learning_base=pd.concat([non_target, target[: non_target.shape[0]]], axis=1),
)

[02:05] ResNet without_drug_id: Seed 0, Fold 0: 0.015988617666400244
[02:00] ResNet without_drug_id: Seed 0, Fold 1: 0.016561587230930812
[02:02] ResNet without_drug_id: Seed 0, Fold 2: 0.016460160299281146
[02:02] ResNet without_drug_id: Seed 0, Fold 3: 0.01748774524775867
[02:07] ResNet without_drug_id: Seed 0, Fold 4: 0.017378904586047548
[01:50] ResNet without_drug_id: Seed 0, Fold 5: 0.016428686192403865
[01:58] ResNet without_drug_id: Seed 0, Fold 6: 0.016393450192781617
Device used : cuda
Device used : cuda

Early stopping occured at epoch 49 with best_epoch = 39 and best_val_logits_ll = 0.01771
Best weights from best epoch are automatically used!
[01:42] TabNet without_drug_id: Seed 1, Fold 0: 0.017711500060807878
Device used : cuda
Device used : cuda

Early stopping occured at epoch 57 with best_epoch = 47 and best_val_logits_ll = 0.01683
Best weights from best epoch are automatically used!
[01:57] TabNet without_drug_id: Seed 1, Fold 1: 0.016834965008491812
Device used : cuda

## Cross Validation

In [None]:
# Uniform starting weights for each blending level (per start, per model, per seed).
oof_weights = [1.0 / N_STARTS for _ in range(N_STARTS)]
model_weights = [1.0 / len(models) for _ in range(len(models))]
seed_weights = [1.0 / N_SEED for _ in range(N_SEED)]

# Ground truth restricted to the first non_target.shape[0] rows.
y_true = target.values[: non_target.shape[0]]

print("===== OOF - CV =====")
for key, val in oof.items():
    # When control-vehicle rows were kept in the data, force their
    # out-of-fold predictions to zero before scoring (in place).
    if CTRL_VEHICLE in ("use", "keep"):
        val.loc[train_pub_test["cp_type"] == "ctl_vehicle", :] = 0

    print(f"OOF Key: {key}, CV: {metric(y_true, val.values[:y_true.shape[0]])}")

# Group the per-seed OOF frames under their "<model_name>_<cv>_fit(<fit>)" prefix.
oof_by_model = {
    model_key: {name: frame for name, frame in oof.items() if name.startswith(model_key)}
    for model_key in (
        f"{entry['model_name']}_{entry['cv']}_fit({entry['fit']!s})" for entry in models
    )
}

# First level: blend the seeds of each model; keep the blended OOF per model.
blend_by_model = {}
for model, oof_ in oof_by_model.items():
    print(f"\n===== Model {model} - CV =====")
    _, _, blend_by_model[model] = cross_validation(y_true.shape, seed_weights, y_true, oof_)

# Second level: blend the per-model blends with uniform model weights.
print("\n===== Overall - CV =====")
_ = cross_validation(y_true.shape, model_weights, y_true, blend_by_model)

===== OOF - CV =====
OOF Key: ResNet_without_drug_id_fit(True)_0, CV: 0.015549920786218762
OOF Key: TabNet_without_drug_id_fit(True)_1, CV: 0.016379545413917004
OOF Key: NODE_without_drug_id_fit(True)_2, CV: 0.015617632925199512
OOF Key: ResNet_without_drug_id_fit(True)_3, CV: 0.015571681574280447
OOF Key: TabNet_without_drug_id_fit(True)_4, CV: 0.016357813363721805
OOF Key: NODE_without_drug_id_fit(True)_5, CV: 0.015517192821919377

===== Model ResNet_without_drug_id_fit(True) - CV =====
Blended CV: 0.015187612866172193, AUC : 0.8413028495809948

===== Model TabNet_without_drug_id_fit(True) - CV =====
Blended CV: 0.01615445874266707, AUC : 0.7883251866030232

===== Model NODE_without_drug_id_fit(True) - CV =====
Blended CV: 0.015304891187367803, AUC : 0.8276664422474292

===== Overall - CV =====
Blended CV: 0.01512241439772212, AUC : 0.8415474216651967


In [None]:
# Choose the final per-model blending weights.
#   "fixed":    use the hand-set `fixed_weight` values.
#   "lagrange": solve for weights that make the metric stationary subject to
#               sum(weights) == 1, using a Lagrange multiplier.
# Any other value of `optimize` leaves `model_weights` untouched.
if optimize == "fixed":
    model_weights = fixed_weight
    print(f"Fixed weights: {model_weights}")

    cross_validation(y_true.shape, model_weights, y_true, blend_by_model)

elif optimize == "lagrange":
    # https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0#Bonus-(Lagrange-Multiplier)

    def lagrange_func(params):
        """Lagrangian: metric of the blend minus lambda * (sum(weights) - 1).

        ``params[:-1]`` are the model weights and ``params[-1]`` is the multiplier.
        """
        blend_ = blend(y_true.shape, params[:-1], blend_by_model)
        return metric(y_true, blend_) - params[-1] * (sum(params[:-1]) - 1)

    # NOTE(review): `grad` is presumably autograd's gradient transform — confirm import.
    grad_l = grad(lagrange_func)

    def lagrange_obj(params):
        """System passed to fsolve: dL/dw for each weight, then the constraint residual."""
        d = grad_l(params).tolist()
        return d[:-1] + [sum(params[:-1]) - 1]

    # list(...) guarantees the multiplier is appended as an extra element. If
    # model_weights is an ndarray (it becomes one at the end of this branch, so
    # this happens on a re-run), `model_weights + [1.0]` would add 1.0 to every
    # weight element-wise and drop the multiplier slot.
    optimized_weights = fsolve(lagrange_obj, list(model_weights) + [1.0])

    print(f"Optimized weights: {optimized_weights[:-1]}")
    print(f"Check the sum of all weights: {sum(optimized_weights[:-1])}")

    cross_validation(y_true.shape, optimized_weights[:-1], y_true, blend_by_model)

    # Drop the trailing multiplier; only the weights are used downstream.
    model_weights = optimized_weights[:-1]

In [None]:
# Group the per-seed test predictions under their "<model_name>_<cv>_fit(<fit>)" prefix.
predictions_by_model = {
    model_key: {name: pred for name, pred in predictions.items() if name.startswith(model_key)}
    for model_key in (
        f"{entry['model_name']}_{entry['cv']}_fit({entry['fit']!s})" for entry in models
    )
}

# Blend the seeds of each model into a single DataFrame shaped like `ss`.
blend_by_model = {
    model_key: pd.DataFrame(blend(ss.shape, seed_weights, per_seed))
    for model_key, per_seed in predictions_by_model.items()
}

# Row-wise correlation between every pair of model blends, averaged over rows.
for a, b in itertools.combinations(blend_by_model, 2):
    corr = blend_by_model[a].corrwith(blend_by_model[b], axis=1)
    print(f"Prediction correlation between {a} and {b}: {corr.mean()}")

Prediction correlation between ResNet_without_drug_id_fit(True) and TabNet_without_drug_id_fit(True): 0.8500191513483923
Prediction correlation between ResNet_without_drug_id_fit(True) and NODE_without_drug_id_fit(True): 0.9447938261897578
Prediction correlation between TabNet_without_drug_id_fit(True) and NODE_without_drug_id_fit(True): 0.8546747993252847


# Postprocessing

In [None]:
# Weighted blend: combine the per-model test blends using `model_weights`
# and write the result into the target columns of the submission frame.
submit_df.loc[:, target.columns] = blend(ss.shape, model_weights, blend_by_model)

In [None]:
# Overwrite every target column with 0 for test rows whose cp_type is "ctl_vehicle".
submit_df.loc[test_df["cp_type"] == "ctl_vehicle", target.columns] = 0

# Output

In [None]:
# Write the submission file to the working directory, without the row index.
submit_df.to_csv("submission.csv", index=False)