# Strategy

- Preprocessing
    - RankGauss
    - PCA + Existing Features
    - Variance Threshold (feature selection)
- Model
    - DeepTables
- Learning
    - Optimizer: AdamW with weight_decay
    - Label smoothing
- Prediction
    - Ensemble above with weight optimization
    - With clipping

# Library

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import sys

sys.path.append("../input/iterative-stratification/iterative-stratification-master")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

sys.path.append("../input/autograd")
import autograd.numpy as np
from autograd import grad

sys.path.append("../../../../github/DeepTables")
from deeptables.models.deepnets import AFM, DCN, FGCNN, PNN, AutoInt, DeepFM, WideDeep, xDeepFM
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.models.preprocessor import DefaultPreprocessor

In [None]:
import datetime
import gc
import os
import random
from collections import defaultdict
from time import time
from typing import Optional

# import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow_addons as tfa
import tensorflow_probability as tfp

# import optuna
from scipy.optimize import fsolve, minimize
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
# Feature switches for the optional TensorFlow speed-ups configured below.
MIXED_PRECISION = False
XLA_ACCELERATE = True

if MIXED_PRECISION:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision

    # `tpu` is not assigned in the visible part of this notebook, so look it up
    # defensively: a bare `if tpu:` raises NameError as soon as MIXED_PRECISION
    # is switched on without a TPU handle having been created beforehand.
    if globals().get("tpu"):
        policy = mixed_precision.Policy("mixed_bfloat16")
    else:
        policy = mixed_precision.Policy("mixed_float16")
    mixed_precision.set_policy(policy)
    print("Mixed precision enabled")

if XLA_ACCELERATE:
    # Turn on XLA JIT compilation for TensorFlow graphs.
    tf.config.optimizer.set_jit(True)
    print("Accelerated Linear Algebra enabled")

# Functions

In [None]:
def fix_seed(seed=2020):
    """Seed Python's `random`, NumPy and TensorFlow and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Global seed: reused by the quantile transformers, the PCA fits, the CV
# splitter and the training loop further down in this notebook.
random_seed = 22
fix_seed(random_seed)

In [None]:
# https://www.kaggle.com/c/lish-moa/discussion/189857#1043953

# Prediction Clipping Thresholds
# NOTE: these wide thresholds are not applied inside `logloss`; they are kept
# so that any other cell relying on the names keeps working.
p_min = 0.001
p_max = 0.999


def logloss(y_true, y_pred):
    """Mean binary cross-entropy over all elements, without label smoothing.

    Predictions are clipped to [eps, 1 - eps] (eps = Keras backend epsilon)
    before taking logarithms. Without this guard a prediction of exactly 0 or
    1 yields log(0) and turns the whole metric into NaN/inf.

    Args:
        y_true: Tensor of binary labels.
        y_pred: Tensor of predicted probabilities; same shape as ``y_true``.

    Returns:
        Scalar tensor holding the mean log loss.
    """
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1 - eps)
    return -K.mean(y_true * K.log(y_pred) + (1 - y_true) * K.log(1 - y_pred))

In [None]:
# [Fast Numpy Log Loss] https://www.kaggle.com/gogo827jz/optimise-blending-weights-4-5x-faster-log-loss
def metric(y_true, y_pred):
    """Column-averaged binary log loss for 2-D label/prediction arrays.

    Predictions are clipped to [1e-7, 1 - 1e-7]; the loss is computed per
    column and the per-column means are then averaged.

    Args:
        y_true: 2-D array of binary labels.
        y_pred: 2-D array of predicted probabilities, same shape as ``y_true``.

    Returns:
        The mean of the per-column log losses.
    """
    n_targets = y_pred.shape[1]
    clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
    total = 0
    for column in range(n_targets):
        truth = y_true[:, column]
        prob = clipped[:, column]
        total += -np.mean(truth * np.log(prob) + (1 - truth) * np.log(1 - prob))
    return total / n_targets

In [None]:
def blend(size, weights, oof):
    """Return the weighted sum of the frames stored in ``oof``.

    Args:
        size: Shape of the output array (must match each frame's ``.values``).
        weights: Indexable of weights; ``weights[i]`` scales the i-th entry of
            ``oof`` in dictionary order.
        oof: Mapping whose values expose a ``.values`` array (e.g. DataFrames).

    Returns:
        Array of shape ``size`` holding the weighted blend.
    """
    blend_ = np.zeros(size)
    for position, frame in enumerate(oof.values()):
        blend_ += weights[position] * frame.values
    return blend_

# Load Data

In [None]:
# Raw competition tables. The *_df frames are kept untouched; later cells work
# on copies and come back to test_df / submit_df when building the submission.
train_df = pd.read_csv("../input/lish-moa/train_features.csv")
test_df = pd.read_csv("../input/lish-moa/test_features.csv")
target_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
non_target_df = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
submit_df = pd.read_csv("../input/lish-moa/sample_submission.csv")

In [None]:
# Working copies: preprocessing mutates these, while the originals stay intact
# (test_df["cp_type"] and submit_df are needed again in the post-processing cells).
train = train_df.copy()
test = test_df.copy()
ss = submit_df.copy()

# Preprocessing

In [None]:
# Integer-encode the two categorical columns identically for train and test.
dose_codes = {"D1": 0, "D2": 1}
time_codes = {24: 0, 48: 1, 72: 2}

for frame in (train, test):
    frame.loc[:, "cp_dose"] = frame.loc[:, "cp_dose"].map(dose_codes)
    frame.loc[:, "cp_time"] = frame.loc[:, "cp_time"].map(time_codes)

In [None]:
# Names of the "g-" and "c-" feature columns, in their original order.
g_cols = train_df.columns[train_df.columns.str.startswith("g-")].tolist()
c_cols = train_df.columns[train_df.columns.str.startswith("c-")].tolist()

## cp_type が ctl_vehicle なものは MoA を持たない

ので、学習から除外する

In [None]:
# Rows whose cp_type is "ctl_vehicle" are excluded from training. The mask is
# computed once from `train` and applied to both target frames before `train`
# itself is filtered, so all three stay row-aligned.
is_treated = train["cp_type"] != "ctl_vehicle"
target_df = target_df.loc[is_treated].reset_index(drop=True)
non_target_df = non_target_df.loc[is_treated].reset_index(drop=True)
train = train.loc[is_treated].reset_index(drop=True)

In [None]:
# cp_type is no longer needed as a feature once the control rows are filtered.
train = train.drop(columns="cp_type")
test = test.drop(columns="cp_type")

In [None]:
# Remove the sig_id identifier column from every working frame.
for frame in (train, target_df, non_target_df, test, ss):
    del frame["sig_id"]

In [None]:
# Display the training frame after the cp_type / sig_id columns were removed.
train

## Rank Gauss

https://www.kaggle.com/nayuts/moa-pytorch-nn-pca-rankgauss

連続値を特定の範囲の閉域に押し込めて、分布の偏りを解消する方法です。

In [None]:
# RankGauss: map every g-/c- column to a normal distribution. Each transformer
# is fitted on the training column only and then applied to train and test.
for col in g_cols + c_cols:
    transformer = QuantileTransformer(n_quantiles=100, random_state=random_seed, output_distribution="normal")

    train_column = train[col].values.reshape(-1, 1)
    test_column = test[col].values.reshape(-1, 1)

    transformer.fit(train_column)

    train[col] = transformer.transform(train_column).ravel()
    test[col] = transformer.transform(test_column).ravel()

In [None]:
# Display the training frame after the quantile (RankGauss) transform.
train

## PCA features (+ Existing features)

既存のカラムは残したほうがいいのだろうか？？
→ このコンペでは残したほうがいい成績が出ている。

In [None]:
# g-: add 50 principal components computed from the "g-" columns.
n_comp = 50

# The PCA is fitted on the train and test rows stacked together.
data = pd.concat([train[g_cols], test[g_cols]])
pca = PCA(n_components=n_comp, random_state=random_seed)
data2 = pca.fit_transform(data[g_cols])

# Split the stacked projection back into its train (head) and test (tail) parts.
pca_names = [f"pca_G-{i}" for i in range(n_comp)]
train2 = pd.DataFrame(data2[: train.shape[0]], columns=pca_names)
test2 = pd.DataFrame(data2[-test.shape[0] :], columns=pca_names)

# The original g- columns are kept; the components are appended next to them.
train = pd.concat([train, train2], axis=1)
test = pd.concat([test, test2], axis=1)

In [None]:
# c-: add 15 principal components computed from the "c-" columns.
n_comp = 15

# The PCA is fitted on the train and test rows stacked together.
data = pd.concat([train[c_cols], test[c_cols]])
pca = PCA(n_components=n_comp, random_state=random_seed)
data2 = pca.fit_transform(data[c_cols])

# Split the stacked projection back into its train (head) and test (tail) parts.
pca_names = [f"pca_C-{i}" for i in range(n_comp)]
train2 = pd.DataFrame(data2[: train.shape[0]], columns=pca_names)
test2 = pd.DataFrame(data2[-test.shape[0] :], columns=pca_names)

# The original c- columns are kept; the components are appended next to them.
train = pd.concat([train, train2], axis=1)
test = pd.concat([test, test2], axis=1)

In [None]:
# Display the training frame with the PCA columns appended.
train

## Feature Selection using Variance Threshold

分散がしきい値以下の特徴量を捨てます。

In [None]:
# Drop every feature whose variance over train + test is at most 0.5.
var_thresh = VarianceThreshold(threshold=0.5)

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the stacked result is the same.
data = pd.concat([train, test])
# The first two columns (cp_time, cp_dose) are excluded from the selection.
data_transformed = var_thresh.fit_transform(data.iloc[:, 2:])

# Split the selected features back into their train (head) and test (tail) rows.
train_transformed = data_transformed[: train.shape[0]]
test_transformed = data_transformed[-test.shape[0] :]

# Rebuild each frame as [cp_time, cp_dose, selected features...].
# ignore_index=True on axis=1 relabels all columns 0..n-1, so the original
# feature names are not carried forward.
train = pd.DataFrame(train[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
train = pd.concat([train, pd.DataFrame(train_transformed)], axis=1, ignore_index=True)

test = pd.DataFrame(test[["cp_time", "cp_dose"]].values.reshape(-1, 2), columns=["cp_time", "cp_dose"])
test = pd.concat([test, pd.DataFrame(test_transformed)], axis=1, ignore_index=True)

In [None]:
# Display the training frame after variance-threshold feature selection.
train

# Create Model

In [None]:
def create_model_dt(y):
    """Build an untrained DeepTable configured for the multilabel task.

    Args:
        y: Label data used to fit the DeepTables label preprocessor.

    Returns:
        A ``DeepTable`` created from the config and the fitted preprocessor.
    """
    # Three hidden layers of 2**11, 2**10 and 2**9 units.
    # NOTE(review): the tuple is presumably (units, dropout rate, batch-norm
    # flag) -- confirm against the DeepTables documentation.
    hidden_layers = tuple((2 ** exponent, 0.3, True) for exponent in (11, 10, 9))

    smoothed_bce = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001)
    adamw = tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5, clipvalue=756)

    dt_conf = ModelConfig(
        metrics=[logloss],
        loss=smoothed_bce,
        optimizer=adamw,
        nets=["dnn_nets"],
        apply_gbm_features=False,
        task="multilabel",
        earlystopping_patience=10,
        dnn_params={"hidden_units": hidden_layers, "dnn_activation": "relu"},
    )

    # The preprocessor is fitted on the labels here; only the fitted state is
    # needed, so the transformed labels are discarded.
    dt_preprocessor = DefaultPreprocessor(dt_conf)
    dt_preprocessor.fit_transform_y(y)

    return DeepTable(config=dt_conf, preprocessor=dt_preprocessor)

# Learning

In [None]:
def learning(target, N_STARTS, N_SPLITS, do_predict=False, do_transfer_learning=False):
    """Run cross-validated DeepTables training for several random restarts.

    Args:
        target: DataFrame of labels, row-aligned with the global ``train``.
        N_STARTS: Number of restarts (one model per restart seed).
        N_SPLITS: Number of folds for ``MultilabelStratifiedKFold``.
        do_predict: When False nothing is trained and two empty dicts are
            returned. This keeps the previous behaviour, where every restart
            was skipped right after the model had been built.
        do_transfer_learning: Accepted for backward compatibility; not used.

    Returns:
        ``(oof, predictions)``: two dicts keyed by ``"DeepTables_<seed>"``.
        ``oof`` holds out-of-fold predictions shaped like ``target``;
        ``predictions`` holds test predictions shaped like the global ``ss``.
    """
    oof = {}
    predictions = {}

    if not do_predict:
        # Bail out before building any model: the result would be discarded.
        return oof, predictions

    model_name = "DeepTables"

    for seed in range(N_STARTS):
        # Give every restart its own seed. Reseeding with the same constant on
        # each iteration made all restarts repeat the same run, which defeats
        # the purpose of blending several seeds.
        run_seed = random_seed + seed
        fix_seed(run_seed)

        start_time = time()

        # Zero-filled accumulators with the same layout as the labels and the
        # submission template.
        seed_result = target.copy()
        seed_result.loc[:, target.columns] = 0
        prediction = ss.copy()
        prediction.loc[:, ss.columns] = 0

        model = create_model_dt(target)

        oof_predict, _, test_predict = model.fit_cross_validation(
            train,
            target,
            X_eval=None,
            X_test=test,
            # The fold assignment stays fixed across restarts.
            iterators=MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=random_seed, shuffle=True),
            random_state=run_seed,
            batch_size=128,
            epochs=100,
            verbose=0,
        )

        seed_score = metric(target.values, oof_predict)
        seed_result.loc[:, target.columns] += oof_predict

        # NOTE(review): if fit_cross_validation already returns the test
        # predictions averaged over folds, this division shrinks them by an
        # extra factor of N_SPLITS -- confirm against the DeepTables docs.
        prediction.loc[:, target.columns] += test_predict / N_SPLITS

        print(
            f"===== Result ===== [{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] {model_name}: Seed {seed}: {seed_score}\n"
        )

        # Release the Keras graph and the model before the next restart.
        K.clear_session()
        del model
        gc.collect()

        oof[f"{model_name}_{seed}"] = seed_result
        predictions[f"{model_name}_{seed}"] = prediction

    return oof, predictions

In [None]:
# N_STARTS: number of restarts iterated over inside `learning`.
# N_SPLITS: number of folds passed to MultilabelStratifiedKFold.
N_STARTS = 5
N_SPLITS = 7

In [None]:
# Pre-training on the non-scored labels.
# NOTE(review): do_predict is left at its default (False), and `learning`
# performs no fitting in that case, so this call currently trains nothing and
# returns two empty dicts.
_, _ = learning(non_target_df, N_STARTS, N_SPLITS)

In [None]:
# Main training run on the scored targets; the two positional flags are
# do_predict=True and do_transfer_learning=True.
oof, predictions = learning(target_df, N_STARTS, N_SPLITS, True, True)

# Cross Validation

In [None]:
# One equal weight per restart, followed by a trailing slot for the Lagrange
# multiplier (which is why the printout drops the last element).
initial_weights = [1.0 / N_STARTS] * N_STARTS + [1.0]
print(f"Initial weights: {initial_weights[:-1]}")

# https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0#Bonus-(Lagrange-Multiplier)


def lagrange_func(params):
    """Lagrangian of the blend log loss under the sum-to-one constraint.

    Args:
        params: Sequence whose last element is the Lagrange multiplier and
            whose other elements are the blend weights.

    Returns:
        ``metric(blend) - multiplier * (sum(weights) - 1)``.
    """
    weights = params[:-1]
    multiplier = params[-1]
    blended = blend(target_df.values.shape, weights, oof)
    constraint = sum(weights) - 1
    return metric(target_df.values, blended) - multiplier * constraint


# Gradient of the Lagrangian with respect to `params`, built by autograd.
grad_l = grad(lagrange_func)


def lagrange_obj(params):
    """System of equations whose root gives the optimal blend weights.

    Args:
        params: Blend weights followed by the Lagrange multiplier.

    Returns:
        List with the Lagrangian's partial derivatives for each weight,
        followed by the constraint residual ``sum(weights) - 1``.
    """
    gradient = grad_l(params).tolist()
    constraint_residual = sum(params[:-1]) - 1
    return [*gradient[:-1], constraint_residual]


# Baseline: CV score of the equally weighted blend.
blend_ = blend(target_df.values.shape, initial_weights[:-1], oof)
print(f"Initial blend CV: {metric(target_df.values, blend_)}")

# Ask fsolve for its status. Warnings are silenced globally at the top of this
# notebook, so without full_output a failed solve would go unnoticed and its
# last iterate would be used as the blend weights.
optimized_weights, _solver_info, ier, mesg = fsolve(lagrange_obj, initial_weights, full_output=True)
if ier != 1:
    # ier == 1 is the only "solution found" status; fall back to equal weights.
    print(f"fsolve did not converge ({mesg}); falling back to the initial weights")
    optimized_weights = list(initial_weights)

blend_ = blend(target_df.values.shape, optimized_weights[:-1], oof)
print(f"Optimized blend CV: {metric(target_df.values, blend_)}")

print(f"Optimized weights: {optimized_weights[:-1]}")
print(f"Check the sum of all weights: {sum(optimized_weights[:-1])}")

# Postprocessing

In [None]:
# Weighted blend of the per-seed test predictions, using the optimised weights
# without the trailing Lagrange multiplier.
submit_df.loc[:, target_df.columns] = blend(ss.shape, optimized_weights[:-1], predictions)

In [None]:
# Clipping: keep every predicted probability inside [1e-7, 1 - 1e-7], the same
# bounds that `metric` applies before taking logarithms.
submit_df.loc[:, target_df.columns] = submit_df.loc[:, target_df.columns].clip(1e-7, 1 - 1e-7)

In [None]:
# Control-vehicle test rows are set to exactly 0 for every target. This runs
# after the clipping step, so these zeros are not lifted to 1e-7.
submit_df.loc[test_df["cp_type"] == "ctl_vehicle", target_df.columns] = 0

# Output

In [None]:
# Write the final submission file without the DataFrame index column.
submit_df.to_csv("submission.csv", index=False)