In [2]:
import os
import gc
import time
import warnings
from datetime import datetime

from numba import njit, jit
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import iinfo, finfo, int8, int16, int32, int64, float32, float64

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks
from tensorflow.keras import models
from tensorflow.keras import activations

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# 设置plotly为暗黑模式
pio.templates.default = "plotly_dark"
plot_config = dict({'scrollZoom': True, 'displayModeBar': True, 'displaylogo': False})
sns.set(style="ticks", font_scale=1.2, palette='deep', color_codes=True)
colors = ["C" + str(i) for i in range(0, 9+1)]

# 默认plotly色号
default_color_list = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
    ]

# 设定全局随机种子，并且屏蔽warnings
GLOBAL_RANDOM_SEED = 2022
np.random.seed(GLOBAL_RANDOM_SEED)
tf.random.set_seed(GLOBAL_RANDOM_SEED)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

In [3]:
# 导入数据
load_data_start_time = time.time()
train_df  = pd.read_csv(
    './data/jane-street-market-prediction/train.csv', nrows=None)
feat_df = pd.read_csv(
    './data/jane-street-market-prediction/features.csv')
example_test_df = pd.read_csv(
    './data/jane-street-market-prediction/example_test.csv')
example_prediction_df = pd.read_csv(
    './data/jane-street-market-prediction/example_sample_submission.csv')
load_data_end_time = time.time()

# 打印数据基本情况
print("[INFO] {} End Reading ! It took {:.2f} seconds !".format(
    str(datetime.now())[:-4], load_data_end_time-load_data_start_time))
print("[INFO] {} Basic data description: ".format(str(datetime.now())[:-4]))
print("    -- train_df shape: {}".format(
    train_df.shape))
print("    -- example_test_df shape: {}".format(
    example_test_df.shape))
print("    -- feat_df shape: {}".format(
    feat_df.shape))
print("    -- example_prediction_df shape: {}".format(
    example_prediction_df.shape))

[INFO] 2021-01-04 20:17:13.00 End Reading ! It took 57.14 seconds !
[INFO] 2021-01-04 20:17:13.00 Basic data description: 
    -- train_df shape: (2390491, 138)
    -- example_test_df shape: (15219, 133)
    -- feat_df shape: (130, 30)
    -- example_prediction_df shape: (15219, 2)


In [8]:
# 挑选策略变化之后的数据
train = train_df.query('date > 85').reset_index(drop=True)
train = train.query('weight > 0').reset_index(drop = True)

feature_name_list = [c for c in train.columns if 'feature' in c]
resp_name_list = ["resp", "resp_1", "resp_2", "resp_3", "resp_4"]

# 使用均值填充缺失值
mean_val_list = []
for name in feature_name_list:
    mean_val = train[name].mean()
    train[name].fillna(mean_val, inplace=True)
    mean_val_list.append(mean_val)
mean_val_array = np.array(mean_val_list)

# 构造自编码器的输入输出
X = train[feature_name_list].values
y = train[resp_name_list].values

train_dates = train["date"].values
train_weights = train["weight"].values
train_resp = train["resp"].values
train_resp_all = train[resp_name_list].values

print("[INFO] {} Autoencoder data prepared !".format(
    str(datetime.now())[:-4]))

[INFO] 2021-01-04 20:22:24.10 Autoencoder data prepared !


In [9]:
def gen_test_data(test_df=None, pred_df=None):
    """测试数据生成器。用于模拟测试数据生成过程，测试模型提交正确性与效率。"""
    n_test = len(test_df)

    for i in range(n_test):
        yield test_df.iloc[i], pred_df.iloc[i]


@jit
def njit_fillna(array, values):
    """利用即时编译（jit）对array数组的NaN值借助values进行填充。

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function
    """
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array


def custom_metric(dates_array=None,
                  weights_array=None,
                  resp_array=None,
                  action_label_array=None):
    """依据官方要求的Metric，计算分数。

    @References:
    ----------
    [1] https://www.kaggle.com/c/jane-street-market-prediction/discussion/199107
    [2] https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
    [3] 
    """
    tmp_df = pd.DataFrame({"date": dates_array,
                           "weight": weights_array,
                           "resp": resp_array,
                           "action": action_label_array})
    tmp_df["p"] = tmp_df["weight"]  * tmp_df["resp"] * tmp_df["action"]
    # tmp_df = tmp_df.query("weight != 0").reset_index(drop=True)
    p_i_val = tmp_df.groupby(["date"])["p"].sum().values

    n_dates = len(p_i_val)
    t = np.sum(p_i_val) / np.sqrt(np.sum(p_i_val ** 2)) * (np.sqrt(250 / n_dates))
    return min(max(t, 0), 6) * np.sum(p_i_val)

In [6]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """针对带有Group id（组id）数据的时间序列交叉验证集合生成类。

    生成针对带有Group id的数据的时序交叉验证集。其中训练与验证的
    Group之间可以指定group_gap，用来隔离时间上的关系。这种情况下
    group_id通常是时间id，例如天或者小时。

    @Parameters:
    ----------
        n_splits: {int-like}, default=5
            切分的集合数目。
        max_train_group_size: {int-like}, default=+inf
            训练集单个组的最大样本数据限制。
        group_gap: {int-like}, default=None
            依据group_id切分组时，训练组与测试组的id的gap数目。
        max_test_group_size: {int-like}, default=+inf
            测试集单个组的最大样本数据限制。

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv
    """
    @_deprecate_positional_args
    def __init__(self, n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """生成训练组与测试组的id索引，返回组索引的生成器。

        @Parameters:
        ----------
            X: {array-like} {n_samples, n_features}
                训练数据，输入形状为{n_samples, n_features}。
            y: {array-like} {n_samples, }
                标签数据，形状为{n_samples, }。
            groups: {array-like} {n_samples, }
                用来依据组来划分训练集与测试集的组id，必须为连续的组id。

        @Yields:
        ----------
            train: ndarray
                依据group_id切分的训练组id。
            test: ndarray
                依据group_id切分的测试组id。
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None ！")

        # 初始化基本参数信息
        X, y, groups = indexable(X, y, groups)
        n_samples, n_splits, group_gap = _num_samples(X), self.n_splits, self.group_gap
        n_folds = n_splits + 1

        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size

        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = np.argsort(ind)
        n_groups = _num_samples(unique_groups)

        # 扫描整个数据id list，构建group_dcit，{group_id: 属于该group的样本的idx}
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        # group_test_size: 每个fold预留的test group的大小
        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array, test_array = [], []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            test_array  = test_array[group_gap:]

            if self.verbose > 0:
                    pass
            yield [int(i) for i in train_array], [int(i) for i in test_array]


def test_purged_group_time_series_split():
    X = train_df.query('date > 85').reset_index(drop=True)[["ts_id", "feature_0"]].values
    y = train_df.query('date > 85').reset_index(drop=True)["resp"].values
    groups = train_df.query('date > 85').reset_index(drop=True)["date"].values

    group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=4, group_gap=31, max_test_group_size=31)
    train_idx, valid_idx = [], []
    for train_idx_tmp, valid_idx_tmp in group_ts_kfolds.split(X=X, y=y, groups=groups):
        train_idx.append(train_idx_tmp)
        valid_idx.append(valid_idx_tmp)

    print([[train_df.iloc[min(item)]["date"].astype(int),
            train_df.iloc[max(item)]["date"].astype(int)] for item in train_idx])
    print([[train_df.iloc[min(item)]["date"].astype(int),
            train_df.iloc[max(item)]["date"].astype(int)] for item in valid_idx])


In [10]:
def build_tabular_autoencoder(verbose=False, is_compile=True,
                              stddev=0.05, **kwargs):
    """降噪自编码器实现。针对表格形式数据的降噪自编码器，噪声等级由高斯噪声的stddev参数指定"""
    input_dim = kwargs.pop("input_dim", None)
    n_outputs = kwargs.pop("n_outputs", None)

    # 构建降噪自编码器
    layer_input = layers.Input(input_dim, dtype='float32')

    layer_encoded = layers.BatchNormalization()(layer_input)
    layer_encoded = layers.GaussianNoise(stddev=stddev)(layer_encoded)
    layer_encoded = layers.Dense(256, activation='relu')(layer_encoded)

    # 解码层1：针对输入的重构
    layer_decoded = layers.Dropout(0.2)(layer_encoded)
    layer_decoded = layers.Dense(input_dim, name='reconstruct_output')(layer_decoded)

    # 解码层2：针对resp的重构
    layer_output = layers.Dense(128, activation='relu')(layer_decoded)
    layer_output = layers.BatchNormalization()(layer_output)
    layer_output = layers.Dropout(0.2)(layer_output)
    layer_output = layers.Dense(n_outputs, activation='sigmoid', 
                                name='label_output')(layer_output)

    # 输出层
    encoder_model = models.Model(inputs=layer_input, outputs=layer_decoded)
    autoencoder_model = models.Model(inputs=layer_input, outputs=[layer_decoded, layer_output])

    if verbose:
        autoencoder_model.summary()
    if is_compile:
        autoencoder_model.compile(loss={'reconstruct_output':'mae', 'label_output':'mae'}, optimizer=optimizers.Adam(0.003))
    return encoder_model, autoencoder_model

encoder_model, autoencoder_model = build_tabular_autoencoder(
    input_dim=X.shape[1], n_outputs=y.shape[1], stddev=0.2, verbose=False)

print("[INFO] {} Build autoencoder successed !".format(
    str(datetime.now())[:-4]))

[INFO] 2021-01-04 20:22:29.14 Build autoencoder successed !


In [12]:
early_stop = callbacks.EarlyStopping(monitor="val_loss", mode="min",
                                     verbose=1, patience=40,
                                     restore_best_weights=True)
autoencoder_model.fit(x=X, y=[X, y],
                      batch_size=32768,
                      epochs=15,
                      verbose=2,
                      validation_split=0.1,
                      callbacks=[early_stop])

Train on 1414273 samples, validate on 157142 samples
Epoch 1/15
1414273/1414273 - 10s - loss: 0.3973 - reconstruct_output_loss: 0.3861 - label_output_loss: 0.0111 - val_loss: 0.2360 - val_reconstruct_output_loss: 0.2246 - val_label_output_loss: 0.0113
Epoch 2/15
1414273/1414273 - 10s - loss: 0.3896 - reconstruct_output_loss: 0.3786 - label_output_loss: 0.0111 - val_loss: 0.2318 - val_reconstruct_output_loss: 0.2204 - val_label_output_loss: 0.0113
Epoch 3/15
1414273/1414273 - 10s - loss: 0.3825 - reconstruct_output_loss: 0.3716 - label_output_loss: 0.0111 - val_loss: 0.2293 - val_reconstruct_output_loss: 0.2179 - val_label_output_loss: 0.0113
Epoch 4/15
1414273/1414273 - 10s - loss: 0.3766 - reconstruct_output_loss: 0.3655 - label_output_loss: 0.0110 - val_loss: 0.2324 - val_reconstruct_output_loss: 0.2209 - val_label_output_loss: 0.0113
Epoch 5/15
1414273/1414273 - 10s - loss: 0.3693 - reconstruct_output_loss: 0.3583 - label_output_loss: 0.0110 - val_loss: 0.2218 - val_reconstruct_outp

<tensorflow.python.keras.callbacks.History at 0x7fe6ba7e0450>

In [24]:
def utility_score_loss(y_true, y_pred):
    """带有正则项的MAE LOSS，参考自项目[1]。

    @References:
    ----------
    [1] https://www.kaggle.com/satorushibata/lightgbm-classifier-pca-logit-on-utility-score#Plot-&-Summaries
    """
    regularization_val = 0.01

    residual = (y_true - y_pred)
    sign_matching = (y_true * y_pred) >= 0
    utility_mse_loss = tf.where(sign_matching,
                                tf.square(residual),
                                tf.square(residual) - y_true * y_pred * regularization_val)
    utility_mse_loss = tf.reduce_mean(utility_mse_loss, axis=-1)
    return utility_mse_loss


def build_model(verbose=False, is_compile=True, encoder_model=None, **kwargs):
    """针对二分类任务的MLP模型，使用自编码器的编码层作为预训练层。"""
    input_dim = kwargs.pop("input_dim", None)
    output_dim = kwargs.pop("output_dim", None)
    n_labels = kwargs.pop("n_labels", None)

    # 构造网络结构
    layer_input = layers.Input(input_dim, dtype='float32')
    layer_encoded = encoder_model(layer_input)

    layer_feats = layers.concatenate([layer_input, layer_encoded])
    layer_feats = layers.BatchNormalization()(layer_feats)

    # 特征抽取
    layer_dense = layers.Dense(64, activation="relu")(layer_feats)
    layer_dense = layers.BatchNormalization()(layer_dense)
    layer_dense = layers.Dropout(0.2)(layer_dense)

    layer_dense = layers.Dense(32, activation="relu")(layer_dense)
    layer_dense = layers.BatchNormalization()(layer_dense)
    layer_dense = layers.Dropout(0.2)(layer_dense)

    # 输出层构造与模型构造
    layer_output = layers.Dense(output_dim, activation='sigmoid', name="label_output")(layer_dense)
    model = models.Model(layer_input, layer_output)

    if verbose:
        model.summary()
    if is_compile:
        model.compile(loss=utility_score_loss,
                      metrics=["mae"],
                      optimizer=optimizers.Adam(0.003))
    return model

def test_build_model():
    # 构造mlp模型
    mlp_model = build_model(verbose=False, encoder_model=encoder_model,
                            input_dim=X.shape[1], output_dim=y.shape[1])

def test_utility_score_loss():
    X_true = X[:20, :5]
    X_pred = X[50:70, :5]

    print(utility_score_loss(X_true, X_pred))
    print(utility_score_loss(X_true, X_pred).shape)

print("[INFO] {} Build MLP Model successed !".format(
    str(datetime.now())[:-4]))

[INFO] 2021-01-04 21:18:40.89 Build MLP Model successed !


In [23]:
# 训练前全局准备
MODELS = []
encoder_model.trainable = False
early_stop = callbacks.EarlyStopping(monitor="val_mae", mode="min",
                                     verbose=1, patience=30,
                                     restore_best_weights=True)
group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=6, group_gap=20)
min_date = min(train_dates)
train_dates = train_dates - min_date

# 开始训练模型
valid_acc_total, valid_roc_auc_total, valid_custom_total = [], [], []

print("[INFO] {} Model training start:".format(str(datetime.now())[:-4]))
print("=========================================")
for fold, (train_idx, valid_idx) in enumerate(group_ts_kfolds.split(X=X, y=y, groups=train_dates)):
    X_train, X_val = X[train_idx], X[valid_idx]
    y_train, y_val = y[train_idx], y[valid_idx]

    X_train_weight, X_val_weight = train_weights[train_idx], train_weights[valid_idx]
    X_train_resp, X_val_resp = train_resp[train_idx], train_resp[valid_idx]
    X_train_dates, X_val_dates = train_dates[train_idx], train_dates[valid_idx]

    # 准备模型
    mlp_model = build_model(verbose=False, encoder_model=encoder_model,
                            input_dim=X_train.shape[1], output_dim=y_train.shape[1])

    mlp_model.fit(x=X_train, y=y_train,
                  validation_data=(X_val, y_val),
                  batch_size=32768,
                  epochs=20,
                  verbose=0,
                  callbacks=[early_stop])

    # valid预测结果
    valid_pred_res = mlp_model.predict(X_val)
    valid_pred_label_mat = np.where(
            valid_pred_res>=0, 1, 0).astype(int)

    # valid data上按照官方metric进行结果评估
    valid_pred_label = np.where(
            np.mean(valid_pred_res, axis=1)>=0, 1, 0).astype(int)
    valid_custom_metric = custom_metric(dates_array=X_val_dates,
                                        weights_array=X_val_weight,
                                        action_label_array=valid_pred_label,
                                        resp_array=X_val_resp)

    # Accuracy与ROC AUC进行评估
    # valid_acc = accuracy_score(valid_pred_label_mat, y_val)

    # valid_roc_auc_list = []
    # for i in range(y_val.shape[1]):
    #     valid_pred_tmp = valid_pred_proba[:, i].reshape(-1, 1)
    #     y_val_tmp = y_val[:, i].reshape(-1, 1)

    #     valid_roc_auc_list.append(roc_auc_score(y_val_tmp, valid_pred_tmp))
    # valid_roc_auc = np.mean(valid_roc_auc_list)

    # 标准打印训练信息
    print("-- folds {}, valid_custom: {:.4f}".format(
            fold, valid_custom_metric))

    # 保存模型与关键训练指标
    MODELS.append(mlp_model)
    valid_custom_total.append(valid_custom_metric)

# 打印总体分数指标
print("-- total metric, valid_custom: {:.4f}".format(np.mean(valid_custom_total)))

print("=========================================")
print("[INFO] {} Model training end".format(str(datetime.now())[:-4]))

[INFO] 2021-01-04 21:14:42.60 Model training start:
-- folds 0, valid_custom: 43.4213
-- folds 1, valid_custom: -0.0000
-- folds 2, valid_custom: -0.0000
-- folds 3, valid_custom: 4.6847
-- folds 4, valid_custom: -0.0000
-- folds 5, valid_custom: -0.0000
-- total metric, valid_custom: 8.0177
[INFO] 2021-01-04 21:17:16.03 Model training end
