In [1]:
import os
import gc
import time
import warnings
from datetime import datetime

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import iinfo, finfo, int8, int16, int32, int64, float32, float64

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks
from tensorflow.keras import models
from tensorflow.keras import activations

from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# 设置plotly为暗黑模式
pio.templates.default = "plotly_dark"
plot_config = dict({'scrollZoom': True, 'displayModeBar': True, 'displaylogo': False})
sns.set(style="ticks", font_scale=1.2, palette='deep', color_codes=True)
colors = ["C" + str(i) for i in range(0, 9+1)]

# 默认plotly色号
default_color_list = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
    ]

# 设定全局随机种子，并且屏蔽warnings
GLOBAL_RANDOM_SEED = 2022
np.random.seed(GLOBAL_RANDOM_SEED)
tf.random.set_seed(GLOBAL_RANDOM_SEED)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)


In [2]:

# 导入数据
load_data_start_time = time.time()
train_df  = pd.read_csv(
    './data/jane-street-market-prediction/train.csv', nrows=None)
feat_df = pd.read_csv(
    './data/jane-street-market-prediction/features.csv')
example_test_df = pd.read_csv(
    './data/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv(
    './data/jane-street-market-prediction/example_sample_submission.csv')
load_data_end_time = time.time()

# 打印数据基本情况
print("[INFO] {} End Reading ! It took {:.2f} seconds !".format(
    str(datetime.now())[:-4], load_data_end_time-load_data_start_time))
print("[INFO] {} Basic data description: ".format(str(datetime.now())[:-4]))
print("    -- train_df shape: {}".format(
    train_df.shape))
print("    -- example_test_df shape: {}".format(
    example_test_df.shape))
print("    -- feat_df shape: {}".format(
    feat_df.shape))
print("    -- sample_prediction_df shape: {}".format(
    sample_prediction_df.shape))

[INFO] 2020-12-30 20:19:57.30 End Reading ! It took 55.44 seconds !
[INFO] 2020-12-30 20:19:57.30 Basic data description: 
    -- train_df shape: (2390491, 138)
    -- example_test_df shape: (15219, 133)
    -- feat_df shape: (130, 30)
    -- sample_prediction_df shape: (15219, 2)


In [3]:
class ReduceMemoryUsage():
    """通过pandas的DataFrame的每一列的类型转换，降低DataFrame的内存消耗。

    扫描pandas的DataFrame的每一列（column），获取每一列的最大最小值。判断
    其是否落在某一具体浮点数或者整型数范围之内，并进行强制类型转换。例如：
    某列（float64表示）最大最小值为[l, h]区间，而l与h分别小于float32最大最小
    表示范围[-k, +k]范围内，则可以进行类型转换为float32来表示该列元素。

    @Parameters:
    ----------
        data_table: {pandas DataFrame-like}
            pandas的DataFrame类型。
        verbose: {bool-like}
            是否打印内存精简的相关信息。

    @Return:
    ----------
        经过内存精简的DataFrame

    @References:
    ----------
    [1] https://docs.scipy.org/doc/numpy/reference/generated/numpy.iinfo.html
    [2] https://wizardforcel.gitbooks.io/ts-numpy-tut/content/3.html
    """
    def __init__(self, data_table=None, verbose=True):
        self._data_table = data_table
        self._verbose = verbose

    def type_report(self, data_table):
        """Reporting basic characteristics of the tabular data data_table."""
        data_types = list(map(str, data_table.dtypes.values))
        basic_report = pd.DataFrame(data_types, columns=["types"])
        basic_report["feature_name"] = list(data_table.columns)
        return basic_report

    def reduce_memory_usage(self):
        memory_reduced_data = self.__reduce_memory()
        return memory_reduced_data

    def __reduce_memory(self):
        print("\nReduce memory process:")
        print("-------------------------------------------")
        memory_before_reduced = self._data_table.memory_usage(
            deep=True).sum() / 1024**2
        types = self.type_report(self._data_table)
        if self._verbose is True:
            print("@Memory usage of data is {:.5f} MB.".format(
                memory_before_reduced))

        # Scan each feature in data_table, reduce the memory usage for features
        for ind, name in enumerate(types["feature_name"].values):
            # ToBeFixed: Unstable query.
            feature_type = str(
                types[types["feature_name"] == name]["types"].iloc[0])

            if (feature_type in "object") and (feature_type in "datetime64[ns]"):
                try:
                    feature_min = self._data_table[name].min()
                    feature_max = self._data_table[name].max()

                    if "int" in feature_type:
                        if feature_min > iinfo(int8).min and feature_max < iinfo(int8).max:
                            self._data_table[name] = self._data_table[name].astype(int8)
                        elif feature_min > iinfo(int16).min and feature_max < iinfo(int16).max:
                            self._data_table[name] = self._data_table[name].astype(int16)
                        elif feature_min > iinfo(int32).min and feature_max < iinfo(int32).max:
                            self._data_table[name] = self._data_table[name].astype(int32)
                        else:
                            self._data_table[name] = self._data_table[name].astype(int64)
                    else:
                        if feature_min > finfo(float32).min and feature_max < finfo(float32).max:
                            self._data_table[name] = self._data_table[name].astype(float32)
                        else:
                            self._data_table[name] = self._data_table[name].astype(float64)
                except Exception as error_msg:
                    print("\n--------ERROR INFORMATION---------")
                    print(error_msg)
                    print("Error on the {}".format(name))
                    print("--------ERROR INFORMATION---------\n")
            if self._verbose is True:
                print("Processed {} feature({}), total is {}.".format(
                    ind + 1, name, len(types)))

        memory_after_reduced = self._data_table.memory_usage(
            deep=True).sum() / 1024**2
        if self._verbose is True:
            print("@Memory usage after optimization: {:.5f} MB.".format(
                memory_after_reduced))
            print("@Decreased by {:.5f}%.".format(
                100 * (memory_before_reduced - memory_after_reduced) / memory_before_reduced))
        print("-------------------------------------------")
        return self._data_table


In [24]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """针对带有Group id（组id）数据的时间序列交叉验证集合生成类。

    生成针对带有Group id的数据的时序交叉验证集。其中训练与验证的
    Group之间可以指定group_gap，用来隔离时间上的关系。这种情况下
    group_id通常是时间id，例如天或者小时。

    @Parameters:
    ----------
        n_splits: {int-like}, default=5
            切分的集合数目。
        max_train_group_size: {int-like}, default=+inf
            训练集单个组的最大样本数据限制。
        group_gap: {int-like}, default=None
            依据group_id切分组时，训练组与测试组的id的gap数目。
        max_test_group_size: {int-like}, default=+inf
            测试集单个组的最大样本数据限制。

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv
    """
    @_deprecate_positional_args
    def __init__(self, n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """生成训练组与测试组的id索引，返回组索引的生成器。

        @Parameters:
        ----------
            X: {array-like} {n_samples, n_features}
                训练数据，输入形状为{n_samples, n_features}。
            y: {array-like} {n_samples, }
                标签数据，形状为{n_samples, }。
            groups: {array-like} {n_samples, }
                用来依据组来划分训练集与测试集的组id，必须为连续的组id。

        @Yields:
        ----------
            train: ndarray
                依据group_id切分的训练组id。
            test: ndarray
                依据group_id切分的测试组id。
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None ！")

        # 初始化基本参数信息
        X, y, groups = indexable(X, y, groups)
        n_samples, n_splits, group_gap = _num_samples(X), self.n_splits, self.group_gap
        n_folds = n_splits + 1

        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size

        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = np.argsort(ind)
        n_groups = _num_samples(unique_groups)

        # 扫描整个数据id list，构建group_dcit，{group_id: 属于该group的样本的idx}
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        # group_test_size: 每个fold预留的test group的大小
        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array, test_array = [], []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            test_array  = test_array[group_gap:]

            if self.verbose > 0:
                    pass
            yield [int(i) for i in train_array], [int(i) for i in test_array]


def test_purged_group_time_series_split():
    X = train_df.query('date > 85').reset_index(drop=True)[["ts_id", "feature_0"]].values
    y = train_df.query('date > 85').reset_index(drop=True)["resp"].values
    groups = train_df.query('date > 85').reset_index(drop=True)["date"].values

    group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=4, group_gap=31, max_test_group_size=31)
    train_idx, valid_idx = [], []
    for train_idx_tmp, valid_idx_tmp in group_ts_kfolds.split(X=X, y=y, groups=groups):
        train_idx.append(train_idx_tmp)
        valid_idx.append(valid_idx_tmp)

    print([[train_df.iloc[min(item)]["date"].astype(int),
            train_df.iloc[max(item)]["date"].astype(int)] for item in train_idx])
    print([[train_df.iloc[min(item)]["date"].astype(int),
            train_df.iloc[max(item)]["date"].astype(int)] for item in valid_idx])


[[0, 227], [0, 262], [0, 295], [0, 327]]
[[262, 295], [295, 327], [327, 363], [363, 396]]


In [21]:
train = train_df.query('date > 85').reset_index(drop=True)
train.fillna(method="bfill", inplace=True)

# 构造标签
train = train.query('weight > 0').reset_index(drop = True)
train['action'] =  ((train['resp_1'] > 0.00001) & \
                    (train['resp_2'] > 0.00001 ) & \
                    (train['resp_3'] > 0.00001) & \
                    (train['resp_4'] > 0.00001 ) &  \
                    (train['resp'] > 0.00001)).astype('int')
feature_name_list = [c for c in train.columns if 'feature' in c]
resp_name_list = ["resp", "resp_1", "resp_2", "resp_3", "resp_4"]

# 构造自编码器的输入输出
X = train[feature_name_list].values
y = np.stack([(train[c] > 0.000001).astype('int') for c in resp_name_list]).T
group_ids = train["date"].values
print("[INFO] {} Autoencoder data prepared !".format(
    str(datetime.now())[:-4]))

[INFO] 2020-12-30 16:04:18.25 Autoencoder data prepared !


In [5]:
def build_tabular_autoencoder(verbose=False, is_compile=True,
                              stddev=0.05, **kwargs):
    """降噪自编码器实现。针对表格形式数据的降噪自编码器，噪声等级由高斯噪声的stddev参数指定"""
    input_dim = kwargs.pop("input_dim", None)
    n_labels = kwargs.pop("n_labels", None)

    # 构建降噪自编码器
    layer_input = layers.Input(input_dim, dtype='float32')

    layer_encoded = layers.BatchNormalization()(layer_input)
    layer_encoded = layers.GaussianNoise(stddev=stddev)(layer_encoded)
    layer_encoded = layers.Dense(640, activation='relu')(layer_encoded)

    # 解码层1：针对输入的重构
    layer_decoded = layers.Dropout(0.2)(layer_encoded)
    layer_decoded = layers.Dense(input_dim, name='reconstruct_output')(layer_decoded)

    # 解码层2：针对resp的重构
    layer_output = layers.Dense(320, activation='relu')(layer_decoded)
    layer_output = layers.BatchNormalization()(layer_output)
    layer_output = layers.Dropout(0.2)(layer_output)
    layer_output = layers.Dense(n_labels, activation='sigmoid', 
                                name='label_output')(layer_output)

    # 输出层
    encoder_model = models.Model(inputs=layer_input, outputs=layer_decoded)
    autoencoder_model = models.Model(inputs=layer_input, outputs=[layer_decoded, layer_output])

    if verbose:
        autoencoder_model.summary()
    if is_compile:
        autoencoder_model.compile(loss={'reconstruct_output':'mae', 'label_output':'binary_crossentropy'},
                                  metrics={'label_output':'acc'}, 
                                  optimizer=optimizers.Adam(0.006))
    return encoder_model, autoencoder_model

encoder_model, autoencoder_model = build_tabular_autoencoder(
    input_dim=X.shape[1], n_labels=y.shape[1], verbose=False)
print("[INFO] {} Build autoencoder successed !".format(
    str(datetime.now())[:-4]))

[INFO] 2020-12-30 15:17:25.41 Build autoencoder successed !


In [9]:
early_stop = callbacks.EarlyStopping(monitor="val_loss", mode="min",
                                     verbose=1, patience=40,
                                     restore_best_weights=True)
autoencoder_model.fit(x=X, y=[X, y],
                      batch_size=32768,
                      epochs=40,
                      verbose=2,
                      validation_split=0.1,
                      callbacks=[early_stop])

Train on 1414273 samples, validate on 157142 samples
Epoch 1/40
1414273/1414273 - 20s - loss: 1.4524 - reconstruct_output_loss: 0.6946 - label_output_loss: 0.7512 - label_output_acc: 0.5079 - val_loss: 1.3237 - val_reconstruct_output_loss: 0.6246 - val_label_output_loss: 0.6983 - val_label_output_acc: 0.5250
Epoch 2/40
1414273/1414273 - 19s - loss: 1.0493 - reconstruct_output_loss: 0.3514 - label_output_loss: 0.6974 - label_output_acc: 0.5173 - val_loss: 1.0311 - val_reconstruct_output_loss: 0.3408 - val_label_output_loss: 0.6898 - val_label_output_acc: 0.5288
Epoch 3/40
1414273/1414273 - 20s - loss: 1.0005 - reconstruct_output_loss: 0.3084 - label_output_loss: 0.6921 - label_output_acc: 0.5230 - val_loss: 0.9522 - val_reconstruct_output_loss: 0.2628 - val_label_output_loss: 0.6890 - val_label_output_acc: 0.5303
Epoch 4/40
1414273/1414273 - 20s - loss: 0.9866 - reconstruct_output_loss: 0.2960 - label_output_loss: 0.6905 - label_output_acc: 0.5261 - val_loss: 0.8564 - val_reconstruct_ou

<tensorflow.python.keras.callbacks.History at 0x7fb7903e1690>

In [28]:
def build_model(verbose=False, is_compile=True, encoder_model=None, **kwargs):
    """针对二分类任务的MLP模型，使用自编码器的编码层作为预训练层。"""
    input_dim = kwargs.pop("input_dim", None)
    output_dim = kwargs.pop("output_dim", None)
    n_labels = kwargs.pop("n_labels", None)

    # 构造网络结构
    layer_input = layers.Input(input_dim, dtype='float32')
    layer_encoded = encoder_model(layer_input)

    layer_feats = layers.concatenate([layer_input, layer_encoded])
    layer_feats = layers.BatchNormalization()(layer_feats)
    layer_feats = layers.Lambda(tf.keras.activations.selu)(layer_feats)

    # 特征抽取
    layer_dense = layers.Dense(128, activation="relu")(layer_feats)
    layer_dense = layers.BatchNormalization()(layer_dense)
    layer_dense = layers.Dropout(0.3)(layer_dense)

    layer_dense = layers.Dense(32, activation="relu")(layer_dense)
    layer_dense = layers.BatchNormalization()(layer_dense)
    layer_dense = layers.Dropout(0.3)(layer_dense)

    # 输出层构造与模型构造
    layer_output = layers.Dense(output_dim, activation='sigmoid', name="label_output")(layer_dense)
    model = models.Model(layer_input, layer_output)

    if verbose:
        model.summary()
    if is_compile:
        model.compile(loss={'label_output':'binary_crossentropy'},
                      metrics=[tf.keras.metrics.AUC(name='auc')],
                      optimizer=optimizers.Adam(0.006))
    return model

print("[INFO] {} Build MLP Model successed !".format(
    str(datetime.now())[:-4]))

# 构造mlp模型
# mlp_model = build_model(verbose=False, encoder_model=encoder_model,
#                         input_dim=X.shape[1], output_dim=y.shape[1])
# print("[INFO] {} Build MLP successed !".format(
#     str(datetime.now())[:-4]))


In [29]:
# 训练前全局准备
MODELS = []
encoder_model.trainable = False
early_stop = callbacks.EarlyStopping(monitor="val_auc", mode="max",
                                     verbose=1, patience=30,
                                     restore_best_weights=True)
group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=6, group_gap=31, max_test_group_size=31)
group_ids = group_ids - min(group_ids)

# 开始训练模型
for train_idx, valid_idx in group_ts_kfolds.split(X=X, y=y, groups=group_ids):
    X_train, X_val = X[train_idx], X[valid_idx]
    y_train, y_val = y[train_idx], y[valid_idx]

    # 准备模型
    mlp_model = build_model(verbose=False, encoder_model=encoder_model,
                            input_dim=X_train.shape[1], output_dim=y_train.shape[1])

    mlp_model.fit(x=X_train, y=y_train,
                  validation_data=(X_val, y_val),
                  batch_size=32768,
                  epochs=5,
                  verbose=2,
                  callbacks=[early_stop])

    # 保存模型
    MODELS.append(mlp_model)



Train on 666858 samples, validate on 119439 samples
Epoch 1/5
666858/666858 - 9s - loss: 0.7650 - auc: 0.5078 - val_loss: 0.8711 - val_auc: 0.5156
Epoch 2/5
666858/666858 - 9s - loss: 0.7102 - auc: 0.5126 - val_loss: 0.7210 - val_auc: 0.5258
Epoch 3/5
666858/666858 - 9s - loss: 0.6989 - auc: 0.5198 - val_loss: 0.6969 - val_auc: 0.5315
Epoch 4/5
666858/666858 - 9s - loss: 0.6944 - auc: 0.5270 - val_loss: 0.6915 - val_auc: 0.5378
Epoch 5/5
666858/666858 - 9s - loss: 0.6922 - auc: 0.5331 - val_loss: 0.6904 - val_auc: 0.5408
Train on 783335 samples, validate on 130051 samples
Epoch 1/5
783335/783335 - 11s - loss: 0.7458 - auc: 0.5083 - val_loss: 0.7381 - val_auc: 0.5181
Epoch 2/5
783335/783335 - 10s - loss: 0.7039 - auc: 0.5147 - val_loss: 0.6975 - val_auc: 0.5230
Epoch 3/5
783335/783335 - 11s - loss: 0.6957 - auc: 0.5239 - val_loss: 0.6927 - val_auc: 0.5288
Epoch 4/5
783335/783335 - 11s - loss: 0.6927 - auc: 0.5318 - val_loss: 0.6915 - val_auc: 0.5327
Epoch 5/5
783335/783335 - 11s - loss:

In [None]:
train_df.info()