In [None]:
import os
import gc
import time
import warnings
from datetime import datetime

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks

from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# Render every plotly figure with the dark template.
pio.templates.default = "plotly_dark"
plot_config = {'scrollZoom': True, 'displayModeBar': True, 'displaylogo': False}
sns.set(style="ticks", font_scale=1.2, palette='deep', color_codes=True)
# Matplotlib colour-cycle aliases "C0" .. "C9".
colors = ["C{}".format(cycle_pos) for cycle_pos in range(10)]

# Default plotly colour codes.
default_color_list = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf',  # blue-teal
]

# Fix the global random seed for numpy and tensorflow, and silence warnings.
GLOBAL_RANDOM_SEED = 2022
np.random.seed(GLOBAL_RANDOM_SEED)
tf.random.set_seed(GLOBAL_RANDOM_SEED)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)


In [None]:

# Read the four competition CSV files and time the whole load.
load_data_start_time = time.time()
train_df = pd.read_csv('./data/jane-street-market-prediction/train.csv',
                       nrows=None)
feat_df = pd.read_csv('./data/jane-street-market-prediction/features.csv')
example_test_df = pd.read_csv('./data/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv(
    './data/jane-street-market-prediction/example_sample_submission.csv')
load_data_end_time = time.time()

# Report the elapsed time and the shape of every loaded table.
print(f"[INFO] {str(datetime.now())[:-4]} End Reading ! "
      f"It took {load_data_end_time-load_data_start_time:.2f} seconds !")
print(f"[INFO] {str(datetime.now())[:-4]} Basic data description: ")
print(f"    -- train_df shape: {train_df.shape}")
print(f"    -- example_test_df shape: {example_test_df.shape}")
print(f"    -- feat_df shape: {feat_df.shape}")
print(f"    -- sample_prediction_df shape: {sample_prediction_df.shape}")

In [77]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time-series cross-validator that splits on whole groups with a gap.

    Samples are assigned to folds by their group id (typically a time id
    such as a day or an hour). Groups are ordered by first appearance in
    ``groups``; each fold tests on a block of consecutive groups and trains
    on groups that end ``group_gap`` groups before the test block starts.

    @Parameters:
    ----------
        n_splits: {int-like}, default=5
            Number of (train, test) folds to generate.
        max_train_group_size: {int-like}, default=+inf
            Maximum number of groups in the training window of one fold.
        group_gap: {int-like}, default=None
            Number of groups left out between the end of the training
            window and the start of the test window. ``None`` means 0.
        max_test_group_size: {int-like}, default=+inf
            Maximum number of groups in the test window of one fold.
        verbose: {bool-like}, default=False
            Stored on the instance; not used by ``split``.

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv
    """
    @_deprecate_positional_args
    def __init__(self, n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    @staticmethod
    def _collect_indices(member_arrays):
        """Merge per-group sample-index arrays into one ascending array."""
        if not member_arrays:
            return np.empty(0, dtype=np.intp)
        return np.sort(np.concatenate(member_arrays))

    def split(self, X, y=None, groups=None):
        """Generate train/test sample indices, one pair per fold.

        @Parameters:
        ----------
            X: {array-like} {n_samples, n_features}
                Training data.
            y: {array-like} {n_samples, }
                Targets; only checked for consistent length.
            groups: {array-like} {n_samples, }
                Group id of every sample. Groups are ordered by the
                position of their first sample.

        @Yields:
        ----------
            train: list of int
                Ascending sample indices of the training window.
            test: list of int
                Ascending sample indices of the test window.

        @Raises:
        ----------
            ValueError
                If ``groups`` is None, if there are fewer groups than
                ``n_splits + 1``, if the test window is empty, or if the
                gap leaves the first fold without any training group.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None ！")

        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        # None used to crash on ``group_test_start - group_gap``; treat it as
        # "no gap".
        group_gap = 0 if self.group_gap is None else int(self.group_gap)
        max_train_group_size = self.max_train_group_size

        # unique_vals is sorted by value; first_seen holds the position of the
        # first sample of each value and inverse maps every sample to its
        # slot in unique_vals.
        unique_vals, first_seen, inverse = np.unique(
            np.asarray(groups), return_index=True, return_inverse=True)
        inverse = np.ravel(inverse)
        n_groups = len(unique_vals)

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        # Bucket the sample indices per group in one pass. The stable sort
        # keeps indices ascending inside every bucket.
        sample_order = np.argsort(inverse, kind="stable")
        bucket_edges = np.cumsum(np.bincount(inverse, minlength=n_groups))[:-1]
        buckets = np.split(sample_order, bucket_edges)
        # Re-order the buckets by first appearance so the fold arithmetic
        # below works on real groups for any id values, not just 0..n-1.
        ordered_members = [buckets[slot] for slot in np.argsort(first_seen)]

        # Number of groups reserved for the test window of every fold.
        group_test_size = int(min(n_groups // n_folds, self.max_test_group_size))
        if group_test_size < 1:
            raise ValueError(
                "The test window must contain at least one group, got "
                "max_test_group_size={0}".format(self.max_test_group_size))

        first_test_start = n_groups - n_splits * group_test_size
        # A non-positive training end would either leave no training data or
        # become a negative slice end that silently pulls test and future
        # groups into the training window.
        if first_test_start - group_gap <= 0:
            raise ValueError(
                ("Too many splits={0} for number of groups={1} with test "
                 "group size={2} and group_gap={3}: the first fold would "
                 "have no training groups").format(
                     n_splits, n_groups, group_test_size, group_gap))

        for group_test_start in range(first_test_start, n_groups,
                                      group_test_size):
            train_stop = group_test_start - group_gap
            if np.isinf(max_train_group_size):
                train_start = 0
            else:
                train_start = max(0, train_stop - int(max_train_group_size))

            train_array = self._collect_indices(
                ordered_members[train_start:train_stop])
            test_array = self._collect_indices(
                ordered_members[group_test_start:
                                group_test_start + group_test_size])
            # NOTE(review): this drops the first ``group_gap`` *samples* (not
            # groups) of the test window. Kept so existing folds stay
            # unchanged; confirm whether it is intended.
            test_array = test_array[group_gap:]

            yield train_array.tolist(), test_array.tolist()


def test_purged_group_time_series_split():
    """Smoke-check PurgedGroupTimeSeriesSplit on the global ``train_df``.

    Splits ``train_df`` by its "date" column into 4 folds (gap of 31 groups,
    at most 31 test groups) and prints two lists: for every fold, the "date"
    at the smallest and largest training row index, then the same for the
    validation row indices.
    """
    splitter = PurgedGroupTimeSeriesSplit(
        n_splits=4, group_gap=31, max_test_group_size=31)
    folds = list(splitter.split(
        X=train_df[["ts_id", "feature_0"]].values,
        y=train_df["resp"].values,
        groups=train_df["date"].values))
    train_idx = [fold_train for fold_train, _ in folds]
    valid_idx = [fold_valid for _, fold_valid in folds]

    def date_span(row_indices):
        # "date" of the rows at the lowest and highest positional index.
        return [train_df.iloc[min(row_indices)]["date"].astype(int),
                train_df.iloc[max(row_indices)]["date"].astype(int)]

    print([date_span(item) for item in train_idx])
    print([date_span(item) for item in valid_idx])


test_purged_group_time_series_split()

[[0, 344], [0, 375], [0, 406], [0, 437]]
[[376, 406], [407, 437], [438, 468], [469, 499]]


In [None]:
def build_tabular_autoencoder(verbose=False, is_compile=True,
                              stddev=0.05, **kwargs):
    """Build a denoising autoencoder with a supervised head for tabular data.

    The input is batch-normalised, perturbed with Gaussian noise and encoded
    by a 640-unit ReLU layer. A linear ``decoded`` layer reconstructs the
    input, and a small network on top of the reconstruction produces the
    sigmoid ``label_output``.

    @Parameters:
    ----------
        verbose: {bool-like}, default=False
            If true, print the summary of the full model.
        is_compile: {bool-like}, default=True
            If true, compile the full model with Adam(0.001), MSE loss on
            ``decoded`` and binary cross-entropy on ``label_output``.
        stddev: {float-like}, default=0.05
            Standard deviation of the Gaussian noise added to the
            normalised input.
        **kwargs:
            input_dim: {int-like}, required
                Number of input features.
            output_dim: {int-like}, required
                Number of units of ``label_output``.

    @Returns:
    ----------
        autoencoder: keras.Model
            Maps the input to ``[decoded, label_output]``.
        encoder: keras.Model
            Maps the input to the 640-unit encoding.

    @Raises:
    ----------
        KeyError
            If ``input_dim`` or ``output_dim`` is missing from ``kwargs``.
    """
    input_dim = kwargs.pop("input_dim")
    output_dim = kwargs.pop("output_dim")
    # Layer classes come from the already-imported ``layers`` module; the
    # bare names used before (Input, Dense, ...) were never imported.
    layer_input = layers.Input(shape=(input_dim,), dtype='float32')

    encoded = layers.BatchNormalization()(layer_input)
    encoded = layers.GaussianNoise(stddev=stddev)(encoded)
    encoded = layers.Dense(640, activation='relu')(encoded)
    decoded = layers.Dropout(0.2)(encoded)
    decoded = layers.Dense(input_dim, name='decoded')(decoded)
    x = layers.Dense(320, activation='relu')(decoded)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(output_dim, activation='sigmoid', name='label_output')(x)

    encoder = keras.Model(inputs=layer_input, outputs=encoded)
    autoencoder = keras.Model(inputs=layer_input, outputs=[decoded, x])

    if is_compile:
        autoencoder.compile(
            optimizer=optimizers.Adam(0.001),
            loss={'decoded': 'mse', 'label_output': 'binary_crossentropy'})
    if verbose:
        autoencoder.summary()
    return autoencoder, encoder

In [None]:
# Preview the first 25 rows, restricted to a handful of columns.
train_df.head(25).loc[:, [
    "date", "weight",
    "resp_1", "resp_2", "resp_3", "resp_4", "resp",
    "feature_0", "feature_129", "ts_id",
]]

In [None]:
# Print pandas' concise summary of the training frame.
train_df.info()

In [None]:
# Cumulative sum of resp_1 .. resp_4 over the first rows of train_df, one
# subplot per response column on a 2 x 2 grid.
plot_config = dict({'scrollZoom': False, 'displayModeBar': True, 'displaylogo': False})
fig = make_subplots(rows=2, cols=2)

# Number of leading rows that are plotted.
N_PLOT_ROWS = 200000
# The x axis is identical for every subplot, so slice it once.
ts_id = train_df["ts_id"].head(N_PLOT_ROWS).values

ind = 1
for row in range(1, 2+1):
    for col in range(1, 2+1):
        resp_name = "resp_{}".format(ind)
        resp_tmp = np.cumsum(train_df[resp_name].head(N_PLOT_ROWS).values)

        # The trace is named after the plotted column; it was previously
        # named after the subplot row, which labelled two traces "resp_1"
        # and two "resp_2".
        fig.add_trace(go.Scatter(x=ts_id, y=resp_tmp,
                                 mode="lines", line_width=2,
                                 line_color=default_color_list[ind],
                                 name=resp_name), row=row, col=col)
        fig.update_yaxes(title_text=resp_name, ticks="outside",
                         row=row, col=col, automargin=True)
        ind += 1

# Rendering is currently disabled in this cell; uncomment to display the figure.
# fig.update_layout(go.Layout(title="resp plot", width=900, height=800,
#                             showlegend=False))
# fig.show(config=plot_config)