In [None]:
import os
import gc
import time
import warnings
from datetime import datetime

from numba import njit, jit
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import iinfo, finfo, int8, int16, int32, int64, float32, float64

import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

import lightgbm as lgb
import xgboost as xgb

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# Render every plotly figure with the dark template
pio.templates.default = "plotly_dark"
plot_config = {'scrollZoom': True, 'displayModeBar': True, 'displaylogo': False}
sns.set(style="ticks", font_scale=1.2, palette='deep', color_codes=True)

# Matplotlib colour-cycle aliases "C0" .. "C9"
colors = ["C" + str(idx) for idx in range(10)]

# Default plotly colour palette (hex codes, in plotly's default order)
default_color_list = [
    '#1f77b4', '#ff7f0e',  # muted blue, safety orange
    '#2ca02c', '#d62728',  # cooked asparagus green, brick red
    '#9467bd', '#8c564b',  # muted purple, chestnut brown
    '#e377c2', '#7f7f7f',  # raspberry yogurt pink, middle gray
    '#bcbd22', '#17becf',  # curry yellow-green, blue-teal
]

# Fix the global random seed for numpy and tensorflow
GLOBAL_RANDOM_SEED = 2148
np.random.seed(GLOBAL_RANDOM_SEED)
tf.random.set_seed(GLOBAL_RANDOM_SEED)

# Silence all warnings and cap the number of DataFrame columns displayed
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)


In [None]:
# Load the competition data.
# The dataset directory and the row cap for train.csv are defined once here so
# they can be changed in a single place instead of inside every read_csv call.
DATA_DIR = './data/jane-street-market-prediction'
TRAIN_NROWS = None  # None reads the whole file; set an int to load only the first rows

load_data_start_time = time.time()
train_df = pd.read_csv(
    os.path.join(DATA_DIR, 'train.csv'), nrows=TRAIN_NROWS)
feat_df = pd.read_csv(
    os.path.join(DATA_DIR, 'features.csv'))
example_test_df = pd.read_csv(
    os.path.join(DATA_DIR, 'example_test.csv'))
example_prediction_df = pd.read_csv(
    os.path.join(DATA_DIR, 'example_sample_submission.csv'))
load_data_end_time = time.time()

# Report how long loading took and the shape of every frame that was read.
# The timestamp is str(datetime.now()) with its last four characters dropped.
print("[INFO] {} End Reading ! It took {:.2f} seconds !".format(
    str(datetime.now())[:-4], load_data_end_time-load_data_start_time))
print("[INFO] {} Basic data description: ".format(str(datetime.now())[:-4]))
print("    -- train_df shape: {}".format(
    train_df.shape))
print("    -- example_test_df shape: {}".format(
    example_test_df.shape))
print("    -- feat_df shape: {}".format(
    feat_df.shape))
print("    -- example_prediction_df shape: {}".format(
    example_prediction_df.shape))

In [None]:
# Keep only the data recorded after the strategy change (date > 85)
train = train_df.query('date > 85').reset_index(drop=True)

# Build the labels: keep rows with a positive weight, then set action=1 only
# when every resp column is above the threshold.
train = train.query('weight > 0').reset_index(drop=True)
resp_name_list = ["resp", "resp_1", "resp_2", "resp_3", "resp_4"]
train['action'] = (train[resp_name_list] > 0.00001).all(axis=1).astype('int')
feature_name_list = [c for c in train.columns if 'feature' in c]

# Fill missing feature values with the column mean, keeping the means for later use.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# train[name] is chained assignment, which warns on recent pandas and does not
# update `train` at all when Copy-on-Write is enabled.
mean_val_list = []
for name in feature_name_list:
    mean_val = train[name].mean()
    train[name] = train[name].fillna(mean_val)
    mean_val_list.append(mean_val)
mean_val_array = np.array(mean_val_list)

# Build the autoencoder inputs and targets.
# X holds the feature columns; y holds one 0/1 column per entry of resp_name_list.
# NOTE(review): y uses a 0.000001 threshold while `action` above uses 0.00001 —
# confirm whether the difference is intentional.
X = train[feature_name_list].values
y = np.stack([(train[c] > 0.000001).astype('int') for c in resp_name_list]).T
train_dates = train["date"].values
train_weights = train["weight"].values
train_resp = train["resp"].values

print("[INFO] {} Autoencoder data prepared !".format(
    str(datetime.now())[:-4]))