In [1]:
import os
import gc
import time
import warnings
from datetime import datetime

from numba import njit, jit
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import iinfo, finfo, int8, int16, int32, int64, float32, float64

import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
import xgboost as xgb

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import HTML

# 设置plotly为暗黑模式
pio.templates.default = "plotly_dark"
plot_config = dict({'scrollZoom': True, 'displayModeBar': True, 'displaylogo': False})
sns.set(style="ticks", font_scale=1.2, palette='deep', color_codes=True)
colors = ["C" + str(i) for i in range(0, 9+1)]

# 默认plotly色号
default_color_list = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
    ]

# 设定全局随机种子，并且屏蔽warnings
GLOBAL_RANDOM_SEED = 2022
np.random.seed(GLOBAL_RANDOM_SEED)
tf.random.set_seed(GLOBAL_RANDOM_SEED)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

# 检查GPU设备情况
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # 限制只能使用第一块GPU（通过GPU的List的id指定）
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
# 导入数据
load_data_start_time = time.time()
train_df  = pd.read_csv(
    './data/jane-street-market-prediction/train.csv', nrows=None)
feat_df = pd.read_csv(
    './data/jane-street-market-prediction/features.csv')
example_test_df = pd.read_csv(
    './data/jane-street-market-prediction/example_test.csv')
example_prediction_df = pd.read_csv(
    './data/jane-street-market-prediction/example_sample_submission.csv')
load_data_end_time = time.time()

# 打印数据基本情况
print("[INFO] {} End Reading ! It took {:.2f} seconds !".format(
    str(datetime.now())[:-4], load_data_end_time-load_data_start_time))
print("[INFO] {} Basic data description: ".format(str(datetime.now())[:-4]))
print("    -- train_df shape: {}".format(
    train_df.shape))
print("    -- example_test_df shape: {}".format(
    example_test_df.shape))
print("    -- feat_df shape: {}".format(
    feat_df.shape))
print("    -- example_prediction_df shape: {}".format(
    example_prediction_df.shape))

[INFO] 2021-01-14 12:01:04.42 End Reading ! It took 95.12 seconds !
[INFO] 2021-01-14 12:01:04.42 Basic data description: 
    -- train_df shape: (2390491, 138)
    -- example_test_df shape: (15219, 133)
    -- feat_df shape: (130, 30)
    -- example_prediction_df shape: (15219, 2)


In [21]:
def gen_test_data(test_df=None, pred_df=None):
    """测试数据生成器。用于模拟测试数据生成过程，测试模型提交正确性与效率。"""
    n_test = len(test_df)

    for i in range(n_test):
        yield test_df.iloc[i], pred_df.iloc[i]


@jit
def njit_fillna(array, values):
    """利用即时编译（jit）对array数组的NaN值借助values进行填充。

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function
    """
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array


def custom_metric(dates_array=None,
                  weights_array=None,
                  resp_array=None,
                  action_label_array=None):
    """依据官方要求的Metric，计算分数。

    @References:
    ----------
    [1] https://www.kaggle.com/c/jane-street-market-prediction/discussion/199107
    [2] https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
    [3] 
    """
    tmp_df = pd.DataFrame({"date": dates_array,
                           "weight": weights_array,
                           "resp": resp_array,
                           "action": action_label_array})
    tmp_df["p"] = tmp_df["weight"]  * tmp_df["resp"] * tmp_df["action"]
    # tmp_df = tmp_df.query("weight != 0").reset_index(drop=True)
    p_i_val = tmp_df.groupby(["date"])["p"].sum().values

    n_dates = len(p_i_val)
    t = np.sum(p_i_val) / np.sqrt(np.sum(p_i_val ** 2)) * (np.sqrt(250 / n_dates))
    return min(max(t, 0), 6) * np.sum(p_i_val)


In [7]:
"""
数据预处理部分。包括标签生成、数据统计值获取。
"""
# 挑选策略变化之后的数据
train = train_df.query('date > 85').reset_index(drop=True)
train = train.query('weight > 0').reset_index(drop = True)

target_threshold = 0.00001

# 构造标签
train['action'] =  ((train['resp_1'] > target_threshold) & \
                    (train['resp_2'] > target_threshold) & \
                    (train['resp_3'] > target_threshold) & \
                    (train['resp_4'] > target_threshold) &  \
                    (train['resp'] > target_threshold)).astype('int')
feature_name_list = [c for c in train.columns if 'feature' in c]
resp_name_list = ["resp", "resp_1", "resp_2", "resp_3", "resp_4"]

# 使用均值填充缺失值
mean_val_list = []
for name in feature_name_list:
    mean_val = train[name].mean()
    train[name].fillna(mean_val, inplace=True)
    mean_val_list.append(mean_val)
mean_val_array = np.array(mean_val_list)

# 构造后续神经网络模型的输入输出
X = train[feature_name_list].values.astype("float32")
y = np.hstack([(train[c] > target_threshold).astype(
    'int').values.reshape(-1, 1) for c in resp_name_list])

train_dates = train["date"].values
train_weights = train["weight"].values
train_resp = train["resp"].values

print("[INFO] {} Data prepared !".format(
    str(datetime.now())[:-4]))

[INFO] 2021-01-14 12:03:05.04 Data prepared !


In [4]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """针对带有Group id（组id）数据的时间序列交叉验证集合生成类。

    生成针对带有Group id的数据的时序交叉验证集。其中训练与验证的
    Group之间可以指定group_gap，用来隔离时间上的关系。这种情况下
    group_id通常是时间id，例如天或者小时。

    @Parameters:
    ----------
        n_splits: {int-like}, default=5
            切分的集合数目。
        max_train_group_size: {int-like}, default=+inf
            训练集单个组的最大样本数据限制。
        group_gap: {int-like}, default=None
            依据group_id切分组时，训练组与测试组的id的gap数目。
        max_test_group_size: {int-like}, default=+inf
            测试集单个组的最大样本数据限制。

    @References:
    ----------
    [1] https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv
    """

    @_deprecate_positional_args
    def __init__(self, n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """生成训练组与测试组的id索引，返回组索引的生成器。

        @Parameters:
        ----------
            X: {array-like} {n_samples, n_features}
                训练数据，输入形状为{n_samples, n_features}。
            y: {array-like} {n_samples, }
                标签数据，形状为{n_samples, }。
            groups: {array-like} {n_samples, }
                用来依据组来划分训练集与测试集的组id，必须为连续的组id。

        @Yields:
        ----------
            train: ndarray
                依据group_id切分的训练组id。
            test: ndarray
                依据group_id切分的测试组id。
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None ！")
        for i in range(1, len(groups)):
            if groups[i] < groups[i-1]:
                raise ValueError("groups must be a monotone increasing sequence !")

        # 初始化基本参数信息
        X, y, groups = indexable(X, y, groups)
        n_samples, n_splits, group_gap = X.shape[0], self.n_splits, self.group_gap
        n_folds = n_splits + 1

        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size

        # 使得groups的id取值从0顺序开始（假定groups是递增的）
        groups_reid, groupid2reid, index_tmp = [], {}, -1
        for _, item in enumerate(groups):
            if item not in groupid2reid:
                index_tmp += 1
                groupid2reid[item] = index_tmp
            groups_reid.append(index_tmp)

        group_dict = {}
        u, ind = np.unique(groups_reid, return_index=True)
        unique_groups = np.argsort(ind)
        n_groups = _num_samples(unique_groups)

        # 扫描整个数据id list，构建group_dcit，{group_id: 属于该group的样本的idx}
        for idx in np.arange(n_samples):
            if groups_reid[idx] in group_dict:
                group_dict[groups_reid[idx]].append(idx)
            else:
                group_dict[groups_reid[idx]] = [idx]

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        # group_test_size: 每个fold预留的test group的大小
        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array, test_array = [], []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                    np.concatenate((train_array, train_array_tmp)),
                    axis=None), axis=None)
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                    np.concatenate((test_array, test_array_tmp)),
                    axis=None), axis=None)
            test_array  = test_array[group_gap:]

            if self.verbose > 0:
                    pass
            yield [int(i) for i in train_array], [int(i) for i in test_array]


def test_purged_group_time_series_split():
    X = train_df.query('date > 85').reset_index(drop=True)[["ts_id", "feature_0"]].values
    y = train_df.query('date > 85').reset_index(drop=True)["resp"].values
    groups = train_df.query('date > 85').reset_index(drop=True)["date"].values

    group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=7, group_gap=20, max_train_group_size=80, max_test_group_size=60)
    train_idx, valid_idx = [], []
    for train_idx_tmp, valid_idx_tmp in group_ts_kfolds.split(X=X, y=y, groups=groups):
        train_idx.append(train_idx_tmp)
        valid_idx.append(valid_idx_tmp)

        print("train range: {}, valid range: {}".format(
            [min(groups[train_idx_tmp]), max(groups[train_idx_tmp])],
            [min(groups[valid_idx_tmp]), max(groups[valid_idx_tmp])]
        ))

# test_purged_group_time_series_split()

In [5]:
"""
特征工程辅助工具。
"""

def feat_pca(feat_array=None, n_dims=30):
    """利用PCA，将feat_array降维至n_dims维度。"""
    if feat_array.shape[1] <= n_dims:
        raise ValueError("n_dims must smaller than the dim of feat_array !")

    # 归一化feat_array
    X_sc = StandardScaler()
    X_sc.fit(feat_array)
    feat_array = X_sc.transform(feat_array)

    # 降维
    pca = PCA(n_components=n_dims)
    pca.fit(feat_array)
    feat_array_pca = pca.transform(feat_array)

    return X_sc, pca, feat_array_pca


@njit
def njit_search_best_thresold_acc(y_pred_proba, y_true):
    """通过阈值搜索最优的准确率切分阈值."""
    best_acc, best_threshold = 0, 0
    for threshold in range(4500, 5800):
        thresold_tmp = threshold / 10000
        y_pred_label = np.where(y_pred_proba > thresold_tmp, 1, 0)
        score_tmp = np.sum(np.where(y_true == y_pred_label, 1, 0)) / len(y_true)

        if score_tmp > best_acc:
            best_acc = score_tmp
            best_threshold = thresold_tmp
    return best_acc, best_threshold


@njit
def njit_custom_metric(dates_array=None,
                       weights_array=None,
                       resp_array=None,
                       action_label_array=None):
    """利用njit装饰器与numpy来计算Kaggle官方要求的Metric。

    @References:
    ----------
    [1] https://www.kaggle.com/c/jane-street-market-prediction/discussion/199107
    [2] https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation
    """
    p_array = weights_array * resp_array * action_label_array

    n_unique_dates = np.max(dates_array) - np.min(dates_array) + 1
    dates_array = dates_array - np.min(dates_array)

    p_i_val = np.zeros((n_unique_dates, ))
    for ind, item in enumerate(dates_array):
        p_i_val[item] += p_array[ind]

    t = np.sum(p_i_val) / np.sqrt(np.sum(p_i_val ** 2)) * (np.sqrt(250 / n_unique_dates))
    return min(max(t, 0), 6) * np.sum(p_i_val)


@njit
def njit_search_best_thresold_custom(y_pred_proba=None,
                                     dates_array=None,
                                     weights_array=None,
                                     resp_array=None):
    """通过阈值搜索最优的kaggle官方评分的切分阈值."""
    best_acc, best_threshold = 0, 0
    for threshold in range(4500, 5800):
        thresold_tmp = threshold / 10000
        y_pred_label = np.where(y_pred_proba > thresold_tmp, 1, 0)
        score_tmp = njit_custom_metric(dates_array=dates_array,
                                       weights_array=weights_array,
                                       resp_array=resp_array,
                                       action_label_array=y_pred_label)

        if score_tmp > best_acc:
            best_acc = score_tmp
            best_threshold = thresold_tmp
    return best_acc, best_threshold


print("[INFO] {} Tools prepared !".format(
    str(datetime.now())[:-4]))

[INFO] 2021-01-14 12:01:49.00 Tools prepared !


In [None]:
# clf = xgb.XGBClassifier(
#     n_estimators=500,
#     max_depth=11,
#     learning_rate=0.05,
#     subsample=0.9,
#     colsample_bytree=0.7,
#     missing=-999,
#     random_state=2020,
#     tree_method='gpu_hist'  # THE MAGICAL PARAMETER
# )

In [20]:
# 训练前全局参数准备
N_SPLITS = 7
MODELS = []
group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=N_SPLITS, group_gap=5, max_test_group_size=60)

# 开始训练模型
valid_acc_total, valid_roc_auc_total, valid_custom_total = [], [], []

print("[INFO] {} Model training start:".format(str(datetime.now())[:-4]))
print("=========================================")

for fold, (train_idx, valid_idx) in enumerate(group_ts_kfolds.split(X=X, y=y, groups=train_dates)):
    #####################################################
    # Cross validation的数据准备
    X_train, X_val = X[train_idx], X[valid_idx]
    y_train, y_val = y[train_idx], y[valid_idx]

    X_train_weight, X_val_weight = train_weights[train_idx], train_weights[valid_idx]
    X_train_resp, X_val_resp = train_resp[train_idx], train_resp[valid_idx]
    X_train_dates, X_val_dates = train_dates[train_idx], train_dates[valid_idx]

    #####################################################
    # STEP 1: 进行特征工程
    std_scaler, pca_transformer, X_train_pca = feat_pca(feat_array=X_train, n_dims=30)

    X_val_pca = std_scaler.transform(X_val)
    X_val_pca = pca_transformer.transform(X_val)

    X_train = np.hstack([X_train, X_train_pca])
    X_val = np.hstack([X_val, X_val_pca])

    #####################################################
    # STEP 2: 分5次拟合5个lightgbm模型（对应于5个resp值）
    lgb_params = {"boosting_type": "gbdt",   # boosting="gbdt"
                  "objective": "binary",
                  "num_leaves": 32,
                  "max_depth": 4,
                  "learning_rate": 0.04,
                  "subsample_freq": 1,        # bagging_freq=1
                  "subsample": 0.95,          # bagging_fraction=0.9
                  "colsample_bytree": 0.95,   # feature_fraction=0.9
                  "reg_alpha": 0,
                  "reg_lambda": 0.12,
                  "n_jobs": -1,
                  "n_estimators": 3000,
                  "random_state": 956,
                  "metric": "auc",
                  "verbose": -1}

    valid_pred_proba_list, models_tmp = [], []
    for i in range(y_val.shape[1]):
        lgb_clf = lgb.LGBMClassifier(**lgb_params)
        lgb_clf.fit(X_train, y_train[:, i], eval_set=[(X_val, y_val[:, i])],
            early_stopping_rounds=100, verbose=False)

        valid_pred_proba_list.append(lgb_clf.predict_proba(
            X_val, num_iteration=lgb_clf.best_iteration_))
        models_tmp.append([lgb_clf, lgb_clf.best_iteration_])

    valid_pred_proba = np.mean(valid_pred_proba_list, axis=0)[:, 1]

    #####################################################
    # STEP 4: 寻找最优valid的阈值
    best_custom, THRESHOLD = njit_search_best_thresold_custom(
            y_pred_proba=valid_pred_proba,
            dates_array=X_val_dates,
            weights_array=X_val_weight,
            resp_array=X_val_resp)

    #####################################################
    # STEP 5: valid data上按照官方metric进行结果评估
    valid_pred_label = np.where(
            valid_pred_proba>=THRESHOLD, 1, 0).astype(int)
    valid_custom_metric = custom_metric(dates_array=X_val_dates,
                                        weights_array=X_val_weight,
                                        action_label_array=valid_pred_label,
                                        resp_array=X_val_resp)
    valid_acc = accuracy_score(y_val[:, 0].reshape(-1, 1),
                               valid_pred_label.reshape(-1, 1))
    valid_roc_auc = roc_auc_score(y_val[:, 0].reshape(-1, 1),
                                  valid_pred_label.reshape(-1, 1))

    # 标准打印训练信息
    print("-- folds {}({})(train_range: {}->{}, valid_range: {}->{}), valid_acc: {:.4f}, valid_roc_auc: {:.4f}, valid_custom: {:.4f}".format(
            fold+1, N_SPLITS, min(X_train_dates), max(X_train_dates), min(X_val_dates), max(X_val_dates), valid_acc, valid_roc_auc, valid_custom_metric))

    #####################################################
    # STEP 4: 保存模型与关键训练指标
    MODELS.append(models_tmp)
    valid_acc_total.append(valid_acc)
    valid_roc_auc_total.append(valid_roc_auc)
    valid_custom_total.append(valid_custom_metric)

    # 强制内存回收
    del X_train, X_val, y_train, y_val, X_train_pca, X_val_pca
    del X_train_weight, X_val_weight, X_train_resp, X_val_resp, X_train_dates, X_val_dates
    gc.collect()

# 打印总体分数指标
print("-- total metric, valid_acc: {:.4f}, valid_roc_auc: {:.4f}, valid_custom: {:.4f}".format(
        np.mean(valid_acc_total), np.mean(valid_roc_auc_total), np.mean(valid_custom_total)))

print("=========================================")
print("[INFO] {} Model training end.".format(str(datetime.now())[:-4]))


[INFO] 2021-01-14 12:22:52.93 Model training start:


NameError: name 'custom_metric' is not defined

In [None]:
# 训练前全局参数准备
N_SPLITS = 7
MODELS = []
group_ts_kfolds = PurgedGroupTimeSeriesSplit(
        n_splits=N_SPLITS, group_gap=5, max_test_group_size=60)

# 开始训练模型
valid_acc_total, valid_roc_auc_total, valid_custom_total = [], [], []

print("[INFO] {} Model training start:".format(str(datetime.now())[:-4]))
print("=========================================")

for fold, (train_idx, valid_idx) in enumerate(group_ts_kfolds.split(X=X, y=y, groups=train_dates)):
    #####################################################
    # Cross validation的数据准备
    X_train, X_val = X[train_idx], X[valid_idx]
    y_train, y_val = y[train_idx], y[valid_idx]

    X_train_weight, X_val_weight = train_weights[train_idx], train_weights[valid_idx]
    X_train_resp, X_val_resp = train_resp[train_idx], train_resp[valid_idx]
    X_train_dates, X_val_dates = train_dates[train_idx], train_dates[valid_idx]

    #####################################################
    # STEP 1: 进行特征工程
    std_scaler, pca_transformer, X_train_pca = feat_pca(feat_array=X_train, n_dims=30)

    X_val_pca = std_scaler.transform(X_val)
    X_val_pca = pca_transformer.transform(X_val)

    X_train = np.hstack([X_train, X_train_pca])
    X_val = np.hstack([X_val, X_val_pca])

    #####################################################
    # STEP 2: 分5次拟合5个lightgbm模型（对应于5个resp值）
    xgb_params = {"n_estimators": 5000,
                  "max_depth": 4,
                  "learning_rate": 0.04,
                  "verbosity": 0,
                  "objective": "binary:logistic",
                  "booster": "gbtree",
                  "colsample_bytree": 0.95,
                  "colsample_bylevel": 0.95,
                  "subsample": 0.95,
                  "gpu_id": 0,
                  "random_state": 1092,
                  "tree_method": "gpu_hist"}

    valid_pred_proba_list, models_tmp = [], []
    for i in range(y_val.shape[1]):
        xgb_clf = xgb.XGBClassifier(**xgb_params)
        xgb_clf.fit(X_train, y_train[:, i], eval_set=[(X_val, y_val[:, i])],
            early_stopping_rounds=100, verbose=False)

        valid_pred_proba_list.append(xgb_clf.predict_proba(
            X_val, ntree_limit=xgb_clf.best_iteration))
        models_tmp.append([xgb_clf, xgb_clf.best_iteration])

    valid_pred_proba = np.mean(valid_pred_proba_list, axis=0)[:, 1]

    #####################################################
    # STEP 4: 寻找最优valid的阈值
    best_custom, THRESHOLD = njit_search_best_thresold_custom(
            y_pred_proba=valid_pred_proba,
            dates_array=X_val_dates,
            weights_array=X_val_weight,
            resp_array=X_val_resp)

    #####################################################
    # STEP 5: valid data上按照官方metric进行结果评估
    valid_pred_label = np.where(
            valid_pred_proba>=THRESHOLD, 1, 0).astype(int)
    valid_custom_metric = custom_metric(dates_array=X_val_dates,
                                        weights_array=X_val_weight,
                                        action_label_array=valid_pred_label,
                                        resp_array=X_val_resp)
    valid_acc = accuracy_score(y_val[:, 0].reshape(-1, 1),
                               valid_pred_label.reshape(-1, 1))
    valid_roc_auc = roc_auc_score(y_val[:, 0].reshape(-1, 1),
                                  valid_pred_label.reshape(-1, 1))

    # 标准打印训练信息
    print("-- folds {}({})(train_range: {}->{}, valid_range: {}->{}), valid_acc: {:.4f}, valid_roc_auc: {:.4f}, valid_custom: {:.4f}".format(
            fold+1, N_SPLITS, min(X_train_dates), max(X_train_dates), min(X_val_dates), max(X_val_dates), valid_acc, valid_roc_auc, valid_custom_metric))

    #####################################################
    # STEP 4: 保存模型与关键训练指标
    MODELS.append(models_tmp)
    valid_acc_total.append(valid_acc)
    valid_roc_auc_total.append(valid_roc_auc)
    valid_custom_total.append(valid_custom_metric)

    # 强制内存回收
    del X_train, X_val, y_train, y_val, X_train_pca, X_val_pca
    del X_train_weight, X_val_weight, X_train_resp, X_val_resp, X_train_dates, X_val_dates
    gc.collect()

# 打印总体分数指标
print("-- total metric, valid_acc: {:.4f}, valid_roc_auc: {:.4f}, valid_custom: {:.4f}".format(
        np.mean(valid_acc_total), np.mean(valid_roc_auc_total), np.mean(valid_custom_total)))

print("=========================================")
print("[INFO] {} Model training end.".format(str(datetime.now())[:-4]))


In [19]:
valid_pred_proba

array([[0.497877  , 0.502123  ],
       [0.50357217, 0.49642783],
       [0.50979829, 0.49020171],
       ...,
       [0.39726229, 0.60273771],
       [0.46613065, 0.53386935],
       [0.55884169, 0.44115831]])