In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
import optuna

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CONFIG:
    seed = 42
    target_col = "responder_6"
    feature_cols = ["symbol_id", "time_id"] \
        + [f"feature_{idx:02d}" for idx in range(79)] \
        + [f"responder_{idx}_lag_1" for idx in range(9)]
    categorical_cols = []

In [3]:
train = pl.scan_parquet("./create-lags/training_data.parquet").collect().to_pandas()
valid = pl.scan_parquet("./create-lags/validation_data.parquet").collect().to_pandas()
train.shape, valid.shape

((27369232, 104), (1417152, 104))

In [None]:
#train = pd.concat([train, valid]).reset_index(drop=True)
#train.shape

In [None]:
# def get_model(seed):
#     # XGBoost parameters
#     XGB_Params = {
#         'learning_rate': 0.05,
#         'max_depth': 6,
#         'n_estimators': 200,
#         'subsample': 0.8,
#         'colsample_bytree': 0.8,
#         'reg_alpha': 1,
#         'reg_lambda': 5,
#         'random_state': seed,
#         'tree_method': 'hist',
#         'device' : 'cpu',
#         'n_jobs': 4
#     }
    
#     XGB_Model = XGBRegressor(**XGB_Params)
#     return XGB_Model

In [4]:
X_train = train[ CONFIG.feature_cols ]
y_train = train[ CONFIG.target_col ]
w_train = train[ "weight" ]
X_valid = valid[ CONFIG.feature_cols ]
y_valid = valid[ CONFIG.target_col ]
w_valid = valid[ "weight" ]

X_train.shape, y_train.shape, w_train.shape, X_valid.shape, y_valid.shape, w_valid.shape

((27369232, 90),
 (27369232,),
 (27369232,),
 (1417152, 90),
 (1417152,),
 (1417152,))

In [5]:
def weighted_r2_score(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    return r2

In [6]:
def custom_metric(y_true, y_pred):
    """
    自定义评估函数（metric），使用加权 R²
    """
    # 获取权重，这里假设每个样本的权重存在全局变量中
    sample_weight = np.ones_like(y_true)  # 如果没有权重时，默认为1
    return 'weighted_r2', weighted_r2_score(y_true, y_pred, sample_weight), True  

In [8]:
def objective_holdout(trial):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_threads': 4,
        'metric': None,
        'verbose': -1,
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'num_leaves': trial.suggest_int('num_leaves', 8, 50),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.4),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction',  0.4, 1.0),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.1, 2.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 2.0)
    }
    print('\n====== Train model: Holdout ====')
    print('Best parameters is: ', params)
    lgb_model = lgb.LGBMRegressor(**params)
    stop_early = lgb.early_stopping(10, verbose=-1)
    lgb_model.fit(X_train, y_train, sample_weight=w_train, eval_set=[(X_valid, y_valid)], eval_metric=custom_metric, callbacks=[stop_early])
    y_pred = lgb_model.predict(X_valid)
    r2_score = weighted_r2_score(y_valid, y_pred, w_valid)
    return r2_score


In [9]:
def optuna_tune():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_holdout(trial), n_trials=2)
    best_params = study.best_params
    print('Best parameters found: ', best_params)
    with open('./best_params.txt', 'w') as f:
        f.write(str(best_params))
    return best_params

In [10]:
best_params = optuna_tune()

[I 2024-12-09 10:01:52,762] A new study created in memory with name: no-name-54a07156-983f-41e0-95d2-9fb448e18964



Best parameters is:  {'boosting_type': 'gbdt', 'objective': 'regression', 'num_threads': 4, 'metric': None, 'verbose': -1, 'n_estimators': 114, 'num_leaves': 12, 'min_data_in_leaf': 172, 'min_child_weight': 99, 'max_depth': 4, 'learning_rate': 0.09255109733189183, 'feature_fraction': 0.8131525819460832, 'bagging_fraction': 0.5788269482596342, 'lambda_l1': 0.17974087201081015, 'lambda_l2': 0.29039094200384674}
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[114]	valid_0's l2: 0.60206	valid_0's weighted_r2: 0.00768118


[I 2024-12-09 10:03:59,527] Trial 0 finished with value: 0.0065740720818666665 and parameters: {'n_estimators': 114, 'num_leaves': 12, 'min_data_in_leaf': 172, 'min_child_weight': 99, 'max_depth': 4, 'learning_rate': 0.09255109733189183, 'feature_fraction': 0.8131525819460832, 'bagging_fraction': 0.5788269482596342, 'lambda_l1': 0.17974087201081015, 'lambda_l2': 0.29039094200384674}. Best is trial 0 with value: 0.0065740720818666665.



Best parameters is:  {'boosting_type': 'gbdt', 'objective': 'regression', 'num_threads': 4, 'metric': None, 'verbose': -1, 'n_estimators': 116, 'num_leaves': 45, 'min_data_in_leaf': 494, 'min_child_weight': 99, 'max_depth': 5, 'learning_rate': 0.077438657214976, 'feature_fraction': 0.5592688081070811, 'bagging_fraction': 0.49326870771218106, 'lambda_l1': 0.14377341226327067, 'lambda_l2': 0.3942545999840934}
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[113]	valid_0's l2: 0.601771	valid_0's weighted_r2: 0.00815733


[I 2024-12-09 10:05:52,741] Trial 1 finished with value: 0.007352571157980381 and parameters: {'n_estimators': 116, 'num_leaves': 45, 'min_data_in_leaf': 494, 'min_child_weight': 99, 'max_depth': 5, 'learning_rate': 0.077438657214976, 'feature_fraction': 0.5592688081070811, 'bagging_fraction': 0.49326870771218106, 'lambda_l1': 0.14377341226327067, 'lambda_l2': 0.3942545999840934}. Best is trial 1 with value: 0.007352571157980381.


Best parameters found:  {'n_estimators': 116, 'num_leaves': 45, 'min_data_in_leaf': 494, 'min_child_weight': 99, 'max_depth': 5, 'learning_rate': 0.077438657214976, 'feature_fraction': 0.5592688081070811, 'bagging_fraction': 0.49326870771218106, 'lambda_l1': 0.14377341226327067, 'lambda_l2': 0.3942545999840934}


In [11]:
%%time
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train, y_train, sample_weight=w_train)

CPU times: user 15min 24s, sys: 4 s, total: 15min 28s
Wall time: 59.6 s


In [12]:
feature_importance = model.feature_importances_
print("Split Gain Feature importances:", feature_importance)
feature_importance = pd.DataFrame({'feature': CONFIG.feature_cols, 'importance': feature_importance})
feature_importance = feature_importance.sort_values('importance', ascending=False)
plt.figure(figsize=(10, 20))
plt.barh(feature_importance['feature'], feature_importance['importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # 反转y轴以显示最重要的特征在上方
plt.savefig('./feature_importance_split_gain.png', dpi=300, bbox_inches='tight')
plt.close()

Split Gain Feature importances: [  6 169  20  95  33  16  97 100 147 164 114  14   9  10  21  22  30  39
  28  51   6  27  25  39  39  29  45  29  21  25  13  36  63  37  14  59
  26  22 108  42  77  35  21   8  32  20  21  38  12  57  35  31  36  27
  74  36  19  35  47  17  94 105  74  36  10   2  13  17  26  14  36  43
  27   9  48  12  30  37  60  35  34  24  20  12  23  21  11  10  19  16]


In [13]:
def calculate_null_importance(X, y, w, model, metric=weighted_r2_score):
    """计算 Null Importance"""
    baseline_metric = metric(y, model.predict(X, verbose=-1), w) # 计算原始数据的基准性能
    importances = []
    
    # 遍历每个特征
    for feature in X.columns:
        X_permuted = X.copy()
        X_permuted[feature] = np.random.permutation(X_permuted[feature])  # 随机化该特征
        permuted_metric = metric(y, model.predict(X_permuted,verbose=-1), w)  # 计算随机化后的性能
        importances.append(baseline_metric - permuted_metric)  # 计算性能下降值

    return importances

In [14]:
null_importance = calculate_null_importance(X_train, y_train, w_train, model)
print("Null Importance Feature importances:", null_importance)
null_importance_df = pd.DataFrame({'feature': CONFIG.feature_cols, 'null importance': null_importance})
null_importance_df = null_importance_df.sort_values('null importance', ascending=False)
plt.figure(figsize=(10, 20))
plt.barh(null_importance_df['feature'], null_importance_df['null importance'])
plt.xlabel('Null Importance')
plt.title('Feature Importance Based on Null Importance')
plt.gca().invert_yaxis()  # 反转y轴以显示最重要的特征在上方
plt.savefig('./feature_importance_null_importance.png', dpi=300, bbox_inches='tight')
plt.close()

In [None]:
y_pred_train1 = model.predict(X_train.iloc[:X_train.shape[0]//2])
y_pred_train2 = model.predict(X_train.iloc[X_train.shape[0]//2:])
train_score = r2_score(y_train, np.concatenate([y_pred_train1, y_pred_train2], axis=0), sample_weight=w_train )
train_score

In [None]:
y_pred_valid = model.predict(X_valid)
valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
valid_score

In [None]:
y_means = { symbol_id : -1 for symbol_id in range(39) }
for symbol_id, gdf in train[["symbol_id", CONFIG.target_col]].groupby("symbol_id"):
    y_mean = gdf[ CONFIG.target_col ].mean()
    y_means[symbol_id] = y_mean
    print(f"symbol_id = {symbol_id}, y_means = {y_mean:.5f}")

In [None]:
cv_detail = { symbol_id : 0 for symbol_id in range(39) }
for symbol_id, gdf in valid.groupby("symbol_id"):
    X_valid = gdf[ CONFIG.feature_cols ]
    y_valid = gdf[ CONFIG.target_col ]
    w_valid = gdf[ "weight" ]
    y_pred_valid = model.predict(X_valid)
    score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
    cv_detail[symbol_id] = score
    
    print(f"symbol_id = {symbol_id}, score = {score:.5f}")

In [None]:
sids = list(cv_detail.keys())
plt.bar(sids, [cv_detail[sid] for sid in sids])
plt.grid()
plt.xlabel("symbol_id")
plt.ylabel("CV score")
plt.show()

In [None]:
result = {
    "model" : model,
    "cv" : valid_score,
    "cv_detail" : cv_detail,
    "y_mean" : y_means,
}
with open("result.pkl", "wb") as fp:
    pickle.dump(result, fp)