In [1]:
import os
import numpy as np
from alphagen.data.expression import *
from alphagen.utils import reseed_everything
from alphagen_generic.features import *
from gan.utils.data import get_data_by_year

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


# Utility functions

In [2]:
import pandas as pd
from tqdm import tqdm
def get_ml_data(data):
    df = data.df_bak.copy()
    df.columns = ['open','close','high','low','volume', "vwap"]
    close_unstack = df['close'].unstack()
    tmp = (close_unstack.shift(-20)/close_unstack)-1
    label = tmp.stack().reindex(df.index)
    df['label'] = label

    feature = df[['open','close','high','low','volume',]]
    tmp = feature.unstack()
    feature = (tmp/tmp.shift(1)-1)#.stack().reindex(df.index)

    result_feature = []
    cur = feature.stack().reindex(df.index)
    cur.columns = [f'{col}0' for col in cur.columns]
    for past in tqdm(range(1,60)):
        cur = feature.shift(past).stack().reindex(df.index)
        cur.columns = [f'{col}{past}' for col in cur.columns]
        result_feature.append(cur)
    result_feature = pd.concat(result_feature,axis=1)
    df = pd.concat([result_feature,df['label']],axis=1)
    start_date = data._dates[data.max_backtrack_days]
    end_date = data._dates[-data.max_future_days]
    return df.loc[start_date:end_date]

def normalize_data(df_train, df_valid, df_test):
    # Get the column names of the features
    labels = [df_train.iloc[:, -1], df_valid.iloc[:, -1], df_test.iloc[:, -1]]
    df_train_features = df_train.iloc[:, :-1]
    df_valid_features = df_valid.iloc[:, :-1]
    df_test_features = df_test.iloc[:, :-1]

    _mean = df_train_features.mean()
    _std = df_train_features.std()
    print('1')
    df_train_norm = (df_train_features - _mean) / _std
    print('2')
    df_valid_norm = (df_valid_features - _mean) / _std
    print('3')
    df_test_norm = (df_test_features - _mean) / _std

    df_train_norm.fillna(0, inplace=True)
    df_valid_norm.fillna(0, inplace=True)
    df_test_norm.fillna(0, inplace=True)


    df_train_norm['label'] = np.nan_to_num(labels[0],nan=0,posinf=0,neginf=0)
    df_valid_norm['label'] = np.nan_to_num(labels[1],nan=0,posinf=0,neginf=0)
    df_test_norm['label'] = np.nan_to_num(labels[2],nan=0,posinf=0,neginf=0)

    df_train_norm['label'] = df_train_norm['label'].groupby('datetime').transform(lambda x: (x - x.mean()) / x.std()).clip(-4, 4)

    df_train_norm = df_train_norm.clip(-4, 4)
    df_valid_norm = df_valid_norm.clip(-4, 4)
    df_test_norm = df_test_norm.clip(-4, 4)

    return df_train_norm, df_valid_norm, df_test_norm

# Train Lightgbm Model

In [3]:
import lightgbm as lgb
import pandas as pd

def train_lightgbm_model(df_train, df_valid, df_test):
    # Fill NaN values with 0
    df_train_filled = df_train.fillna(0)
    df_valid_filled = df_valid.fillna(0)
    df_test_filled = df_test.fillna(0)

    # Separate features and labels
    X_train = df_train_filled.drop(columns=['label'])
    y_train = df_train_filled['label']
    X_valid = df_valid_filled.drop(columns=['label'])
    y_valid = df_valid_filled['label']
    X_test = df_test_filled.drop(columns=['label'])
    y_test = df_test_filled['label']

    # Convert data to LightGBM Dataset format
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # Set hyperparameters for LightGBM model
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 210,
        'max_depth': 8,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # Train the LightGBM model
    # early_stopping_rounds=100,verbose_eval=100
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], num_boost_round=1000, callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)])

    # Evaluate the model on the test set
    y_pred = model.predict(X_test)
    
    pred = pd.concat([pd.Series(y_pred,index=df_test.index),df_test['label']],axis=1)
    # Print the RMSE score
    # rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # print(f"RMSE: {rmse}")

    return  model,pred

for instruments in ['csi300','csi500']:
    for train_end in range(2017,2024):
        print(f"instruments: {instruments}\ttrain_end: {train_end}")
        returned = get_data_by_year(
            train_start = 2011,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        data_all, data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        
        model_name = 'lgbm'
        name = f"{instruments}_{model_name}_{train_end}"
        os.makedirs(f"out_ml/{name}",exist_ok=True)
        model,pred = train_lightgbm_model(df_train, df_valid, df_test)
        model.save_model(f"out_ml/{name}/{model_name}.pt")
        pred.to_pickle(f"out_ml/{name}/pred.pkl")

instruments: csi300	train_end: 2017


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.741067	valid_1's l2: 0.0122695
Early stopping, best iteration is:
[3]	training's l2: 0.779305	valid_1's l2: 0.0105584
instruments: csi300	train_end: 2018


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.760025	valid_1's l2: 0.0185063
Early stopping, best iteration is:
[2]	training's l2: 0.797574	valid_1's l2: 0.0163969
instruments: csi300	train_end: 2019


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.777142	valid_1's l2: 0.0223606
Early stopping, best iteration is:
[1]	training's l2: 0.810226	valid_1's l2: 0.0200909
instruments: csi300	train_end: 2020


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.788864	valid_1's l2: 0.0173779
Early stopping, best iteration is:
[2]	training's l2: 0.819466	valid_1's l2: 0.0147085
instruments: csi300	train_end: 2021


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.802986	valid_1's l2: 0.0158242
Early stopping, best iteration is:
[1]	training's l2: 0.831788	valid_1's l2: 0.0142579
instruments: csi300	train_end: 2022


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.812747	valid_1's l2: 0.0106564
Early stopping, best iteration is:
[2]	training's l2: 0.840869	valid_1's l2: 0.00992056
instruments: csi300	train_end: 2023


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.817227	valid_1's l2: 0.0209532
Early stopping, best iteration is:
[1]	training's l2: 0.845569	valid_1's l2: 0.0168099
instruments: csi500	train_end: 2017


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.748911	valid_1's l2: 0.0142604
Early stopping, best iteration is:
[3]	training's l2: 0.775228	valid_1's l2: 0.0130173
instruments: csi500	train_end: 2018


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.766629	valid_1's l2: 0.0225579
Early stopping, best iteration is:
[1]	training's l2: 0.792938	valid_1's l2: 0.0204343
instruments: csi500	train_end: 2019


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.779203	valid_1's l2: 0.0243213
Early stopping, best iteration is:
[1]	training's l2: 0.802849	valid_1's l2: 0.0205674
instruments: csi500	train_end: 2020


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.78664	valid_1's l2: 0.0225143
Early stopping, best iteration is:
[1]	training's l2: 0.808708	valid_1's l2: 0.0191076
instruments: csi500	train_end: 2021


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.795769	valid_1's l2: 0.0165255
Early stopping, best iteration is:
[2]	training's l2: 0.816778	valid_1's l2: 0.0151133
instruments: csi500	train_end: 2022


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.802966	valid_1's l2: 0.010187
Early stopping, best iteration is:
[1]	training's l2: 0.823494	valid_1's l2: 0.009212
instruments: csi500	train_end: 2023


  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.809263	valid_1's l2: 0.0228549
Early stopping, best iteration is:
[1]	training's l2: 0.829571	valid_1's l2: 0.0196192


# Train Xgboost Model

In [4]:

import xgboost as xgb
import pandas as pd

def train_xgboost_model(df_train, df_valid, df_test):
    # Fill NaN values with 0
    df_train_filled = df_train.fillna(0)
    df_valid_filled = df_valid.fillna(0)
    df_test_filled = df_test.fillna(0)

    # Separate features and labels
    X_train = df_train_filled.drop(columns=['label'])
    y_train = df_train_filled['label']
    X_valid = df_valid_filled.drop(columns=['label'])
    y_valid = df_valid_filled['label']
    X_test = df_test_filled.drop(columns=['label'])
    y_test = df_test_filled['label']

    # Convert data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Set hyperparameters for XGBoost model
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'colsample_bytree': 0.8879,
        'learning_rate': 0.2,
        'subsample': 0.8789,
        'lambda': 205.6999,
        'alpha': 580.9768,
        'max_depth': 8,
        'num_boost_round': 1000,
        'early_stopping_rounds': 100,
        'verbose_eval': 100
    }

    # Train the XGBoost model
    model = xgb.train(params, dtrain, evals=[(dtrain, 'train'), (dvalid, 'valid')], early_stopping_rounds=params['early_stopping_rounds'], verbose_eval=params['verbose_eval'])

    # Convert test data to DMatrix format
    dtest = xgb.DMatrix(X_test)

    # Make predictions on the test set
    y_pred = model.predict(dtest)

    # Combine the predictions with the actual labels
    # pred = pd.concat([df_test['label'], pd.Series(y_pred, index=df_test.index)], axis=1)
    pred = pd.concat([pd.Series(y_pred,index=df_test.index),df_test['label']],axis=1)

    return model, pred



for instruments in ['csi300','csi500']:
    for train_end in range(2017,2024):
        returned = get_data_by_year(
            train_start = 2011,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        data_all, data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        
        model_name = 'xgb'
        name = f"{instruments}_{model_name}_{train_end}"
        os.makedirs(f"out_ml/{name}",exist_ok=True)
        model,pred = train_xgboost_model(df_train, df_valid, df_test)
        model.save_model(f"out_ml/{name}/{model_name}.pt")
        pred.to_pickle(f"out_ml/{name}/pred.pkl")

  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.88340	valid-rmse:0.10286
[9]	train-rmse:0.88208	valid-rmse:0.10404


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.89336	valid-rmse:0.12782
[9]	train-rmse:0.89209	valid-rmse:0.12867


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.90023	valid-rmse:0.14160
[9]	train-rmse:0.89917	valid-rmse:0.14238


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.90545	valid-rmse:0.12131
[9]	train-rmse:0.90430	valid-rmse:0.12237


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.91207	valid-rmse:0.11942
[9]	train-rmse:0.91107	valid-rmse:0.12028


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.91723	valid-rmse:0.09953
[9]	train-rmse:0.91608	valid-rmse:0.09972


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.91959	valid-rmse:0.12951
[9]	train-rmse:0.91838	valid-rmse:0.13090


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.88089	valid-rmse:0.11427
[9]	train-rmse:0.87964	valid-rmse:0.11512


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.89045	valid-rmse:0.14286
[9]	train-rmse:0.88917	valid-rmse:0.14361


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.89598	valid-rmse:0.14353
[9]	train-rmse:0.89481	valid-rmse:0.14568


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.89926	valid-rmse:0.13827
[9]	train-rmse:0.89812	valid-rmse:0.14040


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.90392	valid-rmse:0.12298
[9]	train-rmse:0.90282	valid-rmse:0.12427


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.90745	valid-rmse:0.09595
[9]	train-rmse:0.90639	valid-rmse:0.09709


  model.save_model(f"out_ml/{name}/{model_name}.pt")
  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reind

1
2
3


Parameters: { "early_stopping_rounds", "num_boost_round", "verbose_eval" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.91078	valid-rmse:0.14026
[9]	train-rmse:0.90972	valid-rmse:0.14338


  model.save_model(f"out_ml/{name}/{model_name}.pt")


# Train MLP Model

In [5]:
import torch

import torch.nn as nn
import torch.optim as optim

def train_mlp_model(df_train, df_valid, df_test):
    # Fill NaN values with 0
    df_train_filled = df_train.fillna(0)
    df_valid_filled = df_valid.fillna(0)
    df_test_filled = df_test.fillna(0)

    # Separate features and labels
    X_train = df_train_filled.drop(columns=['label']).values
    y_train = df_train_filled['label'].values
    X_valid = df_valid_filled.drop(columns=['label']).values
    y_valid = df_valid_filled['label'].values
    X_test = df_test_filled.drop(columns=['label']).values
    y_test = df_test_filled['label'].values

    # Convert data to tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_valid = torch.tensor(X_valid, dtype=torch.float32)
    y_valid = torch.tensor(y_valid, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    # Define the MLP model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 1)
    )

    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)


    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Train the model
    num_epochs = 10
    batch_size = 512
    for epoch in range(num_epochs):
        # Shuffle the training data
        indices = torch.randperm(X_train.shape[0])
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]

        # Mini-batch training
        for i in tqdm(range(0, X_train.shape[0], batch_size)):
            # Get the mini-batch
            X_batch = X_train_shuffled[i:i+batch_size]
            y_batch = y_train_shuffled[i:i+batch_size]

            # Move the mini-batch to GPU if available
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs.flatten(), y_batch.flatten())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate the model on the test set
    with torch.no_grad():
        # Move the test data to GPU if available
        test_outputs = model(X_test.to(device)).detach().cpu().numpy().flatten()
        pred_df = pd.concat([df_test['label'],pd.Series(test_outputs,index=df_test.index)],axis=1)
    torch.cuda.empty_cache()
    return model, pred_df

for instruments in ['csi300','csi500']:
    for train_end in range(2017,2024):
        returned = get_data_by_year(
            train_start = 2011,train_end=train_end,valid_year=train_end+1,test_year =train_end+2,
            instruments=instruments, target=target,freq='day',)
        data_all, data,data_valid,data_valid_withhead,data_test,data_test_withhead,_ = returned
        df_train = get_ml_data(data)
        df_valid = get_ml_data(data_valid)
        df_test = get_ml_data(data_test)
        df_train, df_valid, df_test = normalize_data(df_train, df_valid, df_test)
        for seed in range(5):
            reseed_everything(seed)
            model_name = 'mlp'
            name = f"{instruments}_{model_name}_{train_end}"
            os.makedirs(f"out_ml/{name}",exist_ok=True)
            model,pred = train_mlp_model(df_train, df_valid, df_test)
            # model.save_model(f"out_ml/{name}/model.pt")
            pred.to_pickle(f"out_ml/{name}/pred_{seed}.pkl")

  cur = feature.stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().reindex(df.index)
  cur = feature.shift(past).stack().rein

1
2
3


100%|██████████| 998/998 [00:03<00:00, 318.61it/s]
100%|██████████| 998/998 [00:03<00:00, 321.48it/s]
100%|██████████| 998/998 [00:03<00:00, 318.25it/s]
100%|██████████| 998/998 [00:03<00:00, 326.74it/s]
100%|██████████| 998/998 [00:03<00:00, 330.31it/s]
100%|██████████| 998/998 [00:03<00:00, 314.82it/s]
100%|██████████| 998/998 [00:02<00:00, 349.68it/s]
100%|██████████| 998/998 [00:02<00:00, 376.21it/s]
100%|██████████| 998/998 [00:02<00:00, 342.32it/s]
100%|██████████| 998/998 [00:03<00:00, 308.55it/s]
100%|██████████| 998/998 [00:03<00:00, 309.31it/s]
100%|██████████| 998/998 [00:02<00:00, 368.31it/s]
100%|██████████| 998/998 [00:02<00:00, 364.95it/s]
100%|██████████| 998/998 [00:02<00:00, 337.12it/s]
100%|██████████| 998/998 [00:02<00:00, 334.10it/s]
100%|██████████| 998/998 [00:02<00:00, 338.10it/s]
100%|██████████| 998/998 [00:02<00:00, 354.96it/s]
100%|██████████| 998/998 [00:02<00:00, 360.07it/s]
100%|██████████| 998/998 [00:02<00:00, 350.17it/s]
100%|██████████| 998/998 [00:02

1
2
3


100%|██████████| 1140/1140 [00:03<00:00, 287.50it/s]
100%|██████████| 1140/1140 [00:03<00:00, 348.53it/s]
100%|██████████| 1140/1140 [00:03<00:00, 344.02it/s]
100%|██████████| 1140/1140 [00:03<00:00, 340.00it/s]
100%|██████████| 1140/1140 [00:03<00:00, 341.52it/s]
100%|██████████| 1140/1140 [00:03<00:00, 357.43it/s]
100%|██████████| 1140/1140 [00:03<00:00, 360.84it/s]
100%|██████████| 1140/1140 [00:03<00:00, 340.93it/s]
100%|██████████| 1140/1140 [00:03<00:00, 343.91it/s]
100%|██████████| 1140/1140 [00:03<00:00, 359.32it/s]
100%|██████████| 1140/1140 [00:03<00:00, 321.94it/s]
100%|██████████| 1140/1140 [00:03<00:00, 350.22it/s]
100%|██████████| 1140/1140 [00:03<00:00, 336.86it/s]
100%|██████████| 1140/1140 [00:03<00:00, 345.86it/s]
100%|██████████| 1140/1140 [00:03<00:00, 337.39it/s]
100%|██████████| 1140/1140 [00:03<00:00, 344.86it/s]
100%|██████████| 1140/1140 [00:03<00:00, 330.69it/s]
100%|██████████| 1140/1140 [00:03<00:00, 287.73it/s]
100%|██████████| 1140/1140 [00:03<00:00, 287.4

1
2
3


100%|██████████| 1283/1283 [00:05<00:00, 229.61it/s]
100%|██████████| 1283/1283 [00:04<00:00, 287.35it/s]
100%|██████████| 1283/1283 [00:04<00:00, 317.41it/s]
100%|██████████| 1283/1283 [00:03<00:00, 332.89it/s]
100%|██████████| 1283/1283 [00:03<00:00, 348.68it/s]
100%|██████████| 1283/1283 [00:03<00:00, 336.97it/s]
100%|██████████| 1283/1283 [00:03<00:00, 329.08it/s]
100%|██████████| 1283/1283 [00:03<00:00, 346.15it/s]
100%|██████████| 1283/1283 [00:03<00:00, 337.22it/s]
100%|██████████| 1283/1283 [00:03<00:00, 352.71it/s]
100%|██████████| 1283/1283 [00:04<00:00, 312.36it/s]
100%|██████████| 1283/1283 [00:04<00:00, 308.47it/s]
100%|██████████| 1283/1283 [00:04<00:00, 278.90it/s]
100%|██████████| 1283/1283 [00:03<00:00, 340.16it/s]
100%|██████████| 1283/1283 [00:03<00:00, 375.68it/s]
100%|██████████| 1283/1283 [00:03<00:00, 361.74it/s]
100%|██████████| 1283/1283 [00:03<00:00, 322.33it/s]
100%|██████████| 1283/1283 [00:03<00:00, 341.20it/s]
100%|██████████| 1283/1283 [00:03<00:00, 387.3

1
2
3


100%|██████████| 1426/1426 [00:04<00:00, 313.59it/s]
100%|██████████| 1426/1426 [00:06<00:00, 233.20it/s]
100%|██████████| 1426/1426 [00:06<00:00, 213.57it/s]
100%|██████████| 1426/1426 [00:04<00:00, 305.58it/s]
100%|██████████| 1426/1426 [00:04<00:00, 321.27it/s]
100%|██████████| 1426/1426 [00:04<00:00, 351.37it/s]
100%|██████████| 1426/1426 [00:04<00:00, 306.49it/s]
100%|██████████| 1426/1426 [00:05<00:00, 272.44it/s]
100%|██████████| 1426/1426 [00:05<00:00, 267.59it/s]
100%|██████████| 1426/1426 [00:04<00:00, 296.17it/s]
100%|██████████| 1426/1426 [00:04<00:00, 335.70it/s]
100%|██████████| 1426/1426 [00:04<00:00, 343.38it/s]
100%|██████████| 1426/1426 [00:04<00:00, 351.27it/s]
100%|██████████| 1426/1426 [00:05<00:00, 270.14it/s]
100%|██████████| 1426/1426 [00:04<00:00, 294.52it/s]
100%|██████████| 1426/1426 [00:04<00:00, 300.34it/s]
100%|██████████| 1426/1426 [00:04<00:00, 326.12it/s]
100%|██████████| 1426/1426 [00:05<00:00, 273.21it/s]
100%|██████████| 1426/1426 [00:04<00:00, 333.8

1
2
3


100%|██████████| 1568/1568 [00:04<00:00, 321.05it/s]
100%|██████████| 1568/1568 [00:04<00:00, 322.50it/s]
100%|██████████| 1568/1568 [00:04<00:00, 354.18it/s]
100%|██████████| 1568/1568 [00:05<00:00, 299.51it/s]
100%|██████████| 1568/1568 [00:04<00:00, 360.44it/s]
100%|██████████| 1568/1568 [00:04<00:00, 340.46it/s]
100%|██████████| 1568/1568 [00:04<00:00, 349.45it/s]
100%|██████████| 1568/1568 [00:04<00:00, 372.13it/s]
100%|██████████| 1568/1568 [00:04<00:00, 362.19it/s]
100%|██████████| 1568/1568 [00:05<00:00, 288.00it/s]
100%|██████████| 1568/1568 [00:04<00:00, 319.49it/s]
100%|██████████| 1568/1568 [00:04<00:00, 319.85it/s]
100%|██████████| 1568/1568 [00:04<00:00, 323.35it/s]
100%|██████████| 1568/1568 [00:04<00:00, 332.91it/s]
100%|██████████| 1568/1568 [00:04<00:00, 331.54it/s]
100%|██████████| 1568/1568 [00:04<00:00, 339.20it/s]
100%|██████████| 1568/1568 [00:04<00:00, 353.89it/s]
100%|██████████| 1568/1568 [00:04<00:00, 341.50it/s]
100%|██████████| 1568/1568 [00:05<00:00, 275.2

1
2
3


100%|██████████| 1710/1710 [00:04<00:00, 361.06it/s]
100%|██████████| 1710/1710 [00:05<00:00, 338.21it/s]
100%|██████████| 1710/1710 [00:04<00:00, 350.98it/s]
100%|██████████| 1710/1710 [00:05<00:00, 308.04it/s]
100%|██████████| 1710/1710 [00:05<00:00, 302.38it/s]
100%|██████████| 1710/1710 [00:04<00:00, 353.28it/s]
100%|██████████| 1710/1710 [00:05<00:00, 332.30it/s]
100%|██████████| 1710/1710 [00:05<00:00, 337.85it/s]
100%|██████████| 1710/1710 [00:05<00:00, 336.28it/s]
100%|██████████| 1710/1710 [00:04<00:00, 342.55it/s]
100%|██████████| 1710/1710 [00:05<00:00, 338.47it/s]
100%|██████████| 1710/1710 [00:05<00:00, 341.21it/s]
100%|██████████| 1710/1710 [00:04<00:00, 347.28it/s]
100%|██████████| 1710/1710 [00:05<00:00, 338.52it/s]
100%|██████████| 1710/1710 [00:05<00:00, 336.10it/s]
100%|██████████| 1710/1710 [00:05<00:00, 334.62it/s]
100%|██████████| 1710/1710 [00:04<00:00, 398.41it/s]
100%|██████████| 1710/1710 [00:05<00:00, 336.91it/s]
100%|██████████| 1710/1710 [00:04<00:00, 366.1

1
2
3


100%|██████████| 1852/1852 [00:05<00:00, 348.02it/s]
100%|██████████| 1852/1852 [00:05<00:00, 322.99it/s]
100%|██████████| 1852/1852 [00:05<00:00, 352.82it/s]
100%|██████████| 1852/1852 [00:05<00:00, 337.19it/s]
100%|██████████| 1852/1852 [00:05<00:00, 328.36it/s]
100%|██████████| 1852/1852 [00:05<00:00, 358.65it/s]
100%|██████████| 1852/1852 [00:05<00:00, 345.39it/s]
100%|██████████| 1852/1852 [00:05<00:00, 349.47it/s]
100%|██████████| 1852/1852 [00:05<00:00, 356.72it/s]
100%|██████████| 1852/1852 [00:05<00:00, 324.33it/s]
100%|██████████| 1852/1852 [00:05<00:00, 353.50it/s]
100%|██████████| 1852/1852 [00:05<00:00, 351.75it/s]
100%|██████████| 1852/1852 [00:05<00:00, 337.93it/s]
100%|██████████| 1852/1852 [00:05<00:00, 309.51it/s]
100%|██████████| 1852/1852 [00:05<00:00, 331.62it/s]
100%|██████████| 1852/1852 [00:05<00:00, 355.07it/s]
100%|██████████| 1852/1852 [00:05<00:00, 343.28it/s]
100%|██████████| 1852/1852 [00:05<00:00, 329.92it/s]
100%|██████████| 1852/1852 [00:05<00:00, 340.1

1
2
3


100%|██████████| 1663/1663 [00:05<00:00, 299.28it/s]
100%|██████████| 1663/1663 [00:05<00:00, 292.92it/s]
100%|██████████| 1663/1663 [00:05<00:00, 322.11it/s]
100%|██████████| 1663/1663 [00:05<00:00, 330.23it/s]
100%|██████████| 1663/1663 [00:05<00:00, 309.18it/s]
100%|██████████| 1663/1663 [00:05<00:00, 306.50it/s]
100%|██████████| 1663/1663 [00:05<00:00, 297.41it/s]
100%|██████████| 1663/1663 [00:05<00:00, 324.41it/s]
100%|██████████| 1663/1663 [00:05<00:00, 308.71it/s]
100%|██████████| 1663/1663 [00:05<00:00, 319.25it/s]
100%|██████████| 1663/1663 [00:05<00:00, 304.34it/s]
100%|██████████| 1663/1663 [00:05<00:00, 311.45it/s]
100%|██████████| 1663/1663 [00:05<00:00, 319.60it/s]
100%|██████████| 1663/1663 [00:04<00:00, 348.52it/s]
100%|██████████| 1663/1663 [00:05<00:00, 305.05it/s]
100%|██████████| 1663/1663 [00:05<00:00, 331.87it/s]
100%|██████████| 1663/1663 [00:05<00:00, 313.05it/s]
100%|██████████| 1663/1663 [00:05<00:00, 301.92it/s]
100%|██████████| 1663/1663 [00:05<00:00, 322.9

1
2
3


100%|██████████| 1901/1901 [00:06<00:00, 277.25it/s]
100%|██████████| 1901/1901 [00:07<00:00, 255.83it/s]
100%|██████████| 1901/1901 [00:06<00:00, 280.46it/s]
100%|██████████| 1901/1901 [00:06<00:00, 291.89it/s]
100%|██████████| 1901/1901 [00:06<00:00, 309.44it/s]
100%|██████████| 1901/1901 [00:06<00:00, 306.86it/s]
100%|██████████| 1901/1901 [00:06<00:00, 275.38it/s]
100%|██████████| 1901/1901 [00:07<00:00, 269.42it/s]
100%|██████████| 1901/1901 [00:07<00:00, 261.45it/s]
100%|██████████| 1901/1901 [00:06<00:00, 283.58it/s]
100%|██████████| 1901/1901 [00:06<00:00, 294.88it/s]
100%|██████████| 1901/1901 [00:06<00:00, 291.21it/s]
100%|██████████| 1901/1901 [00:06<00:00, 290.08it/s]
100%|██████████| 1901/1901 [00:06<00:00, 271.73it/s]
100%|██████████| 1901/1901 [00:06<00:00, 277.44it/s]
100%|██████████| 1901/1901 [00:06<00:00, 296.03it/s]
100%|██████████| 1901/1901 [00:06<00:00, 305.67it/s]
100%|██████████| 1901/1901 [00:06<00:00, 294.76it/s]
100%|██████████| 1901/1901 [00:06<00:00, 307.6

1
2
3


100%|██████████| 2139/2139 [00:06<00:00, 309.34it/s]
100%|██████████| 2139/2139 [00:07<00:00, 286.49it/s]
100%|██████████| 2139/2139 [00:08<00:00, 260.68it/s]
100%|██████████| 2139/2139 [00:07<00:00, 280.90it/s]
100%|██████████| 2139/2139 [00:07<00:00, 270.84it/s]
100%|██████████| 2139/2139 [00:07<00:00, 297.11it/s]
100%|██████████| 2139/2139 [00:06<00:00, 305.84it/s]
100%|██████████| 2139/2139 [00:07<00:00, 297.34it/s]
100%|██████████| 2139/2139 [00:07<00:00, 270.52it/s]
100%|██████████| 2139/2139 [00:07<00:00, 296.58it/s]
100%|██████████| 2139/2139 [00:08<00:00, 248.85it/s]
100%|██████████| 2139/2139 [00:07<00:00, 290.38it/s]
100%|██████████| 2139/2139 [00:07<00:00, 288.52it/s]
100%|██████████| 2139/2139 [00:07<00:00, 273.35it/s]
100%|██████████| 2139/2139 [00:07<00:00, 290.39it/s]
100%|██████████| 2139/2139 [00:07<00:00, 299.37it/s]
100%|██████████| 2139/2139 [00:08<00:00, 256.20it/s]
100%|██████████| 2139/2139 [00:06<00:00, 307.45it/s]
100%|██████████| 2139/2139 [00:07<00:00, 268.3

1
2
3


100%|██████████| 2376/2376 [00:09<00:00, 262.77it/s]
100%|██████████| 2376/2376 [00:09<00:00, 261.25it/s]
100%|██████████| 2376/2376 [00:10<00:00, 235.39it/s]
100%|██████████| 2376/2376 [00:08<00:00, 276.79it/s]
100%|██████████| 2376/2376 [00:09<00:00, 250.34it/s]
100%|██████████| 2376/2376 [00:08<00:00, 277.91it/s]
100%|██████████| 2376/2376 [00:09<00:00, 251.99it/s]
100%|██████████| 2376/2376 [00:09<00:00, 242.92it/s]
100%|██████████| 2376/2376 [00:08<00:00, 273.53it/s]
100%|██████████| 2376/2376 [00:09<00:00, 261.01it/s]
100%|██████████| 2376/2376 [00:09<00:00, 243.91it/s]
100%|██████████| 2376/2376 [00:08<00:00, 289.81it/s]
100%|██████████| 2376/2376 [00:09<00:00, 240.72it/s]
100%|██████████| 2376/2376 [00:09<00:00, 263.98it/s]
100%|██████████| 2376/2376 [00:08<00:00, 280.88it/s]
100%|██████████| 2376/2376 [00:09<00:00, 257.73it/s]
100%|██████████| 2376/2376 [00:09<00:00, 259.77it/s]
100%|██████████| 2376/2376 [00:09<00:00, 239.96it/s]
100%|██████████| 2376/2376 [00:09<00:00, 237.6

1
2
3


100%|██████████| 2613/2613 [00:10<00:00, 252.12it/s]
100%|██████████| 2613/2613 [00:10<00:00, 257.86it/s]
100%|██████████| 2613/2613 [00:10<00:00, 260.05it/s]
100%|██████████| 2613/2613 [00:10<00:00, 241.52it/s]
100%|██████████| 2613/2613 [00:10<00:00, 249.18it/s]
100%|██████████| 2613/2613 [00:10<00:00, 247.62it/s]
100%|██████████| 2613/2613 [00:10<00:00, 257.89it/s]
100%|██████████| 2613/2613 [00:09<00:00, 271.38it/s]
100%|██████████| 2613/2613 [00:09<00:00, 266.97it/s]
100%|██████████| 2613/2613 [00:10<00:00, 253.72it/s]
100%|██████████| 2613/2613 [00:10<00:00, 254.61it/s]
100%|██████████| 2613/2613 [00:09<00:00, 271.87it/s]
100%|██████████| 2613/2613 [00:10<00:00, 249.69it/s]
100%|██████████| 2613/2613 [00:11<00:00, 234.19it/s]
100%|██████████| 2613/2613 [00:11<00:00, 236.49it/s]
100%|██████████| 2613/2613 [00:10<00:00, 256.74it/s]
100%|██████████| 2613/2613 [00:10<00:00, 257.97it/s]
100%|██████████| 2613/2613 [00:10<00:00, 260.97it/s]
100%|██████████| 2613/2613 [00:09<00:00, 269.0

1
2
3


100%|██████████| 2849/2849 [00:09<00:00, 297.15it/s]
100%|██████████| 2849/2849 [00:09<00:00, 306.70it/s]
100%|██████████| 2849/2849 [00:10<00:00, 272.93it/s]
100%|██████████| 2849/2849 [00:11<00:00, 246.25it/s]
100%|██████████| 2849/2849 [00:12<00:00, 233.01it/s]
100%|██████████| 2849/2849 [00:09<00:00, 303.59it/s]
100%|██████████| 2849/2849 [00:10<00:00, 268.56it/s]
100%|██████████| 2849/2849 [00:11<00:00, 254.82it/s]
100%|██████████| 2849/2849 [00:10<00:00, 269.67it/s]
100%|██████████| 2849/2849 [00:10<00:00, 268.90it/s]
100%|██████████| 2849/2849 [00:10<00:00, 278.88it/s]
100%|██████████| 2849/2849 [00:11<00:00, 254.75it/s]
100%|██████████| 2849/2849 [00:11<00:00, 241.30it/s]
100%|██████████| 2849/2849 [00:09<00:00, 287.18it/s]
100%|██████████| 2849/2849 [00:10<00:00, 280.95it/s]
100%|██████████| 2849/2849 [00:10<00:00, 274.83it/s]
100%|██████████| 2849/2849 [00:10<00:00, 277.06it/s]
100%|██████████| 2849/2849 [00:11<00:00, 258.20it/s]
100%|██████████| 2849/2849 [00:10<00:00, 261.3

1
2
3


100%|██████████| 3086/3086 [00:11<00:00, 272.55it/s]
100%|██████████| 3086/3086 [00:10<00:00, 302.54it/s]
100%|██████████| 3086/3086 [00:10<00:00, 289.79it/s]
100%|██████████| 3086/3086 [00:11<00:00, 259.98it/s]
100%|██████████| 3086/3086 [00:12<00:00, 252.94it/s]
100%|██████████| 3086/3086 [00:10<00:00, 285.74it/s]
100%|██████████| 3086/3086 [00:10<00:00, 282.38it/s]
100%|██████████| 3086/3086 [00:11<00:00, 269.19it/s]
100%|██████████| 3086/3086 [00:12<00:00, 253.24it/s]
100%|██████████| 3086/3086 [00:11<00:00, 275.91it/s]
100%|██████████| 3086/3086 [00:11<00:00, 259.30it/s]
100%|██████████| 3086/3086 [00:11<00:00, 264.87it/s]
100%|██████████| 3086/3086 [00:10<00:00, 300.39it/s]
100%|██████████| 3086/3086 [00:11<00:00, 267.57it/s]
100%|██████████| 3086/3086 [00:11<00:00, 258.35it/s]
100%|██████████| 3086/3086 [00:10<00:00, 294.93it/s]
100%|██████████| 3086/3086 [00:09<00:00, 309.85it/s]
100%|██████████| 3086/3086 [00:10<00:00, 281.66it/s]
100%|██████████| 3086/3086 [00:11<00:00, 267.3

# Show LightGbm Result

In [1]:
from alphagen.utils.correlation import batch_pearsonr, batch_spearmanr, batch_ret, batch_sharpe_ratio, batch_max_drawdown
import torch
import os
import numpy as np

def chunk_batch_spearmanr(x, y, chunk_size=100):
    n_days = len(x)
    spearmanr_list= []
    for i in range(0, n_days, chunk_size):
        spearmanr_list.append(batch_spearmanr(x[i:i+chunk_size], y[i:i+chunk_size]))
    spearmanr_list = torch.cat(spearmanr_list, dim=0)
    return spearmanr_list

def get_tensor_metrics(x, y, risk_free_rate=0.0):
    # Ensure tensors are 2D (days, stocks)
    if x.dim() > 2: x = x.squeeze(-1)
    if y.dim() > 2: y = y.squeeze(-1)

    ic_s = batch_pearsonr(x, y)
    ric_s = chunk_batch_spearmanr(x, y, chunk_size=400)
    ret_s = batch_ret(x, y) - 0.001

    ic_s = torch.nan_to_num(ic_s, nan=0.)
    ric_s = torch.nan_to_num(ric_s, nan=0.)
    ret_s = torch.nan_to_num(ret_s, nan=0.) / 20
    ic_s_mean = ic_s.mean().item()
    ic_s_std = ic_s.std().item() if ic_s.std().item() > 1e-6 else 1.0
    ric_s_mean = ric_s.mean().item()
    ric_s_std = ric_s.std().item() if ric_s.std().item() > 1e-6 else 1.0
    ret_s_mean = ret_s.mean().item()
    ret_s_std = ret_s.std().item() if ret_s.std().item() > 1e-6 else 1.0
    
    # Calculate Sharpe Ratio and Maximum Drawdown for ret series
    ret_sharpe = batch_sharpe_ratio(ret_s, risk_free_rate).item()
    ret_mdd = batch_max_drawdown(ret_s).item()
    result = dict(
        ic=ic_s_mean,
        ic_std=ic_s_std,
        icir=ic_s_mean / ic_s_std,
        ric=ric_s_mean,
        ric_std=ric_s_std,
        ricir=ric_s_mean / ric_s_std,
        ret=ret_s_mean * len(ret_s) / 3,
        ret_std=ret_s_std,
        retir=ret_s_mean / ret_s_std,
        ret_sharpe=ret_sharpe,
        ret_mdd=ret_mdd,
    )
    return result, ret_s


In [3]:
import pandas as pd
instruments = 'csi300'

result = []
for year in range(2021,2024):
    result.append(pd.read_pickle(f'out_ml/{instruments}_lgbm_{year}/pred.pkl'))

# def get_final_metrics(df):
#     ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#     ic_s = ic_raw.mean(axis=0)
#     ic_s_std = ic_raw.std(axis=0)
#     icir_s = ic_s/ic_s_std

#     ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#     ric_s = ric_raw.mean(axis=0)
#     ric_s_std = ric_raw.std(axis=0)
#     ricir_s = ric_s/ric_s_std
#     return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}

# print('LightGBM Result:\n')
# print(get_final_metrics(df))
df = pd.concat(result,axis=0)
data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
pred = data[0].values
label = data["label"].values
res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
print(pd.DataFrame(res,index=['Test']))
save_path = os.path.join(f'out_ml/{instruments}_lgbm_{year}', 'ret_s.npy')
np.save(save_path, ret_s)

            ic    ic_std     icir       ric   ric_std     ricir      ret  \
Test  0.007858  0.101905  0.07711  0.010839  0.116737  0.092846 -0.60269   

      ret_std     retir  ret_sharpe   ret_mdd  
Test  0.25084 -0.010569   -0.167778  4.409597  


In [5]:
import pandas as pd
instruments = 'csi500'

result = []
for year in range(2021,2024):
    result.append(pd.read_pickle(f'out_ml/{instruments}_lgbm_{year}/pred.pkl'))

# def get_final_metrics(df):
#     ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#     ic_s = ic_raw.mean(axis=0)
#     ic_s_std = ic_raw.std(axis=0)
#     icir_s = ic_s/ic_s_std

#     ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#     ric_s = ric_raw.mean(axis=0)
#     ric_s_std = ric_raw.std(axis=0)
#     ricir_s = ric_s/ric_s_std
#     return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}

# print('LightGBM Result:\n')
# print(get_final_metrics(df))
df = pd.concat(result,axis=0)
data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
pred = data[0].values
label = data["label"].values
res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
print(pd.DataFrame(res,index=['Test']))
save_path = os.path.join(f'out_ml/{instruments}_lgbm_{year}', 'ret_s.npy')
np.save(save_path, ret_s)

            ic    ic_std      icir       ric   ric_std    ricir       ret  \
Test  0.017567  0.095897  0.183183  0.031426  0.101436  0.30981  1.115041   

       ret_std     retir  ret_sharpe   ret_mdd  
Test  0.250061  0.019615    0.311374  2.001124  


# Show XGBoost Result

In [6]:
import pandas as pd
instruments = 'csi300'

result = []
for year in range(2021,2024):
    result.append(pd.read_pickle(f'out_ml/{instruments}_xgb_{year}/pred.pkl'))
# df = pd.concat(result,0)

# def get_final_metrics(df):
#     ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#     ic_s = ic_raw.mean(axis=0)
#     ic_s_std = ic_raw.std(axis=0)
#     icir_s = ic_s/ic_s_std

#     ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#     ric_s = ric_raw.mean(axis=0)
#     ric_s_std = ric_raw.std(axis=0)
#     ricir_s = ric_s/ric_s_std
#     return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}

# print('XGBoost Result:\n')
# print(get_final_metrics(df))
df = pd.concat(result,axis=0)

data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
pred = data[0].values
label = data["label"].values
res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
print(pd.DataFrame(res,index=['Test']))
save_path = os.path.join(f'out_ml/{instruments}_xgb_{year}', 'ret_s.npy')
np.save(save_path, ret_s)

            ic    ic_std      icir       ric   ric_std     ricir       ret  \
Test  0.032469  0.154367  0.210337  0.054208  0.155485  0.348636  1.230674   

       ret_std     retir  ret_sharpe   ret_mdd  
Test  0.046732  0.115843     1.83895  0.557618  


In [7]:
import pandas as pd
instruments = 'csi500'

result = []
for year in range(2021,2024):
    result.append(pd.read_pickle(f'out_ml/{instruments}_xgb_{year}/pred.pkl'))
# df = pd.concat(result,0)

# def get_final_metrics(df):
#     ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#     ic_s = ic_raw.mean(axis=0)
#     ic_s_std = ic_raw.std(axis=0)
#     icir_s = ic_s/ic_s_std

#     ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#     ric_s = ric_raw.mean(axis=0)
#     ric_s_std = ric_raw.std(axis=0)
#     ricir_s = ric_s/ric_s_std
#     return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}

# print('XGBoost Result:\n')
# print(get_final_metrics(df))
df = pd.concat(result,axis=0)

data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
pred = data[0].values
label = data["label"].values
res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
print(pd.DataFrame(res,index=['Test']))
save_path = os.path.join(f'out_ml/{instruments}_xgb_{year}', 'ret_s.npy')
np.save(save_path, ret_s)

            ic    ic_std      icir       ric   ric_std     ricir      ret  \
Test  0.031357  0.128164  0.244662  0.056449  0.130752  0.431721  0.70863   

       ret_std     retir  ret_sharpe   ret_mdd  
Test  0.036338  0.085782    1.361755  0.599928  


# Show MLP Result

In [8]:
import pandas as pd
instruments = 'csi300'

result_all = []
for seed in range(1):
    result = []
    for year in range(2021,2024):
        result.append(pd.read_pickle(f'out_ml/{instruments}_mlp_{year}/pred_{seed}.pkl'))
    df = pd.concat(result,axis=0)#.groupby('datetime').corr('spearman')['label'].unstack().mean()
    data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
    pred = data[0].values
    label = data["label"].values
    res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
    print(pd.DataFrame(res,index=['Test']))
    save_path = os.path.join(f'out_ml/{instruments}_mlp_{year}', 'ret_s.npy')
    np.save(save_path, ret_s)
#     df.columns = ['pred','label']
#     df=df[['pred','label']]



#     def get_final_metrics(df):
#         ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#         ic_s = ic_raw.mean(axis=0)
#         ic_s_std = ic_raw.std(axis=0)
#         icir_s = ic_s/ic_s_std

#         ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#         ric_s = ric_raw.mean(axis=0)
#         ric_s_std = ric_raw.std(axis=0)
#         ricir_s = ric_s/ric_s_std
#         return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}
#     tmp = get_final_metrics(df)
#     result_all.append(tmp)
# print('MLP Result:\n')
# print(result_all)

            ic    ic_std      icir       ric   ric_std    ricir       ret  \
Test  0.020873  0.136993  0.152364  0.035296  0.131846  0.26771  0.060805   

       ret_std     retir  ret_sharpe   ret_mdd  
Test  0.005112  0.052318     0.83053  0.319244  


In [9]:
import pandas as pd
instruments = 'csi500'

result_all = []
for seed in range(1):
    result = []
    for year in range(2021,2024):
        result.append(pd.read_pickle(f'out_ml/{instruments}_mlp_{year}/pred_{seed}.pkl'))
    df = pd.concat(result,axis=0)#.groupby('datetime').corr('spearman')['label'].unstack().mean()
    data = df.pivot_table(index="datetime", columns="instrument", values=[0,"label"])
    pred = data[0].values
    label = data["label"].values
    res, ret_s = get_tensor_metrics(torch.tensor(pred), torch.tensor(label))
    print(pd.DataFrame(res,index=['Test']))
    save_path = os.path.join(f'out_ml/{instruments}_mlp_{year}', 'ret_s.npy')
    np.save(save_path, ret_s)
#     df.columns = ['pred','label']
#     df=df[['pred','label']]



#     def get_final_metrics(df):
#         ic_raw = df.groupby('datetime').corr()['label'].unstack().iloc[:,:-1]
#         ic_s = ic_raw.mean(axis=0)
#         ic_s_std = ic_raw.std(axis=0)
#         icir_s = ic_s/ic_s_std

#         ric_raw = df.groupby('datetime').corr('spearman')['label'].unstack().iloc[:,:-1]
#         ric_s = ric_raw.mean(axis=0)
#         ric_s_std = ric_raw.std(axis=0)
#         ricir_s = ric_s/ric_s_std
#         return {'IC':ic_s.mean(),'ICIR':icir_s.mean(),'RankIC':ric_s.mean(),'RankICIR':ricir_s.mean()}
#     tmp = get_final_metrics(df)
#     result_all.append(tmp)
# print('MLP Result:\n')
# print(result_all)

            ic    ic_std      icir       ric  ric_std     ricir       ret  \
Test  0.026362  0.096785  0.272374  0.042072  0.09928  0.423769  0.166096   

       ret_std     retir  ret_sharpe   ret_mdd  
Test  0.004958  0.147349    2.339087  0.132326  
