In [1]:
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('factor_pass9.csv', index_col=0)
data['continuous_count'] = data.groupby((data['evebit'] != data['evebit'].shift(1)).cumsum()).cumcount() + 1
data = data.reindex(columns=data.columns.tolist()[:-2] + ['continuous_count', 'return'])
data.dropna(inplace=True)

insample_data = data.loc[data.loc[:, 'date'] < "2022-01-01", :]
insample_data_sorted = insample_data.sort_values('date')
insample_data_sorted['rank'] = insample_data_sorted['date'].rank()
sum_of_weight = (insample_data_sorted['rank']).sum()
insample_data_sorted['weight'] = insample_data_sorted['rank'] / sum_of_weight
outdsample_data = data.loc[data.loc[:, 'date'] >= "2022-01-01", :]

X = insample_data_sorted.iloc[:, 3:-3].astype(float)
y = insample_data_sorted.iloc[:, -3].astype(float)
other_info_outsample_test = outdsample_data.iloc[:, :3]
X_outsample_test = outdsample_data.iloc[:, 3:-1].astype(float)
y_outsample_test = outdsample_data.iloc[:, -1].astype(float)

def objective_lgb(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log = True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log = True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'random_state': 42
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    avg_rmse = 0.0
    
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        weight_train, weight_val = insample_data_sorted.iloc[train_idx, -1], insample_data_sorted.iloc[val_idx, -1]
    
        lgb_train = lgb.Dataset(X_train, label=y_train, weight=weight_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, weight=weight_val)
        
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val]
        )
        
        preds = model.predict(X_val, num_iteration=model.best_iteration)
        rmse = mean_squared_error(y_val, preds, squared=False)
        avg_rmse += rmse / 5  # Average RMSE over folds
        
    return avg_rmse

def objective_xgb(trial):
    params = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'random_state': 42,
        'baggling_fraction': trial.suggest_float('bagging_fraction', 0.2, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    avg_rmse = 0.0
    
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        weight_train, weight_val = insample_data_sorted.iloc[train_idx, -2], insample_data_sorted.iloc[val_idx, -2]
        
        dtrain = xgb.DMatrix(X_train, label=y_train, weight = weight_train)
        dval = xgb.DMatrix(X_val, label=y_val, weight = weight_val)
        
        model = xgb.train(params, dtrain, evals=[(dtrain, 'train'), (dval, 'eval')], verbose_eval=False, early_stopping_rounds=50)
        preds = model.predict(dval)
        rmse = mean_squared_error(y_val, preds, squared=False)
        avg_rmse += rmse / 5  # Taking average RMSE over folds
        
    return avg_rmse

In [2]:
study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_lgb.optimize(objective_lgb, n_trials=100)

best_params_lgb = study_lgb.best_params
best_rmse_lgb = study_lgb.best_value

study_xgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(objective_xgb, n_trials=100)

best_params_xgb = study_xgb.best_params
best_rmse_xgb = study_xgb.best_value

[I 2023-12-13 10:13:31,206] A new study created in memory with name: no-name-dde4da1b-4f7a-4a63-8a90-52a09d270d65
[I 2023-12-13 10:13:32,914] Trial 0 finished with value: 0.025383145516882366 and parameters: {'lambda_l1': 2.348881295853308e-05, 'lambda_l2': 3.6010467344475403, 'num_leaves': 188, 'feature_fraction': 0.6789267873576292, 'bagging_fraction': 0.3248149123539492, 'bagging_freq': 1, 'min_child_samples': 10, 'max_depth': 27}. Best is trial 0 with value: 0.025383145516882366.
[I 2023-12-13 10:13:33,376] Trial 1 finished with value: 0.025400561065519283 and parameters: {'lambda_l1': 0.002570603566117598, 'lambda_l2': 0.023585940584142682, 'num_leaves': 7, 'feature_fraction': 0.9759278817295955, 'bagging_fraction': 0.8659541126403374, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 7}. Best is trial 0 with value: 0.025383145516882366.
[I 2023-12-13 10:13:35,265] Trial 2 finished with value: 0.025770321119838478 and parameters: {'lambda_l1': 5.472429642032198e-06, 'lambda

In [5]:
def cal_ICIR(data: pd.DataFrame, feild: str) -> tuple[float, float]:
    """
    data is a dataframe with columns: date, return, factor feild
    feild is the factor name
    return IC and IR
    """
    data = data.loc[:, ['date', 'return', feild]]
    data.dropna(inplace=True)
    IC_dataframe = data.groupby('date').apply(lambda x: x.corr(method='spearman')[feild]['return'])
    return IC_dataframe.mean(), IC_dataframe.mean()/IC_dataframe.std()

def test_factor(ICIR: tuple[float, float]) -> str:
    """
    ICIR is a tuple of IC and IR
    return the test result
    """
    if abs(ICIR[0]) > 0.01 and abs(ICIR[1]) > 0.03:
        return 'pass'
    else:
        return 'fail'

In [2]:
import joblib
model = joblib.load('./model/lightGBM.pkl')

In [6]:
dtrain = lgb.Dataset(X, label=y)
model = lgb.train(best_params_lgb, dtrain)
preds = model.predict(X_outsample_test) 
matrix = pd.concat([other_info_outsample_test, pd.DataFrame(preds, columns=['preds'], index = X_outsample_test.index), y_outsample_test], axis=1)
ICIR = cal_ICIR(matrix, "preds")
print(ICIR)
result = test_factor(ICIR)

(0.04098354371529285, 0.10894446726011417)


In [5]:
import joblib
joblib.dump(model, filename='./model/lightGBM.pkl')

['./model/lightGBM.pkl']

In [7]:
matrix.to_csv('./result/lightGBM.csv')

In [12]:
dtrain = xgb.DMatrix(X, label=y)
model = xgb.train(best_params_xgb, dtrain)
preds = model.predict(xgb.DMatrix(X_outsample_test))
matrix = pd.concat([other_info_outsample_test, pd.DataFrame(preds, columns=['preds'], index = X_outsample_test.index), y_outsample_test], axis=1)
ICIR = cal_ICIR(matrix, "preds")
print(ICIR)
result = test_factor(ICIR)
print(result)

(0.03939948351261069, 0.10843730464715315)
pass


In [9]:
import joblib
joblib.dump(model, filename='./model/XGBoost.pkl')

['./model/XGBoost.pkl']

In [13]:
matrix.to_csv('./result/XGBoost.csv')

In [14]:
abs_IC = []
abs_IR = []
for i in outdsample_data.columns[4:-1]:
    ICIR = cal_ICIR(outdsample_data, i)
    abs_IC.append(abs(ICIR[0]))
    abs_IR.append(abs(ICIR[1]))
    print(i)
    print(ICIR)
    result = test_factor(ICIR)
    print(result)

dividendyield
(0.015824726607574644, 0.042184647306419805)
pass
day_momentum
(-0.03689649790894963, -0.09217629437455241)
pass
val_ortoev_ttm
(0.004996381701627184, 0.010633179127117202)
fail
val_lnmv
(-0.010571168040773474, -0.02969597672129529)
fail
val_lntotassets
(0.0010502788330560924, 0.003110385890002309)
fail
fa_sellexpensetogr_ttm
(-0.003763208351110012, -0.010505331561647416)
fail
fa_salestocost_ttm
(0.0028314831635785215, 0.007291340941836725)
fail
mmt_discret_w
(-0.012501906316337128, -0.034048293924522015)
pass
vol_up_std_m
(-0.001158797691518292, -0.002693989094036354)
fail
dividendyield2
(0.004289274651796401, 0.009095240775840676)
fail
val_floatmv
(-0.0024735025088819806, -0.0075502323326710075)
fail
risk_variance20
(-0.009785025227259643, -0.02135754289821015)
fail
risk_lossvariance20
(-0.017069249920817615, -0.038097589977360576)
pass
risk_beta20
(0.010234820763753061, 0.024995799227719252)
fail
risk_volatilityratio
(-0.013005732597344619, -0.029162272673061564)
fail


In [15]:
sorted(abs_IC, reverse=True)

[0.03689649790894963,
 0.026512998470046022,
 0.019731197614579584,
 0.018572959184153297,
 0.01710687269746506,
 0.017069249920817615,
 0.015824726607574644,
 0.013005732597344619,
 0.012501906316337128,
 0.012481426753688983,
 0.010571168040773474,
 0.010234820763753061,
 0.009785025227259643,
 0.009503046040429404,
 0.007915464923685743,
 0.007320708279582767,
 0.004996381701627184,
 0.004758114551516592,
 0.004289274651796401,
 0.003763208351110012,
 0.0028314831635785215,
 0.0024735025088819806,
 0.001441978074287302,
 0.001158797691518292,
 0.0010502788330560924,
 0.0004440169096252624,
 nan]

In [16]:
sorted(abs_IR, reverse=True)

[0.09217629437455241,
 0.07745444774194984,
 0.05645075343469298,
 0.0543580827941368,
 0.04825639989170216,
 0.042184647306419805,
 0.038097589977360576,
 0.034048293924522015,
 0.02969597672129529,
 0.029578191177139307,
 0.029162272673061564,
 0.026925380060587138,
 0.024995799227719252,
 0.022212844881620723,
 0.02135754289821015,
 0.017652525366289463,
 0.013341822957780526,
 0.010633179127117202,
 0.010505331561647416,
 0.009095240775840676,
 0.0075502323326710075,
 0.007291340941836725,
 0.0035244772111479995,
 0.003110385890002309,
 0.002693989094036354,
 0.0013220020126310454,
 nan]