In [3]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd
from src.metric import ICMetric
from src.model import AlphaForge

# Filesystem roots used by the notebook; both strings end with a slash.
data_dir = '/home/m1ngx/quant/data/'
result_dir = '/home/m1ngx/quant/results/'

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load the daily panels. Every file is read from the `daily/` folder under
# `data_dir` (defined in the config cell) instead of repeating the absolute path.
daily_dir = data_dir + 'daily/'
close = pd.read_feather(daily_dir + 'close.ftr')
# NOTE: `open` shadows the Python builtin for the rest of the notebook; the
# name is kept because later cells refer to it.
open = pd.read_feather(daily_dir + 'open.ftr')
high = pd.read_feather(daily_dir + 'high.ftr')
low = pd.read_feather(daily_dir + 'low.ftr')
amount = pd.read_feather(daily_dir + 'amount.ftr')
volume = pd.read_feather(daily_dir + 'volume.ftr')

# Rows dated strictly before 2021-01-01 form the training window.
train_idx = close.index[pd.to_datetime(close.index) < '2021-01-01']

# Training slice of every feature panel, still as DataFrames.
feature_data = {
    'close': close.loc[train_idx],
    'open': open.loc[train_idx],
    'high': high.loc[train_idx],
    'low': low.loc[train_idx],
    'amount': amount.loc[train_idx],
    'volume': volume.loc[train_idx],
}

# Convert to float32 torch tensors on `device`. A new dict is built rather
# than overwriting entries of the dict that is being iterated.
feature_data = {
    name: torch.tensor(frame.values, dtype=torch.float32, device=device)
    for name, frame in feature_data.items()
}

# Label (use any label you like): 5-row forward return, turned into a
# percentile rank across columns within each row.
# The return is computed on the training slice only, so the last five training
# rows are NaN instead of being built from closes dated on/after the split
# (which would leak out-of-sample prices into the training label).
ret = close.loc[train_idx].pct_change(5).shift(-5)
ret_rank = ret.rank(axis=1, pct=True)
ret_rank = torch.tensor(ret_rank.values, dtype=torch.float32, device=device)
metric = ICMetric(ret_rank, method='pearson')

In [None]:
# Preview of the expected layout: every feature panel and the label use this
# same format, and NaN entries are allowed.
display(close.iloc[:5])

code,sh.600000,sh.600004,sh.600005,sh.600006,sh.600007,sh.600008,sh.600009,sh.600010,sh.600011,sh.600012,...,sz.301630,sz.301631,sz.301632,sz.301633,sz.301636,sz.301658,sz.301662,sz.301665,sz.301678,sz.302132
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-05,96.671576,16.021395,15.900525,16.326623,20.396198,32.716495,49.796891,22.123511,28.356944,9.668132,...,,,,,,,,,,31.11
2015-01-06,97.032515,16.076404,15.60529,16.588268,19.67146,35.990851,48.729641,21.561482,27.538955,10.201862,...,,,,,,,,,,33.76
2015-01-07,95.107506,15.911376,15.73182,16.24813,19.658518,34.529568,48.947942,21.919137,27.508659,9.896874,...,,,,,,,,,,33.51
2015-01-08,91.738739,15.56757,15.01482,15.72484,19.179673,34.231899,48.51134,21.561482,26.236232,9.622384,...,,,,,,,,,,33.56
2015-01-09,92.821557,15.636331,14.803937,15.515525,18.985547,33.095346,48.390062,21.357108,25.933273,9.561386,...,,,,,,,,,,31.87


In [None]:
# --- arguments passed to the AlphaForge constructor ---
window_sizes = [5, 10, 20]
hidden_size = 256
max_length = 10

# --- factor pool / sampling limits ---
max_factors = 100
init_sample_size = 4096
max_sample_size = 20_000

# --- "_p" training schedule (presumably the predictor network; confirm in src.model) ---
batch_size_p = 64
num_epochs_p = 1_000
learning_rate_p = 0.0001
early_stopping_p = 20

# --- "_g" training schedule (presumably the generator network; confirm in src.model) ---
batch_size_g = 64
num_epochs_g = 500
learning_rate_g = 0.001
early_stopping_g = 10

# --- thresholds forwarded to train_model ---
ic_threshold = 0.05
corr_threshold = 0.6

# Everything train_model receives, gathered in one place and passed by keyword.
train_kwargs = dict(
    max_factors=max_factors,
    init_sample_size=init_sample_size,
    max_sample_size=max_sample_size,
    batch_size_p=batch_size_p,
    num_epochs_p=num_epochs_p,
    learning_rate_p=learning_rate_p,
    early_stopping_p=early_stopping_p,
    batch_size_g=batch_size_g,
    num_epochs_g=num_epochs_g,
    learning_rate_g=learning_rate_g,
    early_stopping_g=early_stopping_g,
    ic_threshold=ic_threshold,
    corr_threshold=corr_threshold,
    result_dir=result_dir,
)

# Build the model on the training tensors and run training; the returned
# pool is iterated with `.items()` in the next cell.
alpha_forge = AlphaForge(feature_data, window_sizes, metric, hidden_size, max_length, device)
alpha_pool = alpha_forge.train_model(**train_kwargs)

In [None]:
# Full-history feature panels (every row of the loaded frames, not just the
# training slice), still as DataFrames at this point.
feature_data_all = {
    'close': close,
    'open': open,
    'high': high,
    'low': low,
    'amount': amount,
    'volume': volume,
}

# Convert each full-history DataFrame to a float32 tensor on `device`.
# This must read from `feature_data_all`: `feature_data` already holds the
# training-only tensors, and `.values` on those is not array data.
feature_data_all = {
    name: torch.tensor(frame.values, dtype=torch.float32, device=device)
    for name, frame in feature_data_all.items()
}

# calculate factor values using all data
# NOTE(review): `feature_data_all` is never handed to `alpha_forge` in this
# cell, so `calculate_expression` presumably still evaluates on the data given
# to the constructor (training rows only). If so, the result will not line up
# with `close.index` below -- confirm how AlphaForge expects new data to be
# supplied and pass `feature_data_all` accordingly.
factors = {}
for expr, _ in alpha_pool.items():
    data = alpha_forge.calculate_expression(expr, rank=True)
    factors[expr] = pd.DataFrame(data.cpu().numpy(), index=close.index, columns=close.columns)