In [1]:
import json
import os
from collections import Counter

import numpy as np

from alphagen.data.expression import *
from alphagen.models.alpha_pool import AlphaPool
from alphagen.utils.correlation import batch_pearsonr, batch_spearmanr
from alphagen.utils.pytorch_utils import normalize_by_day
from alphagen.utils.random import reseed_everything
from alphagen_generic.operators import funcs as generic_funcs
from alphagen_generic.features import *
from alphagen_qlib.calculator import QLibStockDataCalculator
# from gplearn.fitness import make_fitness
# from gplearn.functions import make_function
# from gplearn.genetic import SymbolicRegressor


# funcs = [make_function(**func._asdict()) for func in generic_funcs]

instruments = '15min_symbols'
seed = 4
reseed_everything(seed)

cache = {}
# device = torch.device('cuda:0')
device = torch.device('cpu')
close = Feature(FeatureType.CLOSE)
target = Ref(close, -1) / close - 1
data_train = StockData(instrument=instruments,
                        start_time='2022-11-01 00:00:00',
                        end_time='2024-02-29 23:45:00')
data_valid = StockData(instrument=instruments,
                        start_time='2024-03-01 00:00:00',
                        end_time='2024-05-31 23:45:00')
data_test = StockData(instrument=instruments,
                        start_time='2024-06-01 00:00:00',
                        end_time='2024-08-31 23:45:00')
calculator_train = QLibStockDataCalculator(data_train, target)
calculator_valid = QLibStockDataCalculator(data_valid, target)
calculator_test = QLibStockDataCalculator(data_test, target)

pool = AlphaPool(capacity=10,
                 calculator=calculator_train,
                 ic_lower_bound=None,
                 l1_alpha=5e-3)


def try_single():
    top_key = Counter(cache).most_common(1)[0][0]
    expr = eval(top_key)
    ic_valid, ric_valid = calculator_valid.calc_single_all_ret(expr)
    ic_test, ric_test = calculator_test.calc_single_all_ret(expr)
    return {'ic_test': ic_test,
            'ic_valid': ic_valid,
            'ric_test': ric_test,
            'ric_valid': ric_valid}


def try_pool(capacity):
    pool = AlphaPool(capacity=capacity,
                     calculator=calculator_train,
                     ic_lower_bound=None)

    exprs = []
    for key in dict(Counter(cache).most_common(capacity)):
        exprs.append(eval(key))
    pool.force_load_exprs(exprs)
    pool._optimize(alpha=5e-3, lr=5e-4, n_iter=2000)

    ic_test, ric_test = pool.test_ensemble(calculator_test)
    ic_valid, ric_valid = pool.test_ensemble(calculator_valid)
    return {'ic_test': ic_test,
            'ic_valid': ic_valid,
            'ric_test': ric_test,
            'ric_valid': ric_valid}


generation = 0

def ev():
    global generation
    generation += 1
    res = (
        [{'pool': 0, 'res': try_single()}] +
        [{'pool': cap, 'res': try_pool(cap)} for cap in (10, 20, 50, 100)]
    )
    print(res)
    dir_ = './path/1110'
    os.makedirs(dir_, exist_ok=True)
    if generation % 2 == 0:
        with open(f'{dir_}/{generation}.json', 'w') as f:
            json.dump({'cache': cache, 'res': res}, f)

[528841:MainThread](2024-11-10 15:10:18,917) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[528841:MainThread](2024-11-10 15:10:19,201) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[528841:MainThread](2024-11-10 15:10:19,203) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/root/jupyter/CTA/alphagen/my_data/qlib')}


In [3]:
import ast
import astor
import re

class FormulaTransformer(ast.NodeTransformer):
    def visit_Call(self, node):
        # 检查是否是函数调用
        if isinstance(node.func, ast.Name):
            # 提取函数名称
            func_name = node.func.id

            match = re.match(r'(Ref|Mean|Sum|Std|Var|Max|Min|Med|Mad|Delta|WMA|EMA|Cov|Corr)(\d+)', func_name)
            if match:
                base_name = match.group(1)  # 提取函数名称 'mean' 或 'std'
                window_size = int(match.group(2))  # 提取数字部分

                # 创建新的函数名称节点
                node.func.id = base_name

                # 添加第二个参数（窗口大小）到参数列表
                node.args.append(ast.Constant(value=window_size, kind=None))

        # 递归访问子节点
        self.generic_visit(node)
        return node

In [4]:
from deap import base, creator, tools, gp, algorithms
from functools import partial
import random


features = ['open_', 'high', 'low', 'close', 'volume', 
            'quote_asset_volume', 'number_of_trades', 
            'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume']
# features = [open_, high, low, close, volume, 
#             quote_asset_volume, number_of_trades, 
#             taker_buy_base_asset_volume, taker_buy_quote_asset_volume]
constants = [f'Constant({v})' for v in [-30., -10., -5., -2., -1., -0.5, -0.01, 0.01, 0.5, 1., 2., 5., 10., 30.]]
# constants = []

terminals = features + constants

# Define custom fitness function for DEAP
def custom_fitness(individual):
    expression_tree = ast.parse(str(individual), mode='eval')
    transformer = FormulaTransformer()
    modified_tree = transformer.visit(expression_tree)
    new_expr = astor.to_source(modified_tree)
    new_expr = new_expr.replace("$", "")
    # print(new_expr)
    tmp = 'calculator_train.calc_single_IC_ret({})'.format(new_expr)
    # ic = eval(tmp)
    try:
        ic = eval(tmp)
    except OutOfDataRangeError:
        # print(new_expr)
        ic = -0.
    if np.isnan(ic):
        ic = -0.
    cache[new_expr] = ic
    return [abs(ic)]

# Define DEAP creator
creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Minimize fitness for regression
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)

# Initialize DEAP primitive set
pset = gp.PrimitiveSet("MAIN", arity=len(features))  # Set number of input variables based on your requirements
for i, feature in enumerate(terminals):
    pset.renameArguments(**{f'ARG{i}':feature})
# Add customized operation
for func in generic_funcs:
    pset.addPrimitive(func.function, func.arity, name=func.name)
    
# pset.addEphemeralConstant("randnum",  partial(random.choice, [-30., -10., -5., -2., -1., -0.5, -0.01, 0.01, 0.5, 1., 2., 5., 10., 30.]))

# Register necessary operators for DEAP's genetic programming
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=2, max_=6)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", custom_fitness)
toolbox.register("select", tools.selTournament, tournsize=600)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=3)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

In [5]:
import random
import numpy


def main():
    # random.seed(328)
    pop = toolbox.population(n=1000)
    hof = tools.HallOfFame(20)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    # stats_size = tools.Statistics(len)
    # mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.6, mutpb=0.3, ngen=40, stats=stats,
                                   halloffame=hof, verbose=True)
    # print log
    return pop, log, hof

In [6]:
pop, log, hof = main()

gen	nevals	avg       	std       	min	max      
0  	1000  	0.00217513	0.00285279	0  	0.0138404
1  	697   	0.00787773	0.00524527	0  	0.0143444
2  	729   	0.00866287	0.00571966	0  	0.0151518
3  	725   	0.00948997	0.00617635	0  	0.0151518
4  	741   	0.00936484	0.0061245 	0  	0.0151518
5  	733   	0.00944871	0.00617906	0  	0.0151518
6  	732   	0.00940598	0.00622308	0  	0.0151518
7  	693   	0.00981604	0.00617002	0  	0.0151518
8  	714   	0.00962173	0.00615225	0  	0.0151518
9  	690   	0.00958606	0.00610828	0  	0.0151518
10 	745   	0.00919567	0.00622346	0  	0.0172778
11 	713   	0.00909442	0.0059567 	0  	0.0172778
12 	710   	0.00973992	0.00752229	0  	0.0172778
13 	734   	0.0100606 	0.00753835	0  	0.0172778
14 	728   	0.00989319	0.00750672	0  	0.0172778
15 	706   	0.0102229 	0.00774259	0  	0.0642332
16 	680   	0.0184042 	0.0221635 	0  	0.0642332
17 	694   	0.0356047 	0.0305578 	0  	0.0642332
18 	721   	0.0340321 	0.0306428 	0  	0.0642332
19 	720   	0.0335647 	0.0305178 	0  	0.0642332
20 	728   	0.

In [7]:
for i, individual in enumerate(hof):
    print(f"Expression {i + 1}: {str(individual)}")

Expression 1: Div(WMA10(close), Abs(close))
Expression 2: Div(WMA10(close), close)
Expression 3: Div(EMA10(low), close)
Expression 4: Div(WMA10(Add(open_, open_)), close)
Expression 5: Div(WMA10(Abs(open_)), close)
Expression 6: Div(WMA10(open_), close)
Expression 7: Div(Abs(high), close)
Expression 8: Div(Less(Greater(high, low), EMA50(taker_buy_quote_asset_volume)), close)
Expression 9: Div(high, Abs(close))
Expression 10: Div(high, Less(close, close))
Expression 11: Div(high, close)
Expression 12: Div(close, high)
Expression 13: Div(WMA10(Abs(low)), close)
Expression 14: Div(WMA10(Greater(low, low)), close)
Expression 15: Div(WMA10(low), close)
Expression 16: Div(WMA10(high), close)
Expression 17: Div(Sum10(close), close)
Expression 18: Div(Mean10(close), close)
Expression 19: Div(close, Mean10(close))
Expression 20: Div(WMA10(WMA10(close)), close)


In [14]:
for i, individual in enumerate(hof):
    expression_tree = ast.parse(str(individual), mode='eval')
    transformer = FormulaTransformer()
    modified_tree = transformer.visit(expression_tree)
    new_expr = astor.to_source(modified_tree)
    new_expr = new_expr.replace("$", "")
    new_expr = eval(new_expr)
    data_df = data_test.make_dataframe(new_expr.evaluate(data_test))
    data_df.reset_index(inplace=True)
    data_df.rename(columns={'0':new_expr}, inplace=True)
    data_df.to_csv('./factors/{}.csv'.format(new_expr), index=None)

In [15]:
target

Sub(Div(Ref($close,-1),$close),Constant(1))