In [2]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
import statsmodels.api as sm
import os 
from matplotlib import pyplot as plt
#时间
import datetime

#算法辅助&数据
from sklearn.model_selection import KFold,cross_validate   #交叉验证
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.model_selection import train_test_split     #训练集测试集拆分

#算法（单一学习器）                          
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier   
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC

#融合模型
from sklearn.ensemble import VotingClassifier

# 关闭警告
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger().setLevel(logging.ERROR)

### 1 数据导入

In [5]:
# 基准指数
index_item = '000300.XSHG'
file_path = './300/Financials'   #InformationTechnology
# 机器学习数据
data = pd.read_pickle(os.path.join(file_path, 'data/ml_data.pkl')) #'./300/Financials/data/ml_data.pkl'
# 研究周期
date_list = sorted(set(data.index.get_level_values(0)))                                    # 研究周期确定
# 建立特征集                                         
data

Unnamed: 0_level_0,Unnamed: 1_level_0,pe_ratio_ttm,ep_ratio_ttm,pb_ratio_ttm,book_to_market_ratio_ttm,dividend_yield_ttm,ps_ratio_ttm,sp_ratio_ttm,weighted_common_stock_ttm,diluted_common_stock_ttm,fixed_asset_turnover_ttm,...,deferred_revenue_ttm_0,paid_in_capital_ttm_0,equity_preferred_stock_ttm_0,capital_reserve_ttm_0,surplus_reserve_ttm_0,general_reserve_ttm_0,ret_5d,excess_ret_5d,current_ret,target
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-01-02,000001.XSHE,0.491892,-2.352586,0.829540,-1.939777,-3.267715,0.167533,-0.436247,-0.012161,-0.173722,0.818821,...,,0.229447,-1.245639,1.067219,-0.788240,1.045909,-0.008855,-0.002834,0.017415,False
2020-01-02,000166.XSHE,-0.399074,0.076039,-0.473085,0.145983,-0.141754,-0.141216,-0.540358,3.331139,3.197824,0.240929,...,,3.248588,,-0.900885,-0.648681,0.112428,-0.003888,0.002134,-0.007736,True
2020-01-02,000627.XSHE,-0.363849,0.201827,-0.063105,-0.110651,-0.549181,-2.651091,3.004346,0.244681,0.236792,1.448473,...,,0.299717,,0.155598,-0.261653,,-0.015576,-0.009555,-0.004240,False
2020-01-02,000728.XSHE,-0.047514,0.023232,-1.169693,1.038002,2.207949,1.176209,-0.481576,0.013457,0.027954,-2.062625,...,,-0.011840,,0.422706,0.336175,0.426688,0.004314,0.010336,-0.010664,True
2020-01-02,000776.XSHE,-0.654702,0.346968,-1.328400,1.008386,0.264397,-0.265885,-0.471541,-0.469797,-0.486314,-0.564988,...,,-0.420020,,0.594528,0.502844,0.647267,0.012988,0.019009,-0.006452,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-28,601916.XSHG,-0.628489,0.688156,-0.291167,1.150332,-0.478289,-0.867428,1.767644,1.747852,1.288607,-0.809025,...,,1.829352,,0.186123,-0.731517,-0.244833,,,0.000000,False
2025-03-28,601939.XSHG,0.757571,-0.785475,-0.002010,-0.503697,0.028391,0.844508,-1.038814,-0.209199,-0.048738,0.025757,...,,-0.260284,-0.981350,-0.080821,-0.377893,-1.170047,,,-0.003480,False
2025-03-28,601988.XSHG,0.889101,-1.062468,-0.130705,-0.190872,-0.308083,0.632296,-0.819172,0.030185,0.188264,-0.263880,...,,-0.012149,1.179011,0.279119,-0.148667,-0.769746,,,0.001808,False
2025-03-28,601995.XSHG,0.810638,-0.859175,0.525654,-0.293980,-0.947016,1.296612,-0.657063,-0.983669,,0.865794,...,,-1.069557,,0.040780,-0.882193,-1.225830,,,0.004033,False


In [6]:
# 特征集
x = data.loc[:,:'general_reserve_ttm_0']                                                                   # 去除文本数据，保留因子数据
# 目标集合
y = data.loc[:,'target']                                                                   # Y对象为binary数据1，0
# 样本分离
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)         # 拆分训练集和测试集
# 特征名称
features = x_train.columns.tolist()          

In [7]:
pre_gap = 240 * 2        # 预计算
ret_gap = 5              # 预测收益
predict_gap = 60         # 预测长度

# 滚动周期
trade_days = sorted(list(set(data.index.get_level_values(0).tolist())))
re_train_days = trade_days[pre_gap::predict_gap]

all_trading_dates = pickle.load(open('trading_dates.pkl','rb'))

def get_trading_dates(start_date,end_date):
    #input date are timestamps
    start_date = start_date.date()
    end_date = end_date.date()
    trading_dates = []
    for date in all_trading_dates:
        if start_date <= date <= end_date:
            trading_dates.append(date)
    return trading_dates

def get_previous_trading_date(dt, gap):
    idx = all_trading_dates.index(pd.Timestamp(dt).date())
    return all_trading_dates[max(0, idx - gap)]

def get_next_trading_date(dt, gap):
    idx = all_trading_dates.index(pd.Timestamp(dt).date())
    return all_trading_dates[min(len(all_trading_dates) - 1, idx + gap)]

def get_siganl_delay(x_test,T = 5):
    # 时间获取
    start_date = x_test.index.get_level_values(0).min()
    end_date = x_test.index.get_level_values(0).max()
    date_list = pd.to_datetime(get_trading_dates(start_date,end_date))
    # 信号数字化
    predict_signal = x_test['predict'].astype(int)
    # 信号递延
    predict_signal = predict_signal.unstack().fillna(0).reindex(date_list).unstack()
    predict_signal_delay = pd.DataFrame({i:predict_signal.groupby(level=0).apply(lambda x:x.shift(i)).droplevel(0) for i in range(1,T+1)}).sort_index()
    # 信号叠加
    predict_signal_delay['signal_level'] = predict_signal_delay.sum(axis = 1) 
    predict_signal_cum = predict_signal_delay.signal_level.unstack('order_book_id')
    # predict_signal_cum.stack().replace(0,np.nan).hist(bins = 5)
    # predict_signal_cum.stack().value_counts().sort_index()

    return predict_signal_cum,predict_signal_delay


# 获取标的收益
def get_bar(df):
    """
    :param df: 买入队列 -> dataframe/unstack
    :return ret: 基准的逐日收益 -> dataframe
    """
    start_date = get_previous_trading_date(df.index.min(),1).strftime('%F')
    end_date = df.index.max().strftime('%F')
    stock_list = df.columns.tolist()
    price_open = pd.read_pickle('./300/300_stock_open.pkl')
    price_open = price_open.loc[start_date:end_date]
    price_open = price_open[[i for i in price_open.columns if i in stock_list]]
    
    return price_open

# 回测框架
def backtest(df_weight, change_n = 20, cash = 10000 * 1000, tax = 0.0005, other_tax = 0.0001, commission = 0.0002, min_fee = 5, cash_interest_yield = 0.02):

    # 基础参数
    
    inital_cash = cash                                                                                                            # 起始资金
    stock_holding_num_hist = 0                                                                                                    # 初始化持仓       
    buy_cost = other_tax + commission                                                                                             # 买入交易成本
    sell_cost = tax + other_tax + commission                                                                                      # 卖出交易成本
    cash_interest_daily = (1 + cash_interest_yield) ** (1/252) - 1                                                                # 现金账户利息(日)
    account = pd.DataFrame(index = df_weight.index,columns=['total_account_asset','holding_market_cap','cash_account'])           # 账户信息存储
    price_open = get_bar(df_weight)                                                                                               # 获取开盘价格数据
    stock_round_lot = pd.Series(dict([(i,100) for i in df_weight.columns.tolist()]))                         # 标的最小买入数量
    change_day = sorted(set(df_weight.index.tolist()[::change_n] + [df_weight.index[-1]]))                                        # 调仓日期

    # 滚动计算
    for i in tqdm(range(0,len(change_day)-1)):
        start_date = change_day[i]
        end_date = change_day[i+1]

        # 获取给定权重
        df_weight_temp = df_weight.loc[start_date].dropna()
        stock_list_temp = df_weight_temp.index.tolist()
        # 计算个股持股数量 = 向下取整(给定权重 * 可用资金 // 最小买入股数) * 最小买入股数
        stock_holding_num = ((df_weight_temp 
                            * cash 
                            / (price_open.loc[start_date,stock_list_temp] * (1 + sell_cost))        # 预留交易费用
                            // stock_round_lot.loc[stock_list_temp]) 
                            * stock_round_lot.loc[stock_list_temp])

        # 仓位变动      
        ## 防止相减为空 & 剔除无变动
        stock_holding_num_change = stock_holding_num.sub(stock_holding_num_hist,fill_value = 0).replace(0,np.nan).dropna()
        # 获取期间价格
        price_open_temp = price_open.loc[start_date:end_date,stock_holding_num_change.index]           # 引入完整券池
        
        # 计算交易成本
        def calc_fee(x,min_fee):
            if x < 0:
                fee_temp = -1 * x * sell_cost                                                                                       # 印花税 + 过户费等 + 佣金
            else:
                fee_temp = x * buy_cost                                                                                             # 过户费等 + 佣金
            # 最低交易成本限制
            if fee_temp > min_fee:
                return fee_temp
            else:
                return min_fee

        transaction_costs = ((price_open_temp.loc[start_date] 
                            * stock_holding_num_change)).apply(lambda x: calc_fee(x,min_fee)).sum()
        # 计算期间市值 （交易手续费在现金账户计提）
        holding_market_cap = (price_open_temp * stock_holding_num).sum(axis =1)
        cash_account = cash - transaction_costs - holding_market_cap.loc[start_date]
        cash_account = pd.Series([cash_account * ((1 + cash_interest_daily)**(i+1)) for i in range(0,len(holding_market_cap))],
                                index = holding_market_cap.index)
        total_account_asset = holding_market_cap + cash_account
        
        # 将当前持仓存入 
        stock_holding_num_hist = stock_holding_num
        # 下一期期初可用资金
        cash = total_account_asset.loc[end_date]

        account.loc[start_date:end_date,'total_account_asset'] = round(total_account_asset,2)
        account.loc[start_date:end_date,'holding_market_cap'] = round(holding_market_cap,2)
        account.loc[start_date:end_date,'cash_account'] = round(cash_account,2)

    account.loc[pd.to_datetime(get_previous_trading_date(account.index.min(),1))] = [inital_cash,0,inital_cash]
    account = account.sort_index()
    
    return account



def get_benchmark(df,benchmark,benchmark_type):
    """
    :param df: 买入队列 -> dataframe/unstack
    :param benchmark: 基准指数 -> str
    :return ret: 基准的逐日收益 -> dataframe
    """
    start_date = get_previous_trading_date(df.index.min(),1).strftime('%F')
    end_date = df.index.max().strftime('%F')
    if benchmark_type == 'mcw':
        price_open = pd.read_pickle('./300/300_open.pkl')
        price_open = price_open.loc[start_date:end_date]
    else:
        index_fix = INDEX_FIX(start_date,end_date,benchmark)
        stock_list = index_fix.columns.tolist()
        price_open = get_price(stock_list,start_date,end_date,fields=['open']).open.unstack('order_book_id')
        price_open = price_open.pct_change().mask(~index_fix).mean(axis = 1)
        price_open = (1 + price_open).cumprod().to_frame(benchmark)
    
    return price_open



# 回测绩效指标绘制
def get_performance_analysis(account_result,benchmark_index,benchmark_type = 'mcw'):
    
    rf = 0.03

    # 加入基准    
    performance = pd.concat([account_result['total_account_asset'].to_frame('strategy'),
                             get_benchmark(account_result,benchmark_index,benchmark_type)],axis = 1)
    performance_net = performance.pct_change().dropna(how = 'all')                                # 清算至当日开盘
    performance_cumnet = (1 + performance_net).cumprod()
    performance_cumnet['alpha'] = performance_cumnet['strategy']/performance_cumnet[benchmark_index]
    performance_cumnet = performance_cumnet.fillna(1)

    # 指标计算
    performance_pct = performance_cumnet.pct_change().dropna()

    # 策略收益
    strategy_name,benchmark_name,alpha_name = performance_cumnet.columns.tolist() 
    Strategy_Final_Return = performance_cumnet[strategy_name].iloc[-1] - 1

    # 策略年化收益
    Strategy_Annualized_Return_EAR = (1 + Strategy_Final_Return) ** (252/len(performance_cumnet)) - 1

    # 基准收益
    Benchmark_Final_Return = performance_cumnet[benchmark_name].iloc[-1] - 1

    # 基准年化收益
    Benchmark_Annualized_Return_EAR = (1 + Benchmark_Final_Return) ** (252/len(performance_cumnet)) - 1

    # alpha 
    ols_result = sm.OLS(performance_pct[strategy_name] * 252 - rf, sm.add_constant(performance_pct[benchmark_name] * 252 - rf)).fit()
    Alpha = ols_result.params[0]

    # beta
    Beta = ols_result.params[1]

    # beta_2 = np.cov(performance_pct[strategy_name],performance_pct[benchmark_name])[0,1]/performance_pct[benchmark_name].var()
    # 波动率
    Strategy_Volatility = performance_pct[strategy_name].std() * np.sqrt(252)

    # 夏普
    Strategy_Sharpe = (Strategy_Annualized_Return_EAR - rf)/Strategy_Volatility

    # 下行波动率
    strategy_ret = performance_pct[strategy_name]
    Strategy_Down_Volatility = strategy_ret[strategy_ret < 0].std() * np.sqrt(252)

    # sortino
    Sortino = (Strategy_Annualized_Return_EAR - rf)/Strategy_Down_Volatility
    
    # 跟踪误差
    Tracking_Error = (performance_pct[strategy_name] - performance_pct[benchmark_name]).std() * np.sqrt(252)

    # 信息比率
    Information_Ratio = (Strategy_Annualized_Return_EAR - Benchmark_Annualized_Return_EAR)/Tracking_Error

    # 最大回测
    i = np.argmax((np.maximum.accumulate(performance_cumnet[strategy_name]) 
                    - performance_cumnet[strategy_name])
                    /np.maximum.accumulate(performance_cumnet[strategy_name]))
    j = np.argmax(performance_cumnet[strategy_name][:i])
    Max_Drawdown = (1-performance_cumnet[strategy_name][i]/performance_cumnet[strategy_name][j])

    # 卡玛比率
    Calmar = (Strategy_Annualized_Return_EAR)/Max_Drawdown

    # 超额收益
    Alpha_Final_Return = performance_cumnet[alpha_name].iloc[-1] - 1

    # 超额年化收益
    Alpha_Annualized_Return_EAR = (1 + Alpha_Final_Return) ** (252/len(performance_cumnet)) - 1

    # 超额波动率
    Alpha_Volatility = performance_pct[alpha_name].std() * np.sqrt(252)

    # 超额夏普
    Alpha_Sharpe = (Alpha_Annualized_Return_EAR - rf)/Alpha_Volatility

    # 超额最大回测
    i = np.argmax((np.maximum.accumulate(performance_cumnet[alpha_name]) 
                    - performance_cumnet[alpha_name])
                    /np.maximum.accumulate(performance_cumnet[alpha_name]))
    j = np.argmax(performance_cumnet[alpha_name][:i])
    Alpha_Max_Drawdown = (1-performance_cumnet[alpha_name][i]/performance_cumnet[alpha_name][j])

    # 胜率
    performance_pct['win'] = performance_pct[alpha_name] > 0
    Win_Ratio = performance_pct['win'].value_counts().loc[True] / len(performance_pct)

    # 盈亏比
    profit_lose = performance_pct.groupby('win')[alpha_name].mean()
    Profit_Lose_Ratio = abs(profit_lose[True]/profit_lose[False])
    

    result = {
        '策略累计收益':round(Strategy_Final_Return,4),
        '策略年化收益': round(Strategy_Annualized_Return_EAR,4),
        '基准累计收益':round(Benchmark_Final_Return,4),
        '基准年化收益': round(Benchmark_Annualized_Return_EAR,4),
        '阿尔法':round(Alpha,4),
        '贝塔':round(Beta,4),
        '波动率':round(Strategy_Volatility,4),
        '夏普比率':round(Strategy_Sharpe,4),
        '下行波动率':round(Strategy_Down_Volatility,4),
        '索提诺比率':round(Sortino,4),
        '跟踪误差':round(Tracking_Error,4),
        '信息比率':round(Information_Ratio,4),
        '最大回撤':round(Max_Drawdown,4),
        '卡玛比率': round(Calmar,4),
        '超额累计收益':round(Alpha_Final_Return,4),
        '超额年化收益': round(Alpha_Annualized_Return_EAR,4),
        '超额波动率':round(Alpha_Volatility,4),
        '超额夏普':round(Alpha_Sharpe,4),
        '超额最大回测':round(Alpha_Max_Drawdown,4),
        '胜率':round(Win_Ratio,4),
        '盈亏比':round(Profit_Lose_Ratio,4)
    }
    

    return performance_cumnet,result



### 2 模型训练

#### randomforest

In [5]:
def best_estimetor_RandomForestClassifier(data_train_input,fatcors):
    precision_postive_cv_clf = pd.DataFrame()
    for j in tqdm(range(3,8,2)):
        for t in np.logspace(-8,-5,3):
            for z in range(int(np.sqrt(len(fatcors))),len(fatcors)-1,4):
                clf = RandomForestClassifier(n_estimators=10,
                                             criterion='gini',
                                             max_features= z,
                                             max_depth=j,
                                             class_weight='balanced',
                                             ccp_alpha=t,
                                             random_state = 0,
                                             n_jobs = -2)         #生成随机森林
                precision_postive_cv = 0
                for i in range(1,6):
                    data_train = data_train_input[data_train_input.label != i]
                    data_validation = data_train_input[data_train_input.label == i]
                    x_train = data_train.loc[:,fatcors]
                    y_train = data_train.loc[:,'target']
                    x_val = data_validation.loc[:,fatcors]
                    y_val = data_validation.loc[:,'target']
                    clf.fit(x_train,y_train)
                    y_hat = clf.predict(x_val)                                          # 训练模型
                    martrix = confusion_matrix(y_val, y_hat)
                    metric = martrix[1,1] / martrix[:,1].sum()                          # 精确率 预测是涨结果真的涨了
                    # metric = (martrix[1,1] + martrix[0,0]) / martrix.sum().sum()      # 准确率 预测是涨结果真的涨了
                    precision_postive_cv += metric
                temp = pd.DataFrame([j,t,z,precision_postive_cv/5],index = ['max_depth','ccp_alpha','max_features','precision_postive_cv'])
                precision_postive_cv_clf = pd.concat([precision_postive_cv_clf,temp],axis = 1)
    return precision_postive_cv_clf

In [129]:
model_name = 'rf'
data_test_total = pd.DataFrame()

for i in tqdm(re_train_days):            # 测试5期
    # 训练开始/结束时间
    train_start = pd.Timestamp(get_previous_trading_date(i,(pre_gap + ret_gap)))
    train_end = pd.Timestamp(get_previous_trading_date(i,ret_gap))
    # 测试开始/结束时间
    test_start = i
    test_end = pd.Timestamp(get_next_trading_date(i,predict_gap - 1))
    # 训练集数据
    data_train = data.loc[train_start:train_end]
    data_train_x = data_train.loc[:,features]
    data_train_y = data_train.loc[:,'target']
    # 测试集数据
    data_test = data.loc[test_start:test_end]
    data_test_x = data_test.loc[:,features]
    
    # 打标记号_5fold_交叉验证
    date_len = sorted(set(data_train.index.get_level_values(0)))
    fold_length = (len(date_len) - 80)//5
    label = ([1] * fold_length 
            + [0] * 20 
            + [2] * fold_length 
            + [0] * 20 
            + [3] * fold_length 
            + [0] * 20 
            + [4] * fold_length 
            + [0] * 20 
            + [5] * (len(date_len) - 80 - (fold_length * 4))
            )
    for i,j in zip(date_len,label):
        data_train.loc[i,'label'] = j
    
    # 模型训练
    paramter = best_estimetor_RandomForestClassifier(data_train,features).T.sort_values(by ='precision_postive_cv',ascending = False).iloc[0].to_dict()
    print(i,paramter)
    # 模型输出
    clf = RandomForestClassifier(criterion = 'gini',
                                 max_depth = int(paramter['max_depth']),
                                 class_weight = 'balanced',
                                 ccp_alpha = paramter['ccp_alpha'],
                                 max_features = int(paramter['max_features']),
                                 random_state = 0,
                                 n_jobs = -2)

    # 预测结果输出
    y_hat = clf.fit(data_train_x,data_train_y).predict(data_test_x)
    data_test['predict'] = y_hat
    data_test_total = pd.concat([data_test_total,data_test[['predict']]],axis = 0)

data_test_total.to_pickle(os.path.join(file_path, f'data/data_test_total_{model_name}_all.pkl'))

100%|██████████| 3/3 [08:55<00:00, 178.39s/it]


2021-12-17 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 1e-08, 'max_features': 56.0, 'precision_postive_cv': 0.47701520096308025}


100%|██████████| 3/3 [08:40<00:00, 173.61s/it]it]


2022-03-21 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 3.162277660168379e-07, 'max_features': 12.0, 'precision_postive_cv': 0.501602799508617}


100%|██████████| 3/3 [08:31<00:00, 170.58s/it]it]


2022-06-21 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 1e-05, 'max_features': 32.0, 'precision_postive_cv': 0.505049423676636}


100%|██████████| 3/3 [08:12<00:00, 164.16s/it]it]


2022-09-14 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 1e-05, 'max_features': 48.0, 'precision_postive_cv': 0.5096420184214152}


100%|██████████| 3/3 [07:46<00:00, 155.50s/it]it]


2022-12-14 00:00:00 {'max_depth': 3.0, 'ccp_alpha': 1e-05, 'max_features': 68.0, 'precision_postive_cv': 0.5240057592656968}


100%|██████████| 3/3 [07:14<00:00, 144.68s/it]it]


2023-03-16 00:00:00 {'max_depth': 3.0, 'ccp_alpha': 1e-05, 'max_features': 68.0, 'precision_postive_cv': 0.5235513461747923}


100%|██████████| 3/3 [06:51<00:00, 137.27s/it]it]


2023-06-14 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 1e-05, 'max_features': 72.0, 'precision_postive_cv': 0.5183452655705423}


100%|██████████| 3/3 [06:22<00:00, 127.55s/it]]  


2023-09-08 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 1e-05, 'max_features': 56.0, 'precision_postive_cv': 0.5186167908626625}


100%|██████████| 3/3 [06:19<00:00, 126.44s/it]it]


2023-12-11 00:00:00 {'max_depth': 3.0, 'ccp_alpha': 1e-08, 'max_features': 60.0, 'precision_postive_cv': 0.5198050634701163}


100%|██████████| 3/3 [05:57<00:00, 119.31s/it]it]


2024-03-13 00:00:00 {'max_depth': 5.0, 'ccp_alpha': 3.162277660168379e-07, 'max_features': 32.0, 'precision_postive_cv': 0.5266753340134738}


100%|██████████| 3/3 [05:53<00:00, 117.78s/it]/it]


2024-06-13 00:00:00 {'max_depth': 7.0, 'ccp_alpha': 3.162277660168379e-07, 'max_features': 60.0, 'precision_postive_cv': 0.5332032853277555}


100%|██████████| 3/3 [05:56<00:00, 118.91s/it]/it]


2024-09-05 00:00:00 {'max_depth': 5.0, 'ccp_alpha': 1e-08, 'max_features': 8.0, 'precision_postive_cv': 0.528400716789086}


100%|██████████| 3/3 [06:07<00:00, 122.50s/it]/it]


2024-12-09 00:00:00 {'max_depth': 5.0, 'ccp_alpha': 1e-05, 'max_features': 8.0, 'precision_postive_cv': 0.5181064043272627}


100%|██████████| 3/3 [06:36<00:00, 132.28s/it]/it]


2025-03-12 00:00:00 {'max_depth': 3.0, 'ccp_alpha': 1e-08, 'max_features': 12.0, 'precision_postive_cv': 0.5414420873756441}


100%|██████████| 14/14 [1:40:49<00:00, 432.07s/it]


In [130]:
model_name = 'rf'
data_test_total = pd.read_pickle(os.path.join(file_path, f'data/data_test_total_{model_name}_all.pkl'))

In [131]:
# 信号递延
predict_signal_cum,predict_signal_delay = get_siganl_delay(data_test_total)
# 回测检验
buy_list = (predict_signal_cum != 0).astype(int)
df_wight = buy_list.div(buy_list.sum(axis = 1),axis = 0).replace(0,np.nan)
df_wight = df_wight.shift(1).dropna(how = 'all')
account_result = backtest(df_wight,5)
performance_cumnet,result = get_performance_analysis(account_result,index_item)

# Create figure
fig = plt.figure(figsize=(12, 6))
performance_cumnet.plot(secondary_y='alpha')

# Add title and labels
plt.title('Performance and Alpha Over Time')
plt.xlabel('Date')
plt.ylabel('Performance')
plt.ylabel('Alpha', rotation=270, labelpad=15)

# Save figure
plt.savefig(os.path.join(file_path, f'performance_{model_name}.png'), dpi=300, bbox_inches='tight')
plt.close()

# save results dict to pickle
with open(os.path.join(file_path, f'results_dict_{model_name}.pkl'), 'wb') as f:
    pickle.dump(result, f)


100%|██████████| 156/156 [00:00<00:00, 721.31it/s]


<Figure size 1200x600 with 0 Axes>

#### xgboost

In [8]:
def best_estimetor_XGBoost(data_train_input,fatcors):
    precision_postive_cv_clf = pd.DataFrame()
    for max_depth_level in tqdm([3,6,9]):
        for eta_level in [0.8,1,1.2]:
            for gamma_level in [0.001, 0.01, 0.1]:
                clf = XGBClassifier(max_depth=max_depth_level,eta = eta_level,gamma = gamma_level)
                precision_postive_cv = 0
                for i in range(1,6):
                    data_train = data_train_input[data_train_input.label != i]
                    data_validation = data_train_input[data_train_input.label == i]
                    x_train = data_train.loc[:,fatcors]
                    y_train = data_train.loc[:,'target']
                    x_val = data_validation.loc[:,fatcors]
                    y_val = data_validation.loc[:,'target']
                    clf.fit(x_train,y_train)
                    y_hat = clf.predict(x_val)                                          # 训练模型
                    martrix = confusion_matrix(y_val, y_hat)
                    metric = martrix[1,1] / martrix[:,1].sum()                          # 精确率 预测是涨结果真的涨了
                    # metric = (martrix[1,1] + martrix[0,0]) / martrix.sum().sum()      # 准确率 预测是涨结果真的涨了
                    precision_postive_cv += metric
                temp = pd.DataFrame([max_depth_level,eta_level,gamma_level,precision_postive_cv/5],index = ['max_depth_level','eta_level','gamma_level','precision_postive_cv'])
                precision_postive_cv_clf = pd.concat([precision_postive_cv_clf,temp],axis = 1)
    return precision_postive_cv_clf


In [9]:
model_name = 'xgb'
data_test_total = pd.DataFrame()

for i in tqdm(re_train_days):   # 测试5期
    # 训练开始/结束时间
    train_start = pd.Timestamp(get_previous_trading_date(i,(pre_gap + ret_gap)))
    train_end = pd.Timestamp(get_previous_trading_date(i,ret_gap))
    # 测试开始/结束时间
    test_start = i
    test_end = pd.Timestamp(get_next_trading_date(i,predict_gap - 1))
    # 训练集数据
    data_train = data.loc[train_start:train_end]
    data_train_x = data_train.loc[:,features]
    data_train_y = data_train.loc[:,'target']
    # 测试集数据
    data_test = data.loc[test_start:test_end]
    data_test_x = data_test.loc[:,features]
    
    # 打标记号_5fold_交叉验证
    date_len = sorted(set(data_train.index.get_level_values(0)))
    fold_length = (len(date_len) - 80)//5
    label = ([1] * fold_length 
            + [0] * 20 
            + [2] * fold_length 
            + [0] * 20 
            + [3] * fold_length 
            + [0] * 20 
            + [4] * fold_length 
            + [0] * 20 
            + [5] * (len(date_len) - 80 - (fold_length * 4))
            )
    for i,j in zip(date_len,label):
        data_train.loc[i,'label'] = j
    
    # 模型训练
    paramter = best_estimetor_XGBoost(data_train,features).T.sort_values(by ='precision_postive_cv',ascending = False).iloc[0].to_dict()
    print(i,paramter)
    # 模型输出
    clf = XGBClassifier(max_depth=int(paramter['max_depth_level']),
                        eta = paramter['eta_level'],
                        gamma = paramter['gamma_level'])


    # 预测结果输出
    y_hat = clf.fit(data_train_x,data_train_y).predict(data_test_x)
    data_test['predict'] = y_hat
    data_test_total = pd.concat([data_test_total,data_test[['predict']]],axis = 0)

data_test_total.to_pickle(os.path.join(file_path, f'data/data_test_total_{model_name}_all.pkl'))

100%|██████████| 3/3 [01:14<00:00, 24.92s/it]


2021-12-17 00:00:00 {'max_depth_level': 9.0, 'eta_level': 1.0, 'gamma_level': 0.001, 'precision_postive_cv': 0.5000316963602932}


100%|██████████| 3/3 [01:19<00:00, 26.50s/it]]


2022-03-21 00:00:00 {'max_depth_level': 9.0, 'eta_level': 1.0, 'gamma_level': 0.1, 'precision_postive_cv': 0.5026048095116482}


100%|██████████| 3/3 [01:23<00:00, 27.83s/it]]


2022-06-21 00:00:00 {'max_depth_level': 3.0, 'eta_level': 1.0, 'gamma_level': 0.1, 'precision_postive_cv': 0.5137101557116253}


100%|██████████| 3/3 [01:33<00:00, 31.29s/it]]


2022-09-14 00:00:00 {'max_depth_level': 9.0, 'eta_level': 1.0, 'gamma_level': 0.001, 'precision_postive_cv': 0.5178720794726646}


100%|██████████| 3/3 [01:32<00:00, 30.68s/it]]


2022-12-14 00:00:00 {'max_depth_level': 6.0, 'eta_level': 0.8, 'gamma_level': 0.01, 'precision_postive_cv': 0.521321500661595}


100%|██████████| 3/3 [01:30<00:00, 30.31s/it]]


2023-03-16 00:00:00 {'max_depth_level': 3.0, 'eta_level': 1.2, 'gamma_level': 0.001, 'precision_postive_cv': 0.5100423759553093}


100%|██████████| 3/3 [01:29<00:00, 29.75s/it]]


2023-06-14 00:00:00 {'max_depth_level': 9.0, 'eta_level': 0.8, 'gamma_level': 0.01, 'precision_postive_cv': 0.5186681725528688}


100%|██████████| 3/3 [01:28<00:00, 29.47s/it]]


2023-09-08 00:00:00 {'max_depth_level': 9.0, 'eta_level': 0.8, 'gamma_level': 0.001, 'precision_postive_cv': 0.5209135092849546}


100%|██████████| 3/3 [01:25<00:00, 28.46s/it]]


2023-12-11 00:00:00 {'max_depth_level': 9.0, 'eta_level': 1.0, 'gamma_level': 0.001, 'precision_postive_cv': 0.523644668159087}


100%|██████████| 3/3 [01:25<00:00, 28.54s/it]]


2024-03-13 00:00:00 {'max_depth_level': 9.0, 'eta_level': 0.8, 'gamma_level': 0.1, 'precision_postive_cv': 0.500776043500324}


100%|██████████| 3/3 [01:22<00:00, 27.38s/it]t]


2024-06-13 00:00:00 {'max_depth_level': 6.0, 'eta_level': 0.8, 'gamma_level': 0.1, 'precision_postive_cv': 0.5137848608327606}


100%|██████████| 3/3 [01:21<00:00, 27.12s/it]t]


2024-09-05 00:00:00 {'max_depth_level': 9.0, 'eta_level': 0.8, 'gamma_level': 0.1, 'precision_postive_cv': 0.5221929012529378}


100%|██████████| 3/3 [01:22<00:00, 27.59s/it]t]


2024-12-09 00:00:00 {'max_depth_level': 9.0, 'eta_level': 1.0, 'gamma_level': 0.001, 'precision_postive_cv': 0.5250510903324799}


100%|██████████| 3/3 [01:22<00:00, 27.42s/it]t]


2025-03-12 00:00:00 {'max_depth_level': 6.0, 'eta_level': 1.0, 'gamma_level': 0.001, 'precision_postive_cv': 0.5255311341355545}


100%|██████████| 14/14 [20:05<00:00, 86.11s/it]


In [12]:
model_name = 'xgb'
data_test_total = pd.read_pickle(os.path.join(file_path, f'data/data_test_total_{model_name}_all.pkl'))
data_test_total

Unnamed: 0_level_0,Unnamed: 1_level_0,predict
date,order_book_id,Unnamed: 2_level_1
2021-12-24,000001.XSHE,0
2021-12-24,000166.XSHE,0
2021-12-24,000776.XSHE,0
2021-12-24,000783.XSHE,1
2021-12-24,002142.XSHE,1
...,...,...
2025-03-28,601916.XSHG,1
2025-03-28,601939.XSHG,1
2025-03-28,601988.XSHG,1
2025-03-28,601995.XSHG,0


In [11]:
# 信号递延
predict_signal_cum,predict_signal_delay = get_siganl_delay(data_test_total)
# 回测检验
buy_list = (predict_signal_cum != 0).astype(int)
df_wight = buy_list.div(buy_list.sum(axis = 1),axis = 0).replace(0,np.nan)
df_wight = df_wight.shift(1).dropna(how = 'all')
account_result = backtest(df_wight,5)
performance_cumnet,result = get_performance_analysis(account_result,index_item)

# Create figure
fig = plt.figure(figsize=(12, 6))
performance_cumnet.plot(secondary_y='alpha')

# Add title and labels
plt.title('Performance and Alpha Over Time')
plt.xlabel('Date')
plt.ylabel('Performance')
plt.ylabel('Alpha', rotation=270, labelpad=15)

# Save figure
plt.savefig(os.path.join(file_path, f'performance_{model_name}.png'), dpi=300, bbox_inches='tight')
plt.close()

# save results dict to pickle
with open(os.path.join(file_path, f'results_dict_{model_name}.pkl'), 'wb') as f:
    pickle.dump(result, f)


100%|██████████| 157/157 [00:00<00:00, 607.88it/s]


<Figure size 1200x600 with 0 Axes>

# collect results

In [153]:
sec_list = ['Energy','Materials','ConsumerDiscretionary',
 'ConsumerStaples','HealthCare','Financials','RealEstate',
 'InformationTechnology','TelecommunicationServices','Utilities','Industrials']

model = ['rf','xgb']

result_list = {}
for sec in sec_list:
    for model_name in model:
        file_path = f'./300/{sec}'
        try:
            with open(os.path.join(file_path, f'results_dict_{model_name}.pkl'), 'rb') as f:
                result = pickle.load(f)
            result_list.update({(sec,model_name):result})
        except:
            pass
pd.DataFrame(result_list).T.to_excel('sector_backtest_result.xlsx')

In [152]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.
