In [1]:
import numpy as np
import pandas as pd
import warnings
import os
warnings.filterwarnings("ignore")

In [18]:
# 设置数据集路径
Workspace = "/home/vscode/workspace"
DATA_PATH = "data/store/external/MintueData/TickData/Price/"
DATA_PATH = os.path.join(Workspace, DATA_PATH)
DATA_PATH

'/home/vscode/workspace/data/store/external/MintueData/TickData/Price/'

In [19]:
df = pd.read_parquet(DATA_PATH + '20251020.parquet')
df.columns

Index(['code', 'DateTime', 'open', 'high', 'low', 'close', 'BidPrice01',
       'BidPrice02', 'BidPrice03', 'BidPrice04', 'BidPrice05', 'BidPrice06',
       'BidPrice07', 'BidPrice08', 'BidPrice09', 'BidPrice10', 'BidVolume01',
       'BidVolume02', 'BidVolume03', 'BidVolume04', 'BidVolume05',
       'BidVolume06', 'BidVolume07', 'BidVolume08', 'BidVolume09',
       'BidVolume10', 'AskPrice01', 'AskPrice02', 'AskPrice03', 'AskPrice04',
       'AskPrice05', 'AskPrice06', 'AskPrice07', 'AskPrice08', 'AskPrice09',
       'AskPrice10', 'AskVolume01', 'AskVolume02', 'AskVolume03',
       'AskVolume04', 'AskVolume05', 'AskVolume06', 'AskVolume07',
       'AskVolume08', 'AskVolume09', 'AskVolume10'],
      dtype='object')

In [20]:
df1 = df.loc[df['code']=='000001.sz']
df1.head(20)

Unnamed: 0,code,DateTime,open,high,low,close,BidPrice01,BidPrice02,BidPrice03,BidPrice04,...,AskVolume01,AskVolume02,AskVolume03,AskVolume04,AskVolume05,AskVolume06,AskVolume07,AskVolume08,AskVolume09,AskVolume10
0,000001.sz,20251020 92500,11.41,11.41,11.41,11.41,11.41,11.4,11.39,11.38,...,8000.0,8900.0,149200.0,58200.0,39500.0,156600.0,184000.0,220100.0,423100.0,169900.0
1,000001.sz,20251020 93000,11.41,11.41,11.41,11.41,11.41,11.4,11.39,11.38,...,26500.0,149300.0,56900.0,43500.0,161600.0,192000.0,220100.0,432400.0,174900.0,224500.0
2,000001.sz,20251020 93100,11.41,11.41,11.34,11.36,11.35,11.34,11.33,11.32,...,35500.0,3800.0,49700.0,21700.0,37400.0,8800.0,26400.0,313981.0,176100.0,287900.0
3,000001.sz,20251020 93200,11.35,11.38,11.34,11.37,11.37,11.36,11.35,11.34,...,30700.0,56400.0,9100.0,27600.0,303981.0,178000.0,287400.0,268300.0,240700.0,259300.0
4,000001.sz,20251020 93300,11.37,11.41,11.37,11.4,11.4,11.39,11.38,11.37,...,29100.0,142416.0,307181.0,188200.0,292600.0,268300.0,237900.0,250900.0,252500.0,572300.0
5,000001.sz,20251020 93400,11.41,11.41,11.37,11.38,11.37,11.36,11.35,11.34,...,213400.0,167500.0,32100.0,104500.0,176816.0,290681.0,195700.0,301600.0,276800.0,248900.0
6,000001.sz,20251020 93500,11.39,11.39,11.36,11.37,11.36,11.35,11.34,11.33,...,1000.0,66100.0,66200.0,125900.0,190800.0,177016.0,291181.0,192000.0,316600.0,265400.0
7,000001.sz,20251020 93600,11.37,11.38,11.37,11.38,11.37,11.36,11.35,11.34,...,37800.0,94300.0,148700.0,243400.0,160116.0,299881.0,193000.0,327500.0,276200.0,251400.0
8,000001.sz,20251020 93700,11.38,11.38,11.36,11.36,11.35,11.34,11.33,11.32,...,29800.0,81000.0,252900.0,223500.0,187500.0,245100.0,153516.0,306181.0,193000.0,349300.0
9,000001.sz,20251020 93800,11.37,11.37,11.35,11.36,11.35,11.34,11.33,11.32,...,286400.0,236700.0,240000.0,110300.0,168000.0,154700.0,150816.0,336281.0,195200.0,349300.0


In [74]:
class MinuteFactorCalculator:
    def __init__(self, minute_data, formulas, names):
        """
        分钟级数据因子计算
        minute_data:包含open, high, low, close的DataFrame
        formulas:因子表达式列表
        names:因子名称列表
        """
        self.minute_data = minute_data.copy()
        self.factors_data = None
        self.factor_formulas = formulas
        self.factor_names = names
        self._preprocess_data()
        
    
    def _preprocess_data(self):
        """数据预处理"""
        df = self.minute_data
        
        # 确保数据按时间和股票代码排序
        if 'DateTime' in df.columns and 'code' in df.columns:
            df = df.sort_values(['code', 'DateTime']).reset_index(drop=True)
        elif 'DateTime' in df.columns:
            df = df.sort_values('DateTime').reset_index(drop=True)
            
        # 处理价格数据中的异常值
        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if col in df.columns:
                # 将0或负值替换为NaN
                df[col] = df[col].replace(0, np.nan)
                df[col] = df[col].apply(lambda x: x if x > 0 else np.nan)
        
        self.minute_data = df
        
        
    def calculate_factors(self):
        """
        计算指定的因子
        factor_formulas:因子公式列表
        """
        factor_formulas = self.factor_formulas.copy()
        factor_names = self.factor_names.copy()
        df = self.minute_data.copy()
        print(f"一共计算 {len(factor_formulas)} 个因子")
        
        # 计算每个因子
        for i, formula in enumerate(factor_formulas):
            try:
                factor_name = factor_names[i]
                df[factor_name] = self._calculate_single_factor(df, formula)
                print(f"因子 {factor_name} 计算完毕, \t其表达式为:{formula}")
            except Exception as e:
                print(f"因子 {factor_name} 计算失败,错误: {e}")
                df[factor_names[i]] = np.nan
        
        self.factors_data = df
        return df
    
    def _calculate_single_factor(self, df, formula):
        """
        计算单个因子
        df:分钟级数据表
        formula:因子公式
        """
        # 替换公式中的函数名
        formula = formula.replace('greater', 'np.maximum')
        formula = formula.replace('less', 'np.minimum')  # 修复拼写错误
        
        # 定义shift函数
        def shift(series, n):
            """实现shift功能"""
            return series.shift(n)
        
        # 准备计算环境
        local_vars = {
            'np': np,
            'pd': pd,
            'shift': shift,
            'open': df['open'],
            'high': df['high'], 
            'low': df['low'],
            'close': df['close']
        }
        
        # 安全地计算表达式
        try:
            result = eval(formula, {'__builtins__': {}}, local_vars)
            return result
        except Exception as e:
            print(f"计算表达式失败: {formula}, 错误: {e}")
            return np.nan

In [75]:
# 创建因子计算表达式与因子名称
formulas = ["(close-open)/open",
            "(high-low)/open", 
            "(close-open)/(high-low+1e-12)",
            "(high-np.maximum(open, close))/open",
            "(high-np.maximum(open, close))/(high-low+1e-12)",
            "(np.minimum(open, close)-low)/open",
            "(np.minimum(open, close)-low)/(high-low+1e-12)",
            "(2*close-high-low)/open",
            "(2*close-high-low)/(high-low+1e-12)"]
names = ["KMID",
         "KLN",
         "KMID2",
         "KUP",
         "KUP2",
         "KLOW",
         "KLOW2",
         "KSFT",
         "KSFT2"]

feature = ['open', 'high', 'low', 'close']
windows = range(5)
for field in feature:
    field = field.lower()
    formulas += ["shift(%s, %d)/close" % (field, d) if d != 0 else "%s/close" % field for d in windows]
    names += [field.upper() + str(d) for d in windows]

In [76]:


# 当前测试数据没有volum，所以下面的先不进行测试，后续再说
# formulas += ["shift(volume, %d)/(volume+1e-12)" % d if d != 0 else "volume/(volume+1e-12)" for d in windows]
# names += ["VOLUME" + str(d) for d in windows]

# windows = [5, 10, 20, 30, 60]
# formulas += ["shift(close, %d)/close" % d for d in windows]
# names += ["ROC%d" % d for d in windows]

# formulas += ["mean(close, %d)/close" % d for d in windows]
# names += ["MA%d" % d for d in windows]

# formulas += ["std(close, %d)/close" % d for d in windows]
# names += ["STD%d" % d for d in windows]

# formulas += ["max(high, %d)/close" % d for d in windows]
# names += ["MAX%d" % d for d in windows]

# formulas += ["min(low, %d)/close" % d for d in windows]
# names += ["MIN%d" % d for d in windows]

# formulas += ["quantile(close, %d, 0.8)/close" % d for d in windows]
# names += ["QTLU%d" % d for d in windows]

# formulas += ["quantile(close, %d, 0.2)/close" % d for d in windows]
# names += ["QTLD%d" % d for d in windows]

# formulas += ["(close-min(low, %d))/(max(high, %d)-min(low, %d)+1e-12)" % (d, d, d) for d in windows]
# names += ["RSV%d" % d for d in windows]

# formulas += ["idxmax(high, %d)/%d" % (d, d) for d in windows]
# names += ["IMAX%d" % d for d in windows]

# formulas += ["idxmin(low, %d)/%d" % (d, d) for d in windows]
# names += ["IMIN%d" % d for d in windows]

# formulas += ["(idxmax(high, %d)-idxmin(low, %d))/%d" % (d, d, d) for d in windows]
# names += ["IMXD%d" % d for d in windows]

# formulas += ["corr(close, log(volume+1), %d)" % d for d in windows]
# names += ["CORR%d" % d for d in windows]

# formulas += ["corr(close/shift(close,1), log(volume/shift(volume, 1)+1), %d)" % d for d in windows]
# names += ["CORD%d" % d for d in windows]

# formulas += ["mean(close>shift(close, 1), %d)" % d for d in windows]
# names += ["CNTP%d" % d for d in windows]

# formulas += ["mean(close<shift(close, 1), %d)" % d for d in windows]
# names += ["CNTN%d" % d for d in windows]

# formulas += ["mean(close>shift(close, 1), %d)-mean(close<shift(close, 1), %d)" % (d, d) for d in windows]
# names += ["CNTD%d" % d for d in windows]

# formulas += [
#     "sum(greater(close-shift(close, 1), 0), %d)/(sum(Abs(close-shift(close, 1)), %d)+1e-12)" % (d, d)
#     for d in windows
# ]
# names += ["SUMP%d" % d for d in windows]

# formulas += [
#     "sum(greater(shift(close, 1)-close, 0), %d)/(sum(Abs(close-shift(close, 1)), %d)+1e-12)" % (d, d)
#     for d in windows
# ]
# names += ["SUMN%d" % d for d in windows]

# formulas += [
#     "(sum(greater(close-shift(close, 1), 0), %d)-sum(greater(shift(close, 1)-close, 0), %d))"
#     "/(sum(Abs(close-shift(close, 1)), %d)+1e-12)" % (d, d, d)
#     for d in windows
# ]
# names += ["SUMD%d" % d for d in windows]

# formulas += ["mean(volume, %d)/(volume+1e-12)" % d for d in windows]
# names += ["VMA%d" % d for d in windows]

# formulas += ["std(volume, %d)/(volume+1e-12)" % d for d in windows]
# names += ["VSTD%d" % d for d in windows]

# formulas += [
#     "std(Abs(close/shift(close, 1)-1)*volume, %d)/(mean(Abs(close/shift(close, 1)-1)*volume, %d)+1e-12)"
#     % (d, d)
#     for d in windows
# ]
# names += ["WVMA%d" % d for d in windows]

# formulas += [
#     "sum(greater(volume-shift(volume, 1), 0), %d)/(sum(Abs(volume-shift(volume, 1)), %d)+1e-12)"
#     % (d, d)
#     for d in windows
# ]
# names += ["VSUMP%d" % d for d in windows]

# formulas += [
#     "sum(greater(shift(volume, 1)-volume, 0), %d)/(sum(Abs(volume-shift(volume, 1)), %d)+1e-12)"
#     % (d, d)
#     for d in windows
# ]
# names += ["VSUMN%d" % d for d in windows]

# formulas += [
#     "(sum(greater(volume-shift(volume, 1), 0), %d)-sum(greater(shift(volume, 1)-volume, 0), %d))"
#     "/(sum(Abs(volume-shift(volume, 1)), %d)+1e-12)" % (d, d, d)
#     for d in windows
# ]
# names += ["VSUMD%d" % d for d in windows]

In [79]:
calculator = MinuteFactorCalculator(df1, formulas, names)
facotrs_result = calculator.calculate_factors()

一共计算 29 个因子
因子 KMID 计算完毕, 	其表达式为:(close-open)/open
因子 KLN 计算完毕, 	其表达式为:(high-low)/open
因子 KMID2 计算完毕, 	其表达式为:(close-open)/(high-low+1e-12)
因子 KUP 计算完毕, 	其表达式为:(high-np.maximum(open, close))/open
因子 KUP2 计算完毕, 	其表达式为:(high-np.maximum(open, close))/(high-low+1e-12)
因子 KLOW 计算完毕, 	其表达式为:(np.minimum(open, close)-low)/open
因子 KLOW2 计算完毕, 	其表达式为:(np.minimum(open, close)-low)/(high-low+1e-12)
因子 KSFT 计算完毕, 	其表达式为:(2*close-high-low)/open
因子 KSFT2 计算完毕, 	其表达式为:(2*close-high-low)/(high-low+1e-12)
因子 OPEN0 计算完毕, 	其表达式为:open/close
因子 OPEN1 计算完毕, 	其表达式为:shift(open, 1)/close
因子 OPEN2 计算完毕, 	其表达式为:shift(open, 2)/close
因子 OPEN3 计算完毕, 	其表达式为:shift(open, 3)/close
因子 OPEN4 计算完毕, 	其表达式为:shift(open, 4)/close
因子 HIGH0 计算完毕, 	其表达式为:high/close
因子 HIGH1 计算完毕, 	其表达式为:shift(high, 1)/close
因子 HIGH2 计算完毕, 	其表达式为:shift(high, 2)/close
因子 HIGH3 计算完毕, 	其表达式为:shift(high, 3)/close
因子 HIGH4 计算完毕, 	其表达式为:shift(high, 4)/close
因子 LOW0 计算完毕, 	其表达式为:low/close
因子 LOW1 计算完毕, 	其表达式为:shift(low, 1)/close
因子 LOW2 计算完毕, 	其表达式为:shift

In [80]:
facotrs_result

Unnamed: 0,code,DateTime,open,high,low,close,BidPrice01,BidPrice02,BidPrice03,BidPrice04,...,LOW0,LOW1,LOW2,LOW3,LOW4,CLOSE0,CLOSE1,CLOSE2,CLOSE3,CLOSE4
0,000001.sz,20251020 100000,11.31,11.31,11.30,11.31,11.30,11.29,11.28,11.27,...,0.999116,,,,,1.0,,,,
1,000001.sz,20251020 100100,11.30,11.31,11.29,11.29,11.29,11.28,11.27,11.26,...,1.000000,1.000886,,,,1.0,1.001772,,,
2,000001.sz,20251020 100200,11.29,11.29,11.28,11.29,11.28,11.27,11.26,11.25,...,0.999114,1.000000,1.000886,,,1.0,1.000000,1.001772,,
3,000001.sz,20251020 100300,11.29,11.30,11.28,11.30,11.29,11.28,11.27,11.26,...,0.998230,0.998230,0.999115,1.000000,,1.0,0.999115,0.999115,1.000885,
4,000001.sz,20251020 100400,11.29,11.30,11.28,11.30,11.29,11.28,11.27,11.26,...,0.998230,0.998230,0.998230,0.999115,1.000000,1.0,1.000000,0.999115,0.999115,1.000885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,000001.sz,20251020 95500,11.31,11.32,11.30,11.32,11.31,11.30,11.29,11.28,...,0.998233,0.998233,1.000000,1.000000,1.000000,1.0,0.999117,1.000000,1.000883,1.000883
238,000001.sz,20251020 95600,11.32,11.32,11.31,11.31,11.30,11.29,11.28,11.27,...,1.000000,0.999116,0.999116,1.000884,1.000884,1.0,1.000884,1.000000,1.000884,1.001768
239,000001.sz,20251020 95700,11.31,11.31,11.29,11.29,11.29,11.28,11.27,11.26,...,1.000000,1.001772,1.000886,1.000886,1.002657,1.0,1.001772,1.002657,1.001772,1.002657
240,000001.sz,20251020 95800,11.29,11.31,11.29,11.31,11.30,11.29,11.28,11.27,...,0.998232,0.998232,1.000000,0.999116,0.999116,1.0,0.998232,1.000000,1.000884,1.000000


In [60]:
facotrs_result

Unnamed: 0,code,DateTime,open,high,low,close,BidPrice01,BidPrice02,BidPrice03,BidPrice04,...,LOW0,LOW1,LOW2,LOW3,LOW4,CLOSE0,CLOSE1,CLOSE2,CLOSE3,CLOSE4
0,000001.sz,20251020 100000,11.31,11.31,11.30,11.31,11.30,11.29,11.28,11.27,...,0.999116,,,,,1.0,,,,
1,000001.sz,20251020 100100,11.30,11.31,11.29,11.29,11.29,11.28,11.27,11.26,...,1.000000,1.000886,,,,1.0,1.001772,,,
2,000001.sz,20251020 100200,11.29,11.29,11.28,11.29,11.28,11.27,11.26,11.25,...,0.999114,1.000000,1.000886,,,1.0,1.000000,1.001772,,
3,000001.sz,20251020 100300,11.29,11.30,11.28,11.30,11.29,11.28,11.27,11.26,...,0.998230,0.998230,0.999115,1.000000,,1.0,0.999115,0.999115,1.000885,
4,000001.sz,20251020 100400,11.29,11.30,11.28,11.30,11.29,11.28,11.27,11.26,...,0.998230,0.998230,0.998230,0.999115,1.000000,1.0,1.000000,0.999115,0.999115,1.000885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,000001.sz,20251020 95500,11.31,11.32,11.30,11.32,11.31,11.30,11.29,11.28,...,0.998233,0.998233,1.000000,1.000000,1.000000,1.0,0.999117,1.000000,1.000883,1.000883
238,000001.sz,20251020 95600,11.32,11.32,11.31,11.31,11.30,11.29,11.28,11.27,...,1.000000,0.999116,0.999116,1.000884,1.000884,1.0,1.000884,1.000000,1.000884,1.001768
239,000001.sz,20251020 95700,11.31,11.31,11.29,11.29,11.29,11.28,11.27,11.26,...,1.000000,1.001772,1.000886,1.000886,1.002657,1.0,1.001772,1.002657,1.001772,1.002657
240,000001.sz,20251020 95800,11.29,11.31,11.29,11.31,11.30,11.29,11.28,11.27,...,0.998232,0.998232,1.000000,0.999116,0.999116,1.0,0.998232,1.000000,1.000884,1.000000


In [83]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [84]:
class FactorEffective():
    def __init__(self, df, target_col, factor_cols, date_col='DateTime',forward_period=1, test_size=0.2, random_state=42):
        """
        target_col : str
            目标变量列名
        factor_cols : list
            因子列名列表
        date_col : str
            日期时间列名
        forward_period : int
            预测未来期数（分钟）
        test_size : float
            测试集比例
        random_state : int
            随机种子
        """
        self.df = df.copy()
        self.target_col = target_col
        self.factor_cols = factor_cols
        self.date_col = date_col
        self.forward_period = forward_period
        self.test_size = test_size
        self.random_state = random_state
        self.model = None
        self.scaler = StandardScaler()
        self.results = {}
    
    def prepare_data(self):
        """数据预处理"""

/bin/sh: 1: nvidia-smi: not found
