In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
def remove_multicollinearity(dataframe, threshold=0.9):
    """
    Drop columns that are strongly correlated with a column to their left.

    The absolute correlation matrix of ``dataframe`` is computed with
    ``DataFrame.corr()`` and only the entries strictly above the diagonal are
    inspected. A column is dropped when its absolute correlation with at
    least one earlier column is greater than ``threshold``.

    Args:
        dataframe: Input DataFrame; it is not modified.
        threshold: Absolute-correlation cut-off (strictly greater than).

    Returns:
        A new DataFrame without the dropped columns.
    """
    abs_corr = dataframe.corr().abs()

    # Boolean mask selecting the strict upper triangle (k=1 skips the diagonal),
    # so every pair of columns is examined exactly once.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    redundant = []
    for name in pairwise.columns:
        # Masked-out cells are NaN, and NaN > threshold evaluates to False.
        if any(pairwise[name] > threshold):
            redundant.append(name)

    return dataframe.drop(columns=redundant, errors='ignore')

def process_symbol_data(symbol, input_dir, output_dir, 
                        raw_ta_filename,
                        target_shift=-1, 
                        train_ratio=0.7, test_ratio=0.15, 
                        correlation_threshold=0.001,
                        start_index=10000,
                        multicollinearity_threshold=0.9):
    """
    Build feature/target datasets for one symbol and save them as CSV files.

    Steps:
      1. Read ``raw_ta_filename`` from ``input_dir`` and index it by ``date``.
      2. Add a ``return`` column (``close.pct_change()``), skip the first
         ``start_index`` rows, drop rows whose ``return`` is NaN and then drop
         every column that still contains NaN.
      3. Define ``target`` as ``return`` shifted by ``target_shift`` rows and
         trim the rows for which the shift produced no value.
      4. Select features using the training slice only: remove collinear
         columns with ``remove_multicollinearity`` and keep the features whose
         absolute Pearson correlation with ``target`` is greater than
         ``correlation_threshold``.
      5. Split chronologically and write ``Xy_train_<symbol>.csv``,
         ``Xy_val_<symbol>.csv`` and ``Xy_test_<symbol>.csv`` into
         ``<output_dir>/<symbol>/``.

    Note on split naming: the "test" file holds the slice that immediately
    follows the training rows, and the "val" file holds the remaining
    trailing rows.

    Args:
        symbol: Name used for the output sub-directory and file names.
        input_dir: Directory containing ``raw_ta_filename``.
        output_dir: Directory under which ``<symbol>/`` is created.
        raw_ta_filename: CSV file name; must contain ``date`` and ``close`` columns.
        target_shift: Row shift applied to ``return`` to build ``target``.
        train_ratio: Fraction of rows in the training slice.
        test_ratio: Fraction of rows in the slice following the training slice.
        correlation_threshold: Minimum absolute feature/target correlation
            (strictly greater than) for a feature to be kept.
        start_index: Number of leading rows to skip.
        multicollinearity_threshold: Threshold forwarded to
            ``remove_multicollinearity``.

    Returns:
        None. Results are written to disk and a status line is printed.
    """
    file_path = os.path.join(input_dir, raw_ta_filename)

    # Load the raw data.
    df = pd.read_csv(file_path)
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df = df.set_index('date')

    # Compute the return, then skip the leading rows.
    df['return'] = df['close'].pct_change()
    df = df.iloc[start_index:]

    # Drop rows with an undefined return BEFORE dropping NaN columns. Otherwise
    # 'return' itself is removed as a NaN column (e.g. start_index=0, where
    # pct_change leaves the first row NaN) and the target lookup below raises
    # KeyError. When 'return' has no NaN this is a no-op.
    df = df.dropna(subset=['return'])
    nan_cols = df.columns[df.isna().any()]
    df = df.drop(columns=nan_cols)

    # Define the target and trim the rows where the shift left it undefined.
    df['target'] = df['return'].shift(target_shift)
    if target_shift < 0:
        df = df.iloc[:target_shift, :]
    else:
        df = df.iloc[target_shift:, :]

    # No NaN can remain at this point (NaN columns were dropped and the
    # shifted-in NaN targets were trimmed), so no imputation step is needed.
    feature_cols = [c for c in df.columns if c != 'target']

    # Work on a positional index; cast to float64 so the written CSVs keep the
    # same all-float formatting as a NumPy round trip of the mixed frame.
    features = df[feature_cols].astype(np.float64).reset_index(drop=True)
    target = df['target'].astype(np.float64).reset_index(drop=True)

    # Chronological split boundaries.
    n_rows = len(df)
    train_end = int(n_rows * train_ratio)
    test_end = train_end + int(n_rows * test_ratio)

    # Feature selection uses the training slice only.
    train_features = features.iloc[:train_end]
    train_target = target.iloc[:train_end]

    filtered = remove_multicollinearity(train_features, threshold=multicollinearity_threshold)
    filtered = filtered.assign(target=train_target)

    correlations = filtered.corr(method='pearson')['target'].drop('target')
    selected_features = correlations[correlations.abs() > correlation_threshold].index.tolist()

    # Apply the selected feature set to every row, then cut the three slices.
    dataset = features[selected_features].assign(target=target)
    splits = {
        'train': dataset.iloc[:train_end],
        'val': dataset.iloc[test_end:],
        'test': dataset.iloc[train_end:test_end],
    }

    # Create the output directory.
    symbol_dir = os.path.join(output_dir, symbol)
    os.makedirs(symbol_dir, exist_ok=True)

    for split_name, split_df in splits.items():
        split_path = os.path.join(symbol_dir, f"Xy_{split_name}_{symbol}.csv")
        split_df.to_csv(split_path, index=False)

    print(f"{symbol}: Processed files saved at {symbol_dir}")

def process_all_symbols(input_dir='.', output_dir='.', **kwargs):
    """
    Run ``process_symbol_data`` for every ``*_1m_ta.csv`` file in ``input_dir``.

    The symbol name is the file name with the trailing ``_1m_ta.csv`` removed.
    Files are processed in sorted order so that repeated runs visit symbols in
    the same sequence.

    Args:
        input_dir: Directory scanned for input files.
        output_dir: Directory passed through to ``process_symbol_data``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``process_symbol_data``.

    Returns:
        None.
    """
    suffix = '_1m_ta.csv'

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    ta_files = sorted(f for f in os.listdir(input_dir) if f.endswith(suffix))

    for ta_file in tqdm(ta_files, desc="Processing all symbols"):
        # Strip only the trailing suffix; splitting on it would cut at the
        # first occurrence if the marker also appeared earlier in the name.
        symbol = ta_file[:-len(suffix)]
        process_symbol_data(symbol, input_dir=input_dir, output_dir=output_dir, raw_ta_filename=ta_file, **kwargs)

In [None]:
# Process every *_1m_ta.csv file found in 'data' and write the per-symbol
# train/val/test CSVs under 'processed'.
process_all_symbols(
    input_dir='data',
    output_dir='processed',
    start_index=10000,
    correlation_threshold=0.001,
)


Processing all symbols:   0%|          | 0/106 [00:00<?, ?it/s]

Processing all symbols:   1%|          | 1/106 [00:27<47:52, 27.36s/it]

1000SATSUSDT: Processed files saved at processed\1000SATSUSDT


Processing all symbols:   2%|▏         | 2/106 [00:55<47:58, 27.68s/it]

AAVEUSDT: Processed files saved at processed\AAVEUSDT


Processing all symbols:   3%|▎         | 3/106 [01:22<46:55, 27.34s/it]

ACEUSDT: Processed files saved at processed\ACEUSDT


Processing all symbols:   4%|▍         | 4/106 [01:46<44:44, 26.32s/it]

ADAUSDT: Processed files saved at processed\ADAUSDT


Processing all symbols:   5%|▍         | 5/106 [02:10<42:20, 25.15s/it]

AEVOUSDT: Processed files saved at processed\AEVOUSDT


Processing all symbols:   6%|▌         | 6/106 [02:24<35:44, 21.45s/it]

AGIXUSDT: Processed files saved at processed\AGIXUSDT


Processing all symbols:   7%|▋         | 7/106 [02:53<39:49, 24.14s/it]

AIUSDT: Processed files saved at processed\AIUSDT


Processing all symbols:   8%|▊         | 8/106 [03:19<40:09, 24.59s/it]

ALTUSDT: Processed files saved at processed\ALTUSDT


Processing all symbols:   8%|▊         | 9/106 [03:47<41:29, 25.67s/it]

APEUSDT: Processed files saved at processed\APEUSDT


Processing all symbols:   9%|▉         | 10/106 [04:16<42:44, 26.72s/it]

API3USDT: Processed files saved at processed\API3USDT


Processing all symbols:  10%|█         | 11/106 [04:48<44:45, 28.27s/it]

APTUSDT: Processed files saved at processed\APTUSDT


Processing all symbols:  11%|█▏        | 12/106 [05:14<43:25, 27.72s/it]

ARBUSDT: Processed files saved at processed\ARBUSDT


Processing all symbols:  12%|█▏        | 13/106 [05:40<42:06, 27.17s/it]

ATOMUSDT: Processed files saved at processed\ATOMUSDT


Processing all symbols:  13%|█▎        | 14/106 [06:07<41:36, 27.14s/it]

AVAXUSDT: Processed files saved at processed\AVAXUSDT


Processing all symbols:  14%|█▍        | 15/106 [06:37<42:27, 27.99s/it]

BAKEUSDT: Processed files saved at processed\BAKEUSDT


Processing all symbols:  15%|█▌        | 16/106 [06:56<37:40, 25.12s/it]

BBUSDT: Processed files saved at processed\BBUSDT


Processing all symbols:  16%|█▌        | 17/106 [07:26<39:26, 26.59s/it]

BCHUSDT: Processed files saved at processed\BCHUSDT


Processing all symbols:  17%|█▋        | 18/106 [07:53<39:09, 26.70s/it]

BELUSDT: Processed files saved at processed\BELUSDT


Processing all symbols:  18%|█▊        | 19/106 [08:18<38:05, 26.27s/it]

BLURUSDT: Processed files saved at processed\BLURUSDT


Processing all symbols:  19%|█▉        | 20/106 [08:45<38:07, 26.59s/it]

BNBUSDT: Processed files saved at processed\BNBUSDT


Processing all symbols:  20%|█▉        | 21/106 [09:10<36:56, 26.07s/it]

BNXUSDT: Processed files saved at processed\BNXUSDT


Processing all symbols:  21%|██        | 22/106 [09:36<36:12, 25.87s/it]

BOMEUSDT: Processed files saved at processed\BOMEUSDT


Processing all symbols:  22%|██▏       | 23/106 [09:52<31:39, 22.89s/it]

BONDUSDT: Processed files saved at processed\BONDUSDT


Processing all symbols:  23%|██▎       | 24/106 [10:20<33:42, 24.66s/it]

BTCUSDT: Processed files saved at processed\BTCUSDT


Processing all symbols:  24%|██▎       | 25/106 [10:51<35:38, 26.40s/it]

CFXUSDT: Processed files saved at processed\CFXUSDT


Processing all symbols:  25%|██▍       | 26/106 [11:21<36:33, 27.42s/it]

CHZUSDT: Processed files saved at processed\CHZUSDT


Processing all symbols:  25%|██▌       | 27/106 [11:48<35:59, 27.33s/it]

CKBUSDT: Processed files saved at processed\CKBUSDT


Processing all symbols:  26%|██▋       | 28/106 [12:16<36:04, 27.75s/it]

COTIUSDT: Processed files saved at processed\COTIUSDT


Processing all symbols:  27%|██▋       | 29/106 [12:42<34:57, 27.23s/it]

CRVUSDT: Processed files saved at processed\CRVUSDT


Processing all symbols:  28%|██▊       | 30/106 [13:11<34:51, 27.52s/it]

CTSIUSDT: Processed files saved at processed\CTSIUSDT


Processing all symbols:  29%|██▉       | 31/106 [13:42<35:54, 28.73s/it]

DIAUSDT: Processed files saved at processed\DIAUSDT


Processing all symbols:  30%|███       | 32/106 [14:09<34:39, 28.10s/it]

DOGEUSDT: Processed files saved at processed\DOGEUSDT


Processing all symbols:  31%|███       | 33/106 [14:19<27:29, 22.60s/it]

DOGSUSDT: Processed files saved at processed\DOGSUSDT


Processing all symbols:  32%|███▏      | 34/106 [14:48<29:33, 24.63s/it]

DOTUSDT: Processed files saved at processed\DOTUSDT


Processing all symbols:  33%|███▎      | 35/106 [15:15<30:02, 25.39s/it]

DYDXUSDT: Processed files saved at processed\DYDXUSDT


Processing all symbols:  34%|███▍      | 36/106 [15:38<28:50, 24.72s/it]

DYMUSDT: Processed files saved at processed\DYMUSDT


Processing all symbols:  35%|███▍      | 37/106 [16:05<29:15, 25.44s/it]

EDUUSDT: Processed files saved at processed\EDUUSDT


Processing all symbols:  36%|███▌      | 38/106 [16:27<27:40, 24.42s/it]

ENAUSDT: Processed files saved at processed\ENAUSDT


Processing all symbols:  37%|███▋      | 39/106 [16:56<28:46, 25.76s/it]

ENSUSDT: Processed files saved at processed\ENSUSDT


Processing all symbols:  38%|███▊      | 40/106 [17:27<29:51, 27.14s/it]

ETCUSDT: Processed files saved at processed\ETCUSDT


Processing all symbols:  39%|███▊      | 41/106 [17:50<28:01, 25.86s/it]

ETHFIUSDT: Processed files saved at processed\ETHFIUSDT


Processing all symbols:  40%|███▉      | 42/106 [18:19<28:42, 26.91s/it]

ETHUSDT: Processed files saved at processed\ETHUSDT


Processing all symbols:  41%|████      | 43/106 [18:49<29:12, 27.82s/it]

FETUSDT: Processed files saved at processed\FETUSDT


Processing all symbols:  42%|████▏     | 44/106 [19:20<29:49, 28.87s/it]

FILUSDT: Processed files saved at processed\FILUSDT


Processing all symbols:  42%|████▏     | 45/106 [19:38<25:58, 25.55s/it]

FRONTUSDT: Processed files saved at processed\FRONTUSDT


Processing all symbols:  43%|████▎     | 46/106 [20:08<26:57, 26.96s/it]

FTMUSDT: Processed files saved at processed\FTMUSDT


Processing all symbols:  44%|████▍     | 47/106 [20:38<27:24, 27.87s/it]

GALAUSDT: Processed files saved at processed\GALAUSDT


Processing all symbols:  45%|████▌     | 48/106 [21:05<26:44, 27.66s/it]

GLMUSDT: Processed files saved at processed\GLMUSDT


Processing all symbols:  46%|████▌     | 49/106 [21:34<26:28, 27.87s/it]

GMTUSDT: Processed files saved at processed\GMTUSDT


Processing all symbols:  47%|████▋     | 50/106 [22:02<26:11, 28.07s/it]

HBARUSDT: Processed files saved at processed\HBARUSDT


Processing all symbols:  48%|████▊     | 51/106 [22:32<26:18, 28.70s/it]

HIGHUSDT: Processed files saved at processed\HIGHUSDT


Processing all symbols:  49%|████▉     | 52/106 [22:39<19:45, 21.95s/it]

HMSTRUSDT: Processed files saved at processed\HMSTRUSDT


Processing all symbols:  50%|█████     | 53/106 [23:08<21:27, 24.29s/it]

ICPUSDT: Processed files saved at processed\ICPUSDT


Processing all symbols:  51%|█████     | 54/106 [23:39<22:34, 26.04s/it]

IDUSDT: Processed files saved at processed\IDUSDT


Processing all symbols:  52%|█████▏    | 55/106 [24:09<23:17, 27.41s/it]

INJUSDT: Processed files saved at processed\INJUSDT


Processing all symbols:  53%|█████▎    | 56/106 [24:25<19:55, 23.91s/it]

IOUSDT: Processed files saved at processed\IOUSDT


Processing all symbols:  54%|█████▍    | 57/106 [24:56<21:18, 26.10s/it]

JASMYUSDT: Processed files saved at processed\JASMYUSDT


Processing all symbols:  55%|█████▍    | 58/106 [25:25<21:27, 26.83s/it]

JTOUSDT: Processed files saved at processed\JTOUSDT


Processing all symbols:  56%|█████▌    | 59/106 [25:51<21:00, 26.81s/it]

JUPUSDT: Processed files saved at processed\JUPUSDT


Processing all symbols:  57%|█████▋    | 60/106 [26:19<20:43, 27.02s/it]

LDOUSDT: Processed files saved at processed\LDOUSDT


Processing all symbols:  58%|█████▊    | 61/106 [26:46<20:20, 27.13s/it]

LINKUSDT: Processed files saved at processed\LINKUSDT


Processing all symbols:  58%|█████▊    | 62/106 [27:02<17:19, 23.62s/it]

LISTAUSDT: Processed files saved at processed\LISTAUSDT


Processing all symbols:  59%|█████▉    | 63/106 [27:30<17:48, 24.86s/it]

LPTUSDT: Processed files saved at processed\LPTUSDT


Processing all symbols:  60%|██████    | 64/106 [28:00<18:37, 26.60s/it]

LTCUSDT: Processed files saved at processed\LTCUSDT


Processing all symbols:  61%|██████▏   | 65/106 [28:28<18:26, 26.98s/it]

MANTAUSDT: Processed files saved at processed\MANTAUSDT


Processing all symbols:  62%|██████▏   | 66/106 [28:55<18:04, 27.12s/it]

MASKUSDT: Processed files saved at processed\MASKUSDT


Processing all symbols:  63%|██████▎   | 67/106 [29:17<16:31, 25.43s/it]

MATICUSDT: Processed files saved at processed\MATICUSDT


Processing all symbols:  64%|██████▍   | 68/106 [29:44<16:28, 26.01s/it]

NEARUSDT: Processed files saved at processed\NEARUSDT


Processing all symbols:  65%|██████▌   | 69/106 [29:52<12:38, 20.50s/it]

NEIROUSDT: Processed files saved at processed\NEIROUSDT


Processing all symbols:  66%|██████▌   | 70/106 [30:21<13:50, 23.06s/it]

NEOUSDT: Processed files saved at processed\NEOUSDT


Processing all symbols:  67%|██████▋   | 71/106 [30:40<12:46, 21.91s/it]

NOTUSDT: Processed files saved at processed\NOTUSDT


Processing all symbols:  68%|██████▊   | 72/106 [31:07<13:16, 23.42s/it]

ONGUSDT: Processed files saved at processed\ONGUSDT


Processing all symbols:  69%|██████▉   | 73/106 [31:38<14:04, 25.60s/it]

ONTUSDT: Processed files saved at processed\ONTUSDT


Processing all symbols:  70%|██████▉   | 74/106 [32:08<14:23, 26.98s/it]

OPUSDT: Processed files saved at processed\OPUSDT


Processing all symbols:  71%|███████   | 75/106 [32:37<14:13, 27.53s/it]

ORDIUSDT: Processed files saved at processed\ORDIUSDT


Processing all symbols:  72%|███████▏  | 76/106 [33:05<13:51, 27.72s/it]

PENDLEUSDT: Processed files saved at processed\PENDLEUSDT


Processing all symbols:  73%|███████▎  | 77/106 [33:31<13:10, 27.25s/it]

PEOPLEUSDT: Processed files saved at processed\PEOPLEUSDT


Processing all symbols:  74%|███████▎  | 78/106 [34:00<12:52, 27.58s/it]

POLYXUSDT: Processed files saved at processed\POLYXUSDT


Processing all symbols:  75%|███████▍  | 79/106 [34:26<12:19, 27.38s/it]

POWRUSDT: Processed files saved at processed\POWRUSDT


Processing all symbols:  75%|███████▌  | 80/106 [34:51<11:30, 26.55s/it]

PYTHUSDT: Processed files saved at processed\PYTHUSDT


Processing all symbols:  76%|███████▋  | 81/106 [35:18<11:08, 26.75s/it]

RAREUSDT: Processed files saved at processed\RAREUSDT


Processing all symbols:  77%|███████▋  | 82/106 [35:36<09:39, 24.14s/it]

REEFUSDT: Processed files saved at processed\REEFUSDT


Processing all symbols:  78%|███████▊  | 83/106 [35:56<08:42, 22.73s/it]

SAGAUSDT: Processed files saved at processed\SAGAUSDT


Processing all symbols:  79%|███████▉  | 84/106 [36:25<09:02, 24.64s/it]

SEIUSDT: Processed files saved at processed\SEIUSDT


Processing all symbols:  80%|████████  | 85/106 [36:54<09:03, 25.89s/it]

SOLUSDT: Processed files saved at processed\SOLUSDT


Processing all symbols:  81%|████████  | 86/106 [37:21<08:46, 26.34s/it]

STMXUSDT: Processed files saved at processed\STMXUSDT


Processing all symbols:  82%|████████▏ | 87/106 [37:45<08:07, 25.67s/it]

STXUSDT: Processed files saved at processed\STXUSDT


Processing all symbols:  83%|████████▎ | 88/106 [38:08<07:28, 24.93s/it]

SUIUSDT: Processed files saved at processed\SUIUSDT


Processing all symbols:  84%|████████▍ | 89/106 [38:33<07:04, 24.98s/it]

SUNUSDT: Processed files saved at processed\SUNUSDT


Processing all symbols:  85%|████████▍ | 90/106 [38:51<06:03, 22.74s/it]

TAOUSDT: Processed files saved at processed\TAOUSDT


Processing all symbols:  86%|████████▌ | 91/106 [39:13<05:39, 22.65s/it]

TIAUSDT: Processed files saved at processed\TIAUSDT


Processing all symbols:  87%|████████▋ | 92/106 [39:23<04:21, 18.71s/it]

TONUSDT: Processed files saved at processed\TONUSDT


Processing all symbols:  88%|████████▊ | 93/106 [39:48<04:27, 20.54s/it]

TRBUSDT: Processed files saved at processed\TRBUSDT
