In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
def remove_multicollinearity(dataframe, threshold=0.9):
    """
    Drop features that are strongly correlated with an earlier column.

    The absolute correlation matrix of ``dataframe`` is computed and only the
    strictly-upper triangle is inspected, so every pair of columns is looked
    at once and it is always the later column of a correlated pair that is
    removed.

    Args:
        dataframe: DataFrame whose columns are the candidate features.
        threshold: A column is dropped when its absolute correlation with any
            preceding column is strictly greater than this value.

    Returns:
        A new DataFrame without the redundant columns; the input is not
        modified.
    """
    abs_corr = dataframe.corr().abs()

    # True above the main diagonal only: hides self-correlations and the
    # mirrored lower half of the symmetric matrix.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)

    # Masked cells are NaN and NaN > threshold is False, so they never count.
    exceeds = (pairwise > threshold).any(axis=0)
    redundant = [name for name, hit in exceeds.items() if hit]

    return dataframe.drop(columns=redundant, errors='ignore')

def process_symbol_data(symbol, input_dir, output_dir,
                        raw_ta_filename,
                        target_shift=-1,
                        train_ratio=0.7, test_ratio=0.15,
                        correlation_threshold=0.001,
                        start_index=10000):
    """
    Build the train/test/val feature files for one trading pair and save them.

    Steps:
      1. Read ``input_dir/raw_ta_filename``, drop an ``Unnamed: 0`` column if
         present and use ``date`` as the index.
      2. Add ``return = close.pct_change()``, skip the first ``start_index``
         rows and drop every column that still contains NaN.
      3. Define ``target`` as ``return`` shifted by ``target_shift`` rows and
         trim the rows for which the shift leaves no target.
      4. Split the rows in file order (no shuffling): the first
         ``train_ratio`` share is "train", the next ``test_ratio`` share is
         "test" and the remainder is "val".
      5. Select features on the training slice only: remove collinear columns
         with ``remove_multicollinearity`` (threshold 0.9), then keep the
         columns whose absolute Pearson correlation with ``target`` is
         greater than ``correlation_threshold``.
      6. Write ``Xy_train_<symbol>.csv``, ``Xy_val_<symbol>.csv`` and
         ``Xy_test_<symbol>.csv`` (selected features plus ``target``, no
         index column) into ``output_dir/<symbol>``.

    Args:
        symbol: Name used for the output sub-directory and file names.
        input_dir: Directory containing the raw CSV file.
        output_dir: Directory under which ``<symbol>/`` is created.
        raw_ta_filename: File name of the raw CSV inside ``input_dir``.
        target_shift: Row offset passed to ``Series.shift`` to build the
            target; -1 means the next row's return.
        train_ratio: Share of rows assigned to the training slice.
        test_ratio: Share of rows assigned to the slice that directly
            follows the training slice.
        correlation_threshold: Minimum absolute correlation with the target
            (exclusive) for a feature to be kept.
        start_index: Number of leading rows to skip.

    Returns:
        None. The result is written to disk and a status line is printed.

    Raises:
        KeyError: If the CSV has no ``date`` or ``close`` column.
    """
    file_path = os.path.join(input_dir, raw_ta_filename)

    # Load the raw data
    df = pd.read_csv(file_path)
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df = df.set_index('date')

    # Compute the return and skip the leading rows
    df['return'] = df['close'].pct_change()
    df = df.iloc[start_index:]

    # The first pct_change() value is always NaN. Remove rows without a
    # return *before* the NaN-column filter; otherwise (e.g. start_index=0)
    # the 'return' column itself is dropped and the target below cannot be
    # built. With no NaN in 'return' this is a no-op.
    df = df[df['return'].notna()]
    nan_cols = df.columns[df.isna().any()]
    df = df.drop(columns=nan_cols)

    # Define the target and trim the rows the shift left without one
    df['target'] = df['return'].shift(target_shift)
    if target_shift < 0:
        df = df.iloc[:target_shift, :]
    else:
        df = df.iloc[target_shift:, :]

    excluded = {'Unnamed: 0', 'date', 'target'}
    feature_cols = [c for c in df.columns if c not in excluded]

    X = df[feature_cols].values
    y = df['target'].values

    # Split boundaries, computed once and reused for selection and output
    n_rows = len(df)
    train_end = int(n_rows * train_ratio)
    test_end = train_end + int(n_rows * test_ratio)

    # Defensive imputation. Fill each feature with its own mean taken from
    # the training rows only: a single global mean would mix unrelated
    # feature scales and leak information from the later slices.
    x_missing = np.isnan(X)
    if x_missing.any():
        train_means = np.nanmean(X[:train_end], axis=0)
        # nan_to_num covers columns that have no valid training value at all
        X = np.nan_to_num(np.where(x_missing, train_means, X))
    if np.isnan(y).any():
        y_fill = np.nanmean(y[:train_end])
        y = np.nan_to_num(y, nan=0.0 if np.isnan(y_fill) else y_fill)

    # Feature selection uses the training slice only
    train_frame = pd.DataFrame(X[:train_end], columns=feature_cols).assign(target=y[:train_end])
    train_filtered = remove_multicollinearity(train_frame, threshold=0.9)

    # 'target' is the last column, so the collinearity filter removes it when
    # a feature is almost identical to it; put it back for the ranking step.
    if 'target' not in train_filtered.columns:
        train_filtered['target'] = train_frame['target']
    correlations = train_filtered.corr(method='pearson')['target'].drop('target')
    selected_features = correlations[correlations.abs() > correlation_threshold].index.tolist()

    # Apply the selection to every row, then cut the three slices
    selected_df = pd.DataFrame(X, columns=feature_cols)[selected_features].assign(target=y)
    partitions = {
        'train': selected_df.iloc[:train_end],
        'val': selected_df.iloc[test_end:],
        'test': selected_df.iloc[train_end:test_end],
    }

    # Create the output directory
    symbol_dir = os.path.join(output_dir, symbol)
    os.makedirs(symbol_dir, exist_ok=True)

    for split_name, part in partitions.items():
        part.to_csv(os.path.join(symbol_dir, f"Xy_{split_name}_{symbol}.csv"), index=False)

    print(f"{symbol}: Processed files saved at {symbol_dir}")

def process_all_symbols(input_dir='.', output_dir='.', **kwargs):
    """
    Run ``process_symbol_data`` for every ``*_1m_ta.csv`` file in ``input_dir``.

    The symbol name is the part of the file name that precedes
    ``_1m_ta.csv``. Any extra keyword arguments are forwarded unchanged to
    ``process_symbol_data``.
    """
    suffix = '_1m_ta.csv'

    # Collect the matching file names first so tqdm knows the total count
    matching = []
    for entry in os.listdir(input_dir):
        if entry.endswith(suffix):
            matching.append(entry)

    # Derive the symbol from each file name and process it
    for filename in tqdm(matching, desc="Processing all symbols"):
        process_symbol_data(
            filename.partition(suffix)[0],
            input_dir=input_dir,
            output_dir=output_dir,
            raw_ta_filename=filename,
            **kwargs,
        )

In [3]:
# Process every *_1m_ta.csv file found in data/ and write the splits under processed/
process_all_symbols(
    input_dir='data',
    output_dir='processed',
    start_index=10000,
    correlation_threshold=0.001,
)


Processing all symbols:   0%|          | 0/109 [00:00<?, ?it/s]

Processing all symbols:   1%|          | 1/109 [00:30<55:31, 30.85s/it]

1000SATSUSDT: Processed files saved at processed\1000SATSUSDT


Processing all symbols:   2%|▏         | 2/109 [00:38<30:40, 17.20s/it]

1MBABYDOGEUSDT: Processed files saved at processed\1MBABYDOGEUSDT
AAVEUSDT: Processed files saved at processed\AAVEUSDT


Processing all symbols:   4%|▎         | 4/109 [02:55<1:21:47, 46.74s/it]

ACEUSDT: Processed files saved at processed\ACEUSDT
ADAUSDT: Processed files saved at processed\ADAUSDT


Processing all symbols:   6%|▌         | 6/109 [05:03<1:30:23, 52.66s/it]

AEVOUSDT: Processed files saved at processed\AEVOUSDT


Processing all symbols:   6%|▋         | 7/109 [05:34<1:17:21, 45.51s/it]

AGIXUSDT: Processed files saved at processed\AGIXUSDT


Processing all symbols:   7%|▋         | 8/109 [05:59<1:05:34, 38.95s/it]

AIUSDT: Processed files saved at processed\AIUSDT


Processing all symbols:   8%|▊         | 9/109 [06:22<56:47, 34.08s/it]  

ALTUSDT: Processed files saved at processed\ALTUSDT


Processing all symbols:   9%|▉         | 10/109 [07:32<1:14:29, 45.15s/it]

APEUSDT: Processed files saved at processed\APEUSDT


Processing all symbols:  10%|█         | 11/109 [08:44<1:26:59, 53.26s/it]

API3USDT: Processed files saved at processed\API3USDT


Processing all symbols:  11%|█         | 12/109 [09:36<1:25:33, 52.92s/it]

APTUSDT: Processed files saved at processed\APTUSDT


Processing all symbols:  12%|█▏        | 13/109 [10:24<1:22:23, 51.49s/it]

ARBUSDT: Processed files saved at processed\ARBUSDT
ATOMUSDT: Processed files saved at processed\ATOMUSDT


Processing all symbols:  13%|█▎        | 14/109 [12:01<1:43:03, 65.08s/it]

AVAXUSDT: Processed files saved at processed\AVAXUSDT


Processing all symbols:  15%|█▍        | 16/109 [15:21<2:07:57, 82.55s/it]

BAKEUSDT: Processed files saved at processed\BAKEUSDT


Processing all symbols:  16%|█▌        | 17/109 [15:38<1:36:13, 62.75s/it]

BBUSDT: Processed files saved at processed\BBUSDT
BCHUSDT: Processed files saved at processed\BCHUSDT


Processing all symbols:  17%|█▋        | 19/109 [18:53<2:00:43, 80.48s/it]

BELUSDT: Processed files saved at processed\BELUSDT


Processing all symbols:  18%|█▊        | 20/109 [19:19<1:35:01, 64.06s/it]

BLURUSDT: Processed files saved at processed\BLURUSDT


Processing all symbols:  19%|█▉        | 21/109 [20:55<1:48:09, 73.75s/it]

BNBUSDT: Processed files saved at processed\BNBUSDT


Processing all symbols:  20%|██        | 22/109 [22:05<1:45:13, 72.57s/it]

BNXUSDT: Processed files saved at processed\BNXUSDT


Processing all symbols:  21%|██        | 23/109 [22:25<1:21:20, 56.76s/it]

BOMEUSDT: Processed files saved at processed\BOMEUSDT


Processing all symbols:  22%|██▏       | 24/109 [23:38<1:27:14, 61.58s/it]

BONDUSDT: Processed files saved at processed\BONDUSDT
BTCUSDT: Processed files saved at processed\BTCUSDT


Processing all symbols:  24%|██▍       | 26/109 [25:12<1:10:00, 50.61s/it]

CATIUSDT: Processed files saved at processed\CATIUSDT
CFXUSDT: Processed files saved at processed\CFXUSDT


Processing all symbols:  26%|██▌       | 28/109 [28:14<1:36:20, 71.37s/it]

CHZUSDT: Processed files saved at processed\CHZUSDT


Processing all symbols:  27%|██▋       | 29/109 [29:53<1:46:26, 79.83s/it]

CKBUSDT: Processed files saved at processed\CKBUSDT


Processing all symbols:  28%|██▊       | 30/109 [31:35<1:53:56, 86.54s/it]

COTIUSDT: Processed files saved at processed\COTIUSDT


Processing all symbols:  28%|██▊       | 31/109 [33:10<1:55:43, 89.02s/it]

CRVUSDT: Processed files saved at processed\CRVUSDT


Processing all symbols:  29%|██▉       | 32/109 [34:48<1:57:43, 91.73s/it]

CTSIUSDT: Processed files saved at processed\CTSIUSDT


Processing all symbols:  30%|███       | 33/109 [36:21<1:56:45, 92.17s/it]

DIAUSDT: Processed files saved at processed\DIAUSDT


Processing all symbols:  31%|███       | 34/109 [37:55<1:55:35, 92.48s/it]

DOGEUSDT: Processed files saved at processed\DOGEUSDT


Processing all symbols:  32%|███▏      | 35/109 [38:03<1:23:00, 67.31s/it]

DOGSUSDT: Processed files saved at processed\DOGSUSDT


Processing all symbols:  33%|███▎      | 36/109 [39:33<1:30:11, 74.13s/it]

DOTUSDT: Processed files saved at processed\DOTUSDT


Processing all symbols:  34%|███▍      | 37/109 [41:13<1:38:13, 81.86s/it]

DYDXUSDT: Processed files saved at processed\DYDXUSDT


Processing all symbols:  35%|███▍      | 38/109 [41:37<1:16:27, 64.61s/it]

DYMUSDT: Processed files saved at processed\DYMUSDT


Processing all symbols:  36%|███▌      | 39/109 [42:34<1:12:31, 62.16s/it]

EDUUSDT: Processed files saved at processed\EDUUSDT


Processing all symbols:  37%|███▋      | 40/109 [42:50<55:34, 48.33s/it]  

EIGENUSDT: Processed files saved at processed\EIGENUSDT


Processing all symbols:  38%|███▊      | 41/109 [43:39<55:01, 48.55s/it]

ENAUSDT: Processed files saved at processed\ENAUSDT


Processing all symbols:  39%|███▊      | 42/109 [45:50<1:21:43, 73.19s/it]

ENSUSDT: Processed files saved at processed\ENSUSDT


Processing all symbols:  39%|███▉      | 43/109 [48:55<1:57:26, 106.77s/it]

ETCUSDT: Processed files saved at processed\ETCUSDT


Processing all symbols:  40%|████      | 44/109 [49:35<1:33:59, 86.77s/it] 

ETHFIUSDT: Processed files saved at processed\ETHFIUSDT


Processing all symbols:  41%|████▏     | 45/109 [52:31<2:01:15, 113.69s/it]

ETHUSDT: Processed files saved at processed\ETHUSDT


Processing all symbols:  42%|████▏     | 46/109 [55:46<2:24:53, 138.00s/it]

FETUSDT: Processed files saved at processed\FETUSDT


Processing all symbols:  43%|████▎     | 47/109 [58:53<2:37:52, 152.78s/it]

FILUSDT: Processed files saved at processed\FILUSDT


Processing all symbols:  44%|████▍     | 48/109 [1:01:13<2:31:22, 148.90s/it]

FRONTUSDT: Processed files saved at processed\FRONTUSDT


Processing all symbols:  45%|████▍     | 49/109 [1:04:14<2:38:35, 158.59s/it]

FTMUSDT: Processed files saved at processed\FTMUSDT


Processing all symbols:  46%|████▌     | 50/109 [1:06:20<2:26:05, 148.57s/it]

GALAUSDT: Processed files saved at processed\GALAUSDT


Processing all symbols:  47%|████▋     | 51/109 [1:07:40<2:03:50, 128.11s/it]

GLMUSDT: Processed files saved at processed\GLMUSDT
GMTUSDT: Processed files saved at processed\GMTUSDT


Processing all symbols:  48%|████▊     | 52/109 [1:09:43<2:00:15, 126.58s/it]

HBARUSDT: Processed files saved at processed\HBARUSDT


Processing all symbols:  49%|████▊     | 53/109 [1:13:02<2:18:23, 148.28s/it]

HIGHUSDT: Processed files saved at processed\HIGHUSDT


Processing all symbols:  50%|█████     | 55/109 [1:15:47<1:37:25, 108.26s/it]

HMSTRUSDT: Processed files saved at processed\HMSTRUSDT
ICPUSDT: Processed files saved at processed\ICPUSDT


Processing all symbols:  52%|█████▏    | 57/109 [1:20:05<1:39:03, 114.29s/it]

IDUSDT: Processed files saved at processed\IDUSDT


Processing all symbols:  53%|█████▎    | 58/109 [1:22:51<1:50:21, 129.84s/it]

INJUSDT: Processed files saved at processed\INJUSDT


Processing all symbols:  54%|█████▍    | 59/109 [1:23:16<1:21:58, 98.36s/it] 

IOUSDT: Processed files saved at processed\IOUSDT


Processing all symbols:  55%|█████▌    | 60/109 [1:25:41<1:31:48, 112.42s/it]

JASMYUSDT: Processed files saved at processed\JASMYUSDT


Processing all symbols:  56%|█████▌    | 61/109 [1:26:29<1:14:26, 93.05s/it] 

JTOUSDT: Processed files saved at processed\JTOUSDT


Processing all symbols:  57%|█████▋    | 62/109 [1:27:16<1:01:59, 79.14s/it]

JUPUSDT: Processed files saved at processed\JUPUSDT


Processing all symbols:  58%|█████▊    | 63/109 [1:29:10<1:08:42, 89.61s/it]

LDOUSDT: Processed files saved at processed\LDOUSDT
LINKUSDT: Processed files saved at processed\LINKUSDT


Processing all symbols:  60%|█████▉    | 65/109 [1:32:47<1:07:28, 92.01s/it] 

LISTAUSDT: Processed files saved at processed\LISTAUSDT
LPTUSDT: Processed files saved at processed\LPTUSDT


Processing all symbols:  61%|██████    | 66/109 [1:35:29<1:20:55, 112.93s/it]

LTCUSDT: Processed files saved at processed\LTCUSDT


Processing all symbols:  62%|██████▏   | 68/109 [1:39:48<1:17:49, 113.89s/it]

MANTAUSDT: Processed files saved at processed\MANTAUSDT


Processing all symbols:  63%|██████▎   | 69/109 [1:42:44<1:28:26, 132.66s/it]

MASKUSDT: Processed files saved at processed\MASKUSDT


Processing all symbols:  64%|██████▍   | 70/109 [1:45:59<1:38:21, 151.32s/it]

MATICUSDT: Processed files saved at processed\MATICUSDT
NEARUSDT: Processed files saved at processed\NEARUSDT


Processing all symbols:  66%|██████▌   | 72/109 [1:49:14<1:11:51, 116.53s/it]

NEIROUSDT: Processed files saved at processed\NEIROUSDT
NEOUSDT: Processed files saved at processed\NEOUSDT


Processing all symbols:  68%|██████▊   | 74/109 [1:52:53<1:01:50, 106.03s/it]

NOTUSDT: Processed files saved at processed\NOTUSDT


Processing all symbols:  69%|██████▉   | 75/109 [1:55:22<1:07:28, 119.06s/it]

ONGUSDT: Processed files saved at processed\ONGUSDT


Processing all symbols:  70%|██████▉   | 76/109 [1:57:08<1:03:15, 115.02s/it]

ONTUSDT: Processed files saved at processed\ONTUSDT


Processing all symbols:  71%|███████   | 77/109 [1:58:16<53:55, 101.12s/it]  

OPUSDT: Processed files saved at processed\OPUSDT


Processing all symbols:  72%|███████▏  | 78/109 [1:58:49<41:36, 80.55s/it] 

ORDIUSDT: Processed files saved at processed\ORDIUSDT


Processing all symbols:  72%|███████▏  | 79/109 [1:59:25<33:34, 67.13s/it]

PENDLEUSDT: Processed files saved at processed\PENDLEUSDT


Processing all symbols:  73%|███████▎  | 80/109 [2:00:42<33:57, 70.25s/it]

PEOPLEUSDT: Processed files saved at processed\PEOPLEUSDT


Processing all symbols:  74%|███████▍  | 81/109 [2:01:44<31:35, 67.69s/it]

POLYXUSDT: Processed files saved at processed\POLYXUSDT


Processing all symbols:  75%|███████▌  | 82/109 [2:03:07<32:27, 72.14s/it]

POWRUSDT: Processed files saved at processed\POWRUSDT


Processing all symbols:  76%|███████▌  | 83/109 [2:03:30<24:53, 57.46s/it]

PYTHUSDT: Processed files saved at processed\PYTHUSDT
RAREUSDT: Processed files saved at processed\RAREUSDT


Processing all symbols:  78%|███████▊  | 85/109 [2:06:28<29:43, 74.30s/it]

REEFUSDT: Processed files saved at processed\REEFUSDT


Processing all symbols:  79%|███████▉  | 86/109 [2:06:47<22:10, 57.85s/it]

SAGAUSDT: Processed files saved at processed\SAGAUSDT


Processing all symbols:  80%|███████▉  | 87/109 [2:07:27<19:09, 52.25s/it]

SEIUSDT: Processed files saved at processed\SEIUSDT
SOLUSDT: Processed files saved at processed\SOLUSDT


Processing all symbols:  81%|████████  | 88/109 [2:09:12<23:49, 68.07s/it]

STMXUSDT: Processed files saved at processed\STMXUSDT


Processing all symbols:  83%|████████▎ | 90/109 [2:12:44<27:24, 86.53s/it]

STXUSDT: Processed files saved at processed\STXUSDT


Processing all symbols:  83%|████████▎ | 91/109 [2:13:26<21:56, 73.14s/it]

SUIUSDT: Processed files saved at processed\SUIUSDT


Processing all symbols:  84%|████████▍ | 92/109 [2:15:21<24:20, 85.94s/it]

SUNUSDT: Processed files saved at processed\SUNUSDT


Processing all symbols:  85%|████████▌ | 93/109 [2:15:44<17:50, 66.88s/it]

TAOUSDT: Processed files saved at processed\TAOUSDT


Processing all symbols:  86%|████████▌ | 94/109 [2:16:21<14:29, 57.95s/it]

TIAUSDT: Processed files saved at processed\TIAUSDT


Processing all symbols:  87%|████████▋ | 95/109 [2:16:34<10:20, 44.34s/it]

TONUSDT: Processed files saved at processed\TONUSDT
TRBUSDT: Processed files saved at processed\TRBUSDT


Processing all symbols:  88%|████████▊ | 96/109 [2:18:22<13:44, 63.45s/it]

TRXUSDT: Processed files saved at processed\TRXUSDT


Processing all symbols:  90%|████████▉ | 98/109 [2:19:52<09:17, 50.68s/it]

TURBOUSDT: Processed files saved at processed\TURBOUSDT
UMAUSDT: Processed files saved at processed\UMAUSDT


Processing all symbols:  91%|█████████ | 99/109 [2:21:41<11:22, 68.25s/it]

UNIUSDT: Processed files saved at processed\UNIUSDT


Processing all symbols:  93%|█████████▎| 101/109 [2:24:49<10:46, 80.79s/it]

VIDTUSDT: Processed files saved at processed\VIDTUSDT


Processing all symbols:  94%|█████████▎| 102/109 [2:25:08<07:17, 62.45s/it]

WIFUSDT: Processed files saved at processed\WIFUSDT


Processing all symbols:  94%|█████████▍| 103/109 [2:25:45<05:27, 54.62s/it]

WLDUSDT: Processed files saved at processed\WLDUSDT


Processing all symbols:  95%|█████████▌| 104/109 [2:26:04<03:39, 43.94s/it]

WUSDT: Processed files saved at processed\WUSDT


Processing all symbols:  96%|█████████▋| 105/109 [2:26:29<02:33, 38.49s/it]

XAIUSDT: Processed files saved at processed\XAIUSDT


Processing all symbols:  97%|█████████▋| 106/109 [2:28:09<02:50, 56.68s/it]

XRPUSDT: Processed files saved at processed\XRPUSDT


Processing all symbols:  98%|█████████▊| 107/109 [2:29:25<02:05, 62.57s/it]

YGGUSDT: Processed files saved at processed\YGGUSDT


Processing all symbols:  99%|█████████▉| 108/109 [2:29:39<00:47, 47.99s/it]

ZKUSDT: Processed files saved at processed\ZKUSDT


Processing all symbols: 100%|██████████| 109/109 [2:29:54<00:00, 82.52s/it]

ZROUSDT: Processed files saved at processed\ZROUSDT



