In [1]:
import copy
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge

In [2]:
import os
import pandas as pd
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import logging

In [3]:
# Root-logger setup: emit INFO and above, each record prefixed with its
# timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [4]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of each LSTM layer's hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    output_size : int
        Number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # Kept as attributes so callers can still introspect the architecture.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of sequences to one prediction per sequence.

        Parameters
        ----------
        x : torch.Tensor
            Batch-first input of shape ``(batch, seq_len, input_size)``.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(batch, output_size)``.
        """
        # nn.LSTM initialises (h_0, c_0) to zeros when no state is passed, and
        # does so with the dtype/device of its own weights. Building the zeros
        # by hand with torch.zeros() pinned them to the default dtype, which
        # fails as soon as the module is cast (e.g. .double() / .half()).
        out, _ = self.lstm(x)
        # Only the hidden state of the final time step feeds the linear head.
        return self.fc(out[:, -1, :])

In [5]:
class TimeSeriesLSTM:
    """Estimator-style wrapper (``fit`` / ``predict`` / ``score``) around ``LSTMModel``.

    Each input row is reshaped to a sequence of length 1 before it reaches the
    network (see ``_prepare_data``), so the model sees one row at a time and no
    history across rows.

    Parameters
    ----------
    input_size : int
        Number of feature columns per row.
    hidden_size : int, default 64
        Hidden width passed to ``LSTMModel``.
    num_layers : int, default 2
        Number of stacked LSTM layers passed to ``LSTMModel``.
    output_size : int, default 1
        Number of target values per row.
    epochs : int, default 10
        Number of full passes over the training data in ``fit``.
    batch_size : int, default 32
        Mini-batch size used for both training and prediction.
    lr : float, default 0.001
        Learning rate for the Adam optimiser.
    device : str or torch.device, default 'cpu'
        Device on which the model and all tensors are placed.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1,
                 epochs=10, batch_size=32, lr=0.001, device='cpu'):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.device = device
        self.model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

    def _prepare_data(self, X, y=None):
        """Convert features (and optionally targets) to a ``TensorDataset``.

        ``X`` and ``y`` may be pandas objects or anything ``np.asarray``
        accepts. Features become a float32 tensor of shape
        ``(n_rows, 1, input_size)``; targets become ``(n_rows, output_size)``.
        Both are moved to ``self.device`` up front.
        """
        features = np.asarray(X).reshape(-1, 1, self.input_size)
        X_tensor = torch.tensor(features, dtype=torch.float32).to(self.device)
        if y is None:
            return TensorDataset(X_tensor)
        targets = np.asarray(y).reshape(-1, self.output_size)
        y_tensor = torch.tensor(targets, dtype=torch.float32).to(self.device)
        return TensorDataset(X_tensor, y_tensor)

    def fit(self, X_train, y_train):
        """Train the network for ``self.epochs`` epochs with MSE loss.

        Mini-batches are drawn in shuffled order. Returns ``self`` so calls
        can be chained.
        """
        train_dataset = self._prepare_data(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for _ in range(self.epochs):
            for batch_X, batch_y in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = self.criterion(outputs, batch_y)
                loss.backward()
                self.optimizer.step()
        return self

    def predict(self, X_test):
        """Return predictions as a NumPy array of shape ``(n_rows, output_size)``.

        Rows are processed in their original order. An empty input yields an
        empty ``(0, output_size)`` array rather than an error.
        """
        if len(X_test) == 0:
            # np.vstack([]) raises, so answer the empty case directly.
            return np.empty((0, self.output_size), dtype=np.float32)

        test_dataset = self._prepare_data(X_test)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)

        self.model.eval()
        predictions = []
        with torch.no_grad():
            # A single-tensor TensorDataset yields 1-element batches.
            for (batch_X,) in test_loader:
                predictions.append(self.model(batch_X).cpu().numpy())
        return np.vstack(predictions)

    def score(self, X, y):
        """Return the negative mean squared error (higher is better)."""
        y_pred = self.predict(X)
        y_true = np.asarray(y).reshape(-1, self.output_size)
        return -np.mean((y_true - y_pred) ** 2)

In [6]:
def train(X, y, model, n_split=5, normalize=False, gap=0):
    """Walk-forward cross-validate ``model`` and return its aggregated score.

    The data is split with ``TimeSeriesSplit`` (expanding training window).
    For every fold a fresh deep copy of ``model`` is fitted on the training
    part and scored on both parts via ``model.score``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, ordered in time.
    y : pandas.Series
        Target aligned with ``X``.
    model : object
        Unfitted estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is
        never fitted itself; each fold works on a deep copy.
    n_split : int, default 5
        Number of folds.
    normalize : bool, default False
        If True, min-max scale the features per fold. The scaler is fitted on
        the training part only and then applied to the test part.
    gap : int, default 0
        Number of samples dropped from the end of each training part before
        its test part. Useful when the target looks ahead in time, so that
        training labels do not overlap the test window.

    Returns
    -------
    float
        The negated sum of the out-of-sample scores over all folds, i.e.
        ``-sum(model.score(X_test, y_test))``.
    """
    scores = []
    tscv = TimeSeriesSplit(n_splits=n_split, gap=gap)
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        logging.info(f"--- Fold {fold+1} ---")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if normalize:
            # Fit on the training part only so test statistics do not leak in.
            scaler = MinMaxScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

        logging.info(f"训练集大小: {len(X_train)}, 测试集大小: {len(X_test)}")
        logging.info(f"训练集索引范围: {X.index[train_index.min()]}-{X.index[train_index.max()]}")
        logging.info(f"测试集索引范围: {X.index[test_index.min()]}-{X.index[test_index.max()]}")

        # Every fold starts from the same untrained state.
        current_model = copy.deepcopy(model)
        current_model.fit(X_train, y_train)

        # score() is the only consumer of predictions here; the previous
        # stand-alone predict() call produced an unused result and doubled the
        # test-set inference cost per fold.
        is_score = current_model.score(X_train, y_train)
        os_score = current_model.score(X_test, y_test)
        scores.append(os_score)

        logging.info(f"训练集评分: {is_score:.4f}, 测试集评分: {os_score:.4f}")
    return -sum(scores)

In [7]:
if __name__=='__main__':
    # One seed for every stochastic component. LightGBM was already seeded;
    # PyTorch was not, so LSTM weight initialisation and mini-batch shuffling
    # differed from run to run.
    seed = 42
    torch.manual_seed(seed)

    factor_data_path = '/public/data/factor_data'
    file_name = 'BTCUSDT_15m_2020_2025_factor_data.pkl'

    # NOTE: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    data = pd.read_pickle(os.path.join(factor_data_path,file_name))

    # Target: relative change of `close` 10 rows ahead.
    # NOTE(review): because the target looks 10 rows forward, the last 10
    # training rows of each fold carry labels that overlap the following test
    # window; consider leaving a gap between train and test.
    data['target'] = data['close'].shift(-10)/data['close'] - 1

    begin = '2021-10-01'
    split = '2025-03-01'
    selected_factors = [f'c_chu0{i}' for i in range(37,52)]
    # Single .loc selection (rows by label range, columns by name), then drop
    # rows with a missing factor or target.
    workding_data = data.loc[begin:split, selected_factors+['target']].dropna()

    X_data = workding_data[selected_factors]
    y_data = workding_data['target']

    ridge_model = Ridge(0.0008)
    lgb_model = lgb.LGBMRegressor(random_state=seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device: {device}")

    lstm_model = TimeSeriesLSTM(
        input_size=len(selected_factors),
        hidden_size=64,
        num_layers=2,
        epochs=5,
        batch_size=32,
        lr=0.001,
        device=device
    )

    # NOTE(review): train() aggregates whatever each model's score() returns.
    # TimeSeriesLSTM.score is a negative MSE, while scikit-learn-style
    # regressors report R^2 by default, so the totals below are presumably not
    # comparable across model families.
    logging.info("\n--- Training Ridge Model ---")
    ridge_score_sum = train(X_data, y_data, ridge_model, normalize=True)
    logging.info(f"Ridge Model Total  Score: {ridge_score_sum:.4f}")

    logging.info("\n--- Training LightGBM Model ---")
    lgb_score_sum = train(X_data, y_data, lgb_model, normalize=False)
    logging.info(f"LightGBM Model Total  Score: {lgb_score_sum:.4f}")

    logging.info("\n--- Training LSTM Model ---")
    lstm_score_sum = train(X_data, y_data, lstm_model, normalize=True)
    logging.info(f"LSTM Model Total  Score: {lstm_score_sum:.4f}")

  data['target'] = data['close'].shift(-10)/data['close'] - 1
2025-07-18 10:57:55,096 - INFO - Using device: cpu
2025-07-18 10:57:55,880 - INFO - 
--- Training Ridge Model ---
2025-07-18 10:57:55,881 - INFO - --- Fold 1 ---
2025-07-18 10:57:55,894 - INFO - 训练集大小: 19967, 测试集大小: 19964
2025-07-18 10:57:55,894 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2022-04-26 23:30:00
2025-07-18 10:57:55,895 - INFO - 测试集索引范围: 2022-04-26 23:45:00-2022-11-21 03:45:00
2025-07-18 10:57:55,956 - INFO - 训练集评分: 0.0029, 测试集评分: -0.0956
2025-07-18 10:57:55,957 - INFO - --- Fold 2 ---
2025-07-18 10:57:55,967 - INFO - 训练集大小: 39931, 测试集大小: 19964
2025-07-18 10:57:55,968 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2022-11-21 03:45:00
2025-07-18 10:57:55,968 - INFO - 测试集索引范围: 2022-11-21 04:00:00-2023-06-17 02:45:00
2025-07-18 10:57:55,980 - INFO - 训练集评分: 0.0033, 测试集评分: -0.0098
2025-07-18 10:57:55,985 - INFO - --- Fold 3 ---
2025-07-18 10:57:55,994 - INFO - 训练集大小: 59895, 测试集大小: 19964
2025-07-18 10:57:55,996 - INFO - 训练集索引范围: 2021

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 19967, number of used features: 15
[LightGBM] [Info] Start training from score -0.000010


2025-07-18 10:59:31,968 - INFO - 训练集评分: 0.1918, 测试集评分: -0.0768
2025-07-18 10:59:31,973 - INFO - --- Fold 2 ---
2025-07-18 10:59:31,976 - INFO - 训练集大小: 39931, 测试集大小: 19964
2025-07-18 10:59:31,977 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2022-11-21 03:45:00
2025-07-18 10:59:31,978 - INFO - 测试集索引范围: 2022-11-21 04:00:00-2023-06-17 02:45:00


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 39931, number of used features: 15
[LightGBM] [Info] Start training from score -0.000186


2025-07-18 11:01:34,648 - INFO - 训练集评分: 0.1275, 测试集评分: -0.0318
2025-07-18 11:01:34,651 - INFO - --- Fold 3 ---
2025-07-18 11:01:34,656 - INFO - 训练集大小: 59895, 测试集大小: 19964
2025-07-18 11:01:34,656 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2023-06-17 02:45:00
2025-07-18 11:01:34,657 - INFO - 测试集索引范围: 2023-06-17 03:00:00-2024-01-11 01:45:00


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 59895, number of used features: 15
[LightGBM] [Info] Start training from score -0.000034


2025-07-18 11:03:38,634 - INFO - 训练集评分: 0.1019, 测试集评分: -0.0149
2025-07-18 11:03:38,639 - INFO - --- Fold 4 ---
2025-07-18 11:03:38,650 - INFO - 训练集大小: 79859, 测试集大小: 19964
2025-07-18 11:03:38,654 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2024-01-11 01:45:00
2025-07-18 11:03:38,655 - INFO - 测试集索引范围: 2024-01-11 02:00:00-2024-08-06 00:45:00


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 79859, number of used features: 15
[LightGBM] [Info] Start training from score 0.000051


2025-07-18 11:06:11,340 - INFO - 训练集评分: 0.0882, 测试集评分: -0.0090
2025-07-18 11:06:11,341 - INFO - --- Fold 5 ---
2025-07-18 11:06:11,348 - INFO - 训练集大小: 99823, 测试集大小: 19964
2025-07-18 11:06:11,349 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2024-08-06 00:45:00
2025-07-18 11:06:11,349 - INFO - 测试集索引范围: 2024-08-06 01:00:00-2025-03-01 23:45:00


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 99823, number of used features: 15
[LightGBM] [Info] Start training from score 0.000067


2025-07-18 11:08:24,947 - INFO - 训练集评分: 0.0770, 测试集评分: -0.0126
2025-07-18 11:08:24,948 - INFO - LightGBM Model Total  Score: 0.1451
2025-07-18 11:08:24,949 - INFO - 
--- Training LSTM Model ---
2025-07-18 11:08:24,949 - INFO - --- Fold 1 ---
2025-07-18 11:08:24,956 - INFO - 训练集大小: 19967, 测试集大小: 19964
2025-07-18 11:08:24,957 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2022-04-26 23:30:00
2025-07-18 11:08:24,958 - INFO - 测试集索引范围: 2022-04-26 23:45:00-2022-11-21 03:45:00
2025-07-18 11:41:32,952 - INFO - 训练集评分: -0.0001, 测试集评分: -0.0001
2025-07-18 11:41:32,953 - INFO - --- Fold 2 ---
2025-07-18 11:41:32,964 - INFO - 训练集大小: 39931, 测试集大小: 19964
2025-07-18 11:41:32,965 - INFO - 训练集索引范围: 2021-10-01 00:00:00-2022-11-21 03:45:00
2025-07-18 11:41:32,966 - INFO - 测试集索引范围: 2022-11-21 04:00:00-2023-06-17 02:45:00


KeyboardInterrupt: 