In [4]:
import yfinance as yf
from datetime import datetime
import time
import pandas as pd

def download_stock_data(tickers, start_date, end_date, pause_seconds=0.1):
    """
    Download price history for each ticker and collect it in a dictionary.

    Tickers are fetched one at a time with ``yf.download``. A ticker whose
    download raises, or comes back empty, is reported on stdout and left out
    of the result, so one bad symbol does not abort the whole batch.

    Args:
        tickers: Iterable of ticker symbols to fetch.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.
        pause_seconds: Seconds to wait after every download attempt.
            Values of 0 or less disable the pause. Defaults to 0.1.

    Returns:
        dict mapping each successfully downloaded ticker to the object that
        ``yf.download`` returned for it, in the order the tickers were given.
    """
    stock_data = {}
    for ticker in tickers:
        try:
            print(f"正在下载 {ticker} 数据...")
            data = yf.download(ticker, start=start_date, end=end_date)
            if data.empty:
                print(f"股票 {ticker} 数据为空，跳过")
                continue
            stock_data[ticker] = data
        except Exception as e:
            # Best effort: report the failure and move on to the next ticker.
            print(f"股票 {ticker} 下载时出现异常：{e}")
        finally:
            # Throttle after EVERY attempt, not only after a success: empty
            # results and errors are the cases where hammering the service
            # with the next request straight away is least desirable.
            if pause_seconds > 0:
                time.sleep(pause_seconds)
    return stock_data

# Ticker symbols to download (currently a single US stock).
tickers = ["AAPL"]

# Requested date range. The end date may lie in the future, in which case the
# download simply stops at the most recent day the data source has.
start_date, end_date = "2015-01-01", "2025-12-31"

# Fetch one price table per ticker.
stock_data_dict = download_stock_data(tickers, start_date, end_date)

# Sanity check: show the first and last five rows of every downloaded table.
for ticker in stock_data_dict:
    data = stock_data_dict[ticker]
    print(f"\nData for {ticker}:", "Head:", data.head(), "Tail:", data.tail(), sep="\n")

# Merge every downloaded table into one long table with a fixed column set.
price_columns = ["Date", "Open", "High", "Low", "Close", "Volume"]
all_data = []
for ticker, data in stock_data_dict.items():
    df = data.copy()
    # yfinance can return two-level columns (Price, Ticker) even for a single
    # ticker. If they are kept, to_csv writes a second header row that
    # read_csv later treats as a data row, so collapse them to the price
    # field names first.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    # Move the date index into a regular column and drop the leftover name of
    # the columns axis.
    df = df.reset_index().rename_axis(columns=None)
    # Column names are kept exactly as delivered (no lower-casing). The copy
    # makes the frame independent so adding a column below is unambiguous.
    df = df[price_columns].copy()
    # Tag every row with the ticker it belongs to.
    df["stock_id"] = ticker
    all_data.append(df)

if not all_data:
    print("未获取到任何股票数据，退出。")
else:
    # Stack all tickers and order the rows by ticker, then by date.
    all_stock_data = pd.concat(all_data, ignore_index=True)
    all_stock_data = all_stock_data.sort_values(["stock_id", "Date"]).reset_index(drop=True)

    # Pin the final column order of the CSV.
    all_stock_data = all_stock_data[price_columns + ["stock_id"]]

    csv_filename = "stock_data.csv"
    all_stock_data.to_csv(csv_filename, index=False)
    print(f"\n所有股票数据已保存到 {csv_filename}")

正在下载 AAPL 数据...


[*********************100%***********************]  1 of 1 completed


Data for AAPL:
Head:
Price           Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                             
2015-01-02  24.320431  24.789800  23.879980  24.778677  212818400
2015-01-05  23.635283  24.169162  23.448426  24.089080  257142000
2015-01-06  23.637508  23.897774  23.274914  23.699794  263188400
2015-01-07  23.968962  24.069063  23.735389  23.846614  160423600
2015-01-08  24.889902  24.947740  24.180287  24.298187  237458000
Tail:
Price            Close        High         Low        Open    Volume
Ticker            AAPL        AAPL        AAPL        AAPL      AAPL
Date                                                                
2025-03-11  220.839996  225.839996  217.449997  223.809998  76137400
2025-03-12  216.979996  221.750000  214.910004  220.139999  62547500
2025-03-13  209.679993  216.839996  208.419998  215.949997  61368300
2025-03-14  213.490005  213.94




In [1]:
import warnings
import pandas as pd
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import TimeSeriesDataSet, GroupNormalizer
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.models.deepar import DeepAR
from pytorch_forecasting.metrics import NormalDistributionLoss

# Silence pandas' SettingWithCopyWarning for the rest of the session.
warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)

# 1. Load the price table from the CSV file.
data = pd.read_csv("stock_data.csv")

# 2. Preprocessing
# No explicit format string: DataFrame.to_csv writes ISO dates (2015-01-02),
# which a hard-coded "%Y/%m/%d" rejects. Letting pandas infer the format
# accepts both the ISO form and the slash form.
data["Date"] = pd.to_datetime(data["Date"])
data = data.sort_values(["Date", "stock_id"]).reset_index(drop=True)
# time_idx numbers the distinct dates 1, 2, 3, ... so that every ticker shares
# the same index for the same date. A plain row counter is only correct with
# a single ticker; with several it interleaves them and leaves gaps inside
# each group.
date_codes = pd.factorize(data["Date"], sort=True)[0]
data["time_idx"] = date_codes + 1
# Copy of the ticker column, used as the static categorical feature.
data["static"] = data["stock_id"]

# 3. Window sizes: steps fed to the encoder and steps to forecast.
max_encoder_length = 60
max_prediction_length = 20

# Hold back the last encoder + prediction steps so the validation split
# contains at least one complete window.
training_cutoff = data["time_idx"].max() - (max_encoder_length + max_prediction_length)

print(f"数据总长度: {data['time_idx'].max()}, 训练截止点: {training_cutoff}")

# 4. Training dataset: every row up to and including the cutoff.
train_rows = data.loc[data["time_idx"] <= training_cutoff]
# The encoder is fitted on the full table, not only on the training rows.
stock_encoder = NaNLabelEncoder().fit(data["stock_id"])
# The target is normalised per ticker with a softplus transformation.
close_normalizer = GroupNormalizer(groups=["stock_id"], transformation="softplus")

training = TimeSeriesDataSet(
    data=train_rows,
    time_idx="time_idx",
    target="Close",
    group_ids=["stock_id"],
    categorical_encoders={"stock_id": stock_encoder},
    static_categoricals=["static"],
    # min == max for both lengths, so every sample is a fixed-size window.
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["Close"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=close_normalizer,
    add_relative_time_idx=True,
    add_target_scales=True,
    randomize_length=None,
)

# 5. Validation dataset: the rows after the cutoff. They must be numerous
# enough to form at least one complete encoder + prediction window.
validation_data = data.loc[data["time_idx"] > training_cutoff]
print(f"验证集数量: {validation_data.shape[0]}")

# Reuse the training dataset's configuration for the held-out rows.
validation = TimeSeriesDataSet.from_dataset(training, validation_data, stop_randomization=True)

# 6. Data loaders (same batch size and worker count for both splits).
batch_size = 64
loader_options = {"batch_size": batch_size, "num_workers": 0}
train_dataloader = training.to_dataloader(train=True, **loader_options)
val_dataloader = validation.to_dataloader(train=False, **loader_options)

# Callbacks, Trainer, model creation and training follow in the next cell.

数据总长度: 2566, 训练截止点: 2486
验证集数量: 80


In [2]:
# 6. Callbacks and Trainer
# Seed Python, NumPy and PyTorch so weight initialisation and batch shuffling
# are repeatable from one run to the next.
pl.seed_everything(42, workers=True)

# Stop early once val_loss has not improved by at least 1e-4 for 5 checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss", min_delta=1e-4, patience=5, verbose=False, mode="min"
)
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=10,
    # "auto" uses an available accelerator and falls back to the CPU, so the
    # cell also runs on machines without a GPU.
    accelerator="auto",
    devices="auto",
    gradient_clip_val=0.1,
    # Cap the number of batches per epoch to keep each epoch short.
    limit_train_batches=30,
    limit_val_batches=3,
    callbacks=[lr_logger, early_stop_callback],
)

# 7. Build the DeepAR model; its input configuration is taken from the
# training dataset.
model_options = dict(
    learning_rate=0.1,
    hidden_size=32,
    dropout=0.1,
    loss=NormalDistributionLoss(),
    log_interval=10,
    log_val_interval=3,
)
deepar = DeepAR.from_dataset(training, **model_options)

parameter_count_k = deepar.size() / 1e3
print(f"Number of parameters in network: {parameter_count_k:.1f}k")

# 8. Train
torch.set_num_threads(10)
trainer.fit(deepar, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# 9. Score the validation forecasts with the mean absolute error (MAE).
# Collect the target tensor of every validation batch into one tensor.
actuals = torch.cat([targets for _, (targets, _) in val_dataloader])
predictions = deepar.predict(val_dataloader)
# Move the targets to the device the predictions are on before subtracting.
device = predictions.device
actuals = actuals.to(device)
mae = (actuals - predictions).abs().mean()
print(f"Mean absolute error of model: {mae}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#to

Number of parameters in network: 13.6k


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type                   | Params | Mode 
--------------------------------------------------------------------------
0 | loss                   | NormalDistributionLoss | 0      | train
1 | logging_metrics        | ModuleList             | 0      | train
2 | embeddings             | MultiEmbedding         | 1      | train
3 | rnn                    | LSTM                   | 13.6 K | train
4 | distribution_projector | Linear                 | 66     | train
--------------------------------------------------------------------------
13.6 K    Trainable params
0         Non-trainable params
13.6 K    Total params
0.055     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (30) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
D:\anaconda3\envs\wind\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Mean absolute error of model: 63.53936767578125
