In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from libs import *
from libs.modules import SoftDTW, loss_function
from tqdm import tqdm
import numpy as np
import plotly.graph_objs as go
import plotly.offline as py
import sys
sys.path.append("../StockPriceForecast/")
dataSetPath = "../StockPriceForecast/dataSet/"

获取数据

In [2]:
stock_id = "600521.SH"
his_data = getDailyData(stock_id=stock_id, start_date="20100104", adj="hfq")
print(his_data)

        ts_code trade_date      open      high       low     close  pre_close  \
0     600521.SH   20220517  343.0026  346.3220  327.6966  337.4703   340.2364   
1     600521.SH   20220516  357.5710  365.1318  338.9456  340.2364   350.3790   
2     600521.SH   20220513  347.6128  361.2592  343.1870  350.3790   343.9246   
3     600521.SH   20220512  330.4627  347.9817  329.1719  343.9246   331.9380   
4     600521.SH   20220511  327.8810  352.2231  327.8810  331.9380   321.9799   
...         ...        ...       ...       ...       ...       ...        ...   
2974  600521.SH   20100108  106.0809  119.5083  106.0809  114.5724   108.6325   
2975  600521.SH   20100107  107.9214  110.8077  107.9214  108.6325   107.7959   
2976  600521.SH   20100106  107.4194  108.2142  105.4116  107.7959   107.4194   
2977  600521.SH   20100105  108.7580  109.1345  106.1227  107.4194   108.3815   
2978  600521.SH   20100104  107.5031  109.5109  107.5031  108.3815   108.7580   

       change  pct_chg     

删去不必要数据

In [3]:
his_data.drop(columns=["ts_code", "trade_date"], inplace=True)
print(his_data.head())

       open      high       low     close  pre_close   change  pct_chg  \
0  343.0026  346.3220  327.6966  337.4703   340.2364  -2.7661  -0.8130   
1  357.5710  365.1318  338.9456  340.2364   350.3790 -10.1426  -2.8948   
2  347.6128  361.2592  343.1870  350.3790   343.9246   6.4544   1.8767   
3  330.4627  347.9817  329.1719  343.9246   331.9380  11.9866   3.6111   
4  327.8810  352.2231  327.8810  331.9380   321.9799   9.9581   3.0928   

         vol      amount  
0  291699.90  527755.572  
1  392180.14  740480.804  
2  374202.63  710452.849  
3  445733.50  824602.814  
4  542118.56  994483.019  


输入数据归一化以及生成目标数据

In [4]:
input_data = (his_data-his_data.min())/(his_data.max()-his_data.min())
target_data = his_data["close"]
target_data = (target_data-target_data.min())/(target_data.max()-target_data.min())
print("input_data :\n", input_data.head())
print("target_data :\n", target_data.head())
print("target_data : ", target_data.describe())

# input_data : (2933, 9)
# target_data : (2933, 1)
input_data = torch.Tensor(input_data.values).float()
data_len, para_num = input_data.shape
target_data = torch.Tensor(target_data.values).unsqueeze(dim=-1).float()

input_data :
        open      high       low     close  pre_close    change   pct_chg  \
0  0.367364  0.357244  0.357923  0.355610   0.359312  0.435809  0.459495   
1  0.387127  0.381997  0.373565  0.359312   0.372889  0.381964  0.355686   
2  0.373618  0.376901  0.379462  0.372889   0.364249  0.503114  0.593616   
3  0.350354  0.359428  0.359974  0.364249   0.348204  0.543496  0.680102   
4  0.346851  0.365010  0.358179  0.348204   0.334874  0.528689  0.654257   

        vol    amount  
0  0.170365  0.120956  
1  0.229314  0.170014  
2  0.218767  0.163089  
3  0.260732  0.189414  
4  0.317278  0.228591  
target_data :
 0    0.355610
1    0.359312
2    0.372889
3    0.364249
4    0.348204
Name: close, dtype: float64
target_data :  count    2979.000000
mean        0.240021
std         0.194091
min         0.000000
25%         0.063420
50%         0.228953
75%         0.351298
max         1.000000
Name: close, dtype: float64


直接输出多步预测

In [5]:
class SPFNet(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int, num_layers: int, dropout: float):
        super(SPFNet, self).__init__()
        self.linear_layer0 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.lstm_layer = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
        self.linear_layer1 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, X: torch.Tensor):
        # 输入的X的形状为: (batch_size, num_steps, input_size)
        # num_steps = input_steps
        # X : (batch_size, num_steps, hidden_size)
        X = self.linear_layer0(X)

        # X : (num_steps, batch_size, hidden_size)
        X = X.permute(1, 0, 2)
        X, _ = self.lstm_layer(X)
        X = self.linear_layer1(X)

        # output_size = output_steps = predict_days
        # X : (num_steps, batch_size, output_size)
        return X

构建DataSet

In [6]:
class SPFDataSet(torch.utils.data.Dataset):
    """
    返回窗口数据
    """
    def __init__(self, input_data: torch.Tensor, target_data: torch.Tensor,
                 input_steps: int, output_steps: int):
        """
        :param train_data: (days, input_size)
        :param target_data: (days, output_size)
        :param input_steps: input time steps
        :param output_steps: output time steps
        """
        super(SPFDataSet, self).__init__()

        train_days, _ = input_data.shape
        target_days, _ = target_data.shape
        if train_days != target_days:
            raise Exception(f"train_data and input_data are not equal in length on dim=0, "
                            f"The length of input_data is {train_days}, but target_data is {target_days}")

        self.input_steps = input_steps
        self.output_steps = output_steps

        self.input_data = input_data
        self.target_data = target_data

        self._len = self._calculate_len(train_days, input_steps ,output_steps)

    def _calculate_len(self, days: int, input_steps: int, output_steps: int):
        return days - input_steps - output_steps

    def isel(self, start_index: int, end_index: int, inplace: bool=False):
        assert end_index >= start_index
        _new_len = self._calculate_len(days=end_index-start_index, input_steps=self.input_steps, output_steps=self.output_steps)
        assert _new_len > 0
        _input_data = self.input_data[start_index: end_index]
        _target_data = self.target_data[start_index: end_index]
        if inplace:
            self._len = _new_len
            self.input_data = _input_data
            self.target_data = _target_data
            return self
        else:
            return SPFDataSet(input_data=self.input_data, target_data=self.target_data,
                              input_steps=self.input_steps, output_steps=self.output_steps)

    def __getitem__(self, item):
        """生成滑动窗口数据"""
        # inputs : (input_steps, input_size)
        # targets : (output_steps, output_size)
        inputs = self.input_data[item: item + self.input_steps]
        targets = self.target_data[item + self.input_steps: item + self.input_steps + self.output_steps]

        # inputs : (batch_size, input_steps, input_size)
        # targets : (batch_size, output_steps, output_size)
        inputs = inputs.unsqueeze(dim=0)
        targets = targets.unsqueeze(dim=0)
        return inputs, targets

    def __len__(self):
        return self._len

梯度剪裁函数

In [7]:
def grad_clipping(net, theta): # from d2l
    """进行梯度剪裁，避免梯度爆炸"""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

训练函数

In [8]:
def train_SPFNet(model: SPFNet,
                 dataLoader: torch.utils.data.DataLoader,
                 lr: float,
                 num_epochs: int,
                 loss_function_name,
                 device: torch.device,
                 use_scheduler: bool,
                 init_weight: bool):
    best_model = None
    best_loss = 100.0
    def init_weights(m):
        if isinstance(m, torch.nn.Linear):
            nn.init.xavier_normal_(m, gain=nn.init.calculate_gain('sigmoid'))
        if isinstance(m, torch.nn.LSTM):
            nn.init.orthogonal(m.weight)
    # 初始化模型各层的权重
    model.to(device)
    if init_weight:
        init_weights(model)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = None
    if use_scheduler:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode="min", factor=0.5, patience=50, cooldown=50)
    loss_func = loss_function(loss_function_name)()
    loss_seq = []
    for epoch in range(num_epochs):
        tqdm_ = tqdm(dataLoader, desc=f"epoch {epoch} training")
        for X, Y in tqdm_: # Y自动添加了一个维度??
            optimizer.zero_grad()
            X = X.to(device)
            X = X.reshape(1, 360, -1)
            Y = Y.to(device)

            # Y_hat : (steps, batch_size, output_size)
            Y_hat = model(X)
            # Y_hat : (1, 1, output_size)
            Y_hat = Y_hat[-1].unsqueeze(dim=0)
            # Y : (1, 1, steps)
            Y = Y.squeeze(dim=-1)
            loss = loss_func(Y_hat, Y).sum()
            loss.backward()
            grad_clipping(model, 1)
            optimizer.step()
            if use_scheduler:
                scheduler.step(loss)
            loss_seq.append(loss.detach().cpu().item())
            tqdm_.set_postfix(loss=loss.detach().cpu().item(), lr=optimizer.param_groups[0]['lr'])

            if best_loss > loss.detach().cpu().item():
                best_loss = loss.detach().cpu().item()
                best_model = model

    return best_model, loss_seq

测试函数

In [9]:
def eval_SPFNet(model: SPFNet,
                dataLoader: torch.utils.data.DataLoader,
                device: torch.device,):
    torch.no_grad()
    model.eval()
    model.to(device)

    output_seq = []
    target_seq = []

    for X, Y in dataLoader:
        X = X.to(device)
        X = X.reshape(1, 360, -1)

        # output : (1, 1, output_size)
        output = model(X)
        # output : (1, output_size)
        output = output[-1]

        # Y : (1, output_steps)
        Y = Y.squeeze(dim=-1).squeeze(dim=0)
        output_seq.append(output.detach().cpu())
        target_seq.append(Y.detach().cpu())

    # output_seq : (dataLoader_len, output_steps)
    # target_seq : (dataLoader_len, output_steps)
    output_seq = torch.cat(output_seq, dim=0)
    target_seq = torch.cat(target_seq, dim=0)
    return output_seq, target_seq

绘制loss曲线

In [10]:
def drawLoss(loss: list):
    trace = dict(
        y = np.array(loss),
        mode = "lines",
        name = "loss"
    )
    fig = go.Figure(trace)
    fig.show()

绘制预测曲线和目标曲线

In [123]:
def drawPredictAndTarget(predicts, targets, show_index: int):
    trace0 = go.Scatter(
        # x = np.linspace(0, 1, 30),
        y = predicts[show_index].numpy(),
        mode = "lines",
        name = "predict"
    )
    trace1 = go.Scatter(
        # x = np.linspace(0, 1, 30),
        y = targets[show_index].numpy(),
        mode = "lines",
        name = "target"
    )
    data = [trace0, trace1]
    py.iplot(data)

绘制历史数据与预测数据

In [113]:
def drawTargetAndPredicts(history_data, predict_data):
    y = history_data.numpy().reshape(-1)
    data_seq = []
    trace = go.Scatter(
        x = np.linspace(0, len(y), len(y)),
        y = y,
        mode = "lines",
        name = "历史数据"
    )
    data_seq.append(trace)
    for index in range(len(predicts)):
        trace = go.Scatter(
            x = np.linspace(train_steps +predict_steps+ index, train_steps + 2*predict_steps + index, predict_steps),
            y = predict_data[index].numpy(),
            mode="lines",
            name = f"预测数据{index}"
        )
        data_seq.append(trace)
    py.iplot(data_seq)

用第step步的预测数据绘制曲线

In [None]:
def drawTargetAndPredict(target_data, predicts, step: int = 0):
    target_ = target_data.numpy().reshape(2979)[train_steps:train_steps+validate_steps]
    predicts_ = predicts[:, step].numpy()
    trace0 = go.Scatter(
        x = np.linspace(train_steps, train_steps + validate_steps, validate_steps),
        y = target_,
        mode = "lines",
        name = "target"
    )
    trace1 = go.Scatter(
        x = np.linspace(train_steps+predict_steps, train_steps+len(predicts_) +predict_steps, len(predicts_)),
        y = predicts_,
        mode = "lines",
        name = "predict"
    )
    data = [trace0, trace1]
    py.iplot(data)

设置参数

In [29]:
input_steps = 360
predict_steps = 30
hidden_size = 1024
num_layers = 4
# 按照 8:1:1 设置训练集、验证集和测试集
train_steps = int(data_len * 0.8)
validate_steps = int(data_len * 0.1)
test_steps = data_len - train_steps - validate_steps
learning_rate = 0.0001
num_epochs = 1
use_scheduler = True
init_weight = True
loss_function_name = "softDTW"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("train_steps    : ", train_steps)
print("validate_steps : ", validate_steps)
print("test_steps     : ", test_steps)
print("Device         : ", device)

train_steps    :  2383
validate_steps :  297
test_steps     :  299
Device         :  cuda:0


生成dataLoader

In [138]:
train_dataSet = SPFDataSet(input_data=input_data, target_data=target_data,
                        input_steps=input_steps, output_steps=predict_steps).isel(0, train_steps, inplace=True)
validate_dataSet = SPFDataSet(input_data=input_data, target_data=target_data,
                        input_steps=input_steps, output_steps=predict_steps).isel(train_steps-input_steps+predict_steps, train_steps + validate_steps, inplace=True)
test_dataSet = SPFDataSet(input_data=input_data, target_data=target_data,
                        input_steps=input_steps, output_steps=predict_steps).isel(train_steps + validate_steps-input_steps, data_len, inplace=True)
train_dataLoader = torch.utils.data.DataLoader(train_dataSet, shuffle=True)
validate_dataLoader = torch.utils.data.DataLoader(validate_dataSet)
test_dataLoader = torch.utils.data.DataLoader(test_dataSet)
model_store_path = f"Models/spf_sample_{loss_function_name}_{num_epochs}_{len(train_dataLoader)}_{input_steps}_{hidden_size}_{predict_steps}_{num_layers}.pth"
print("Length of train_dataLoader : ", len(train_dataLoader))
print("Length of validate_dataSet : ", len(validate_dataLoader))
print("Length of test_dataSet     : ", len(test_dataLoader))
print("model store path           : ", model_store_path)

Length of train_dataLoader :  1993
Length of validate_dataSet :  237
Length of test_dataSet     :  269
model store path           :  Models/spf_sample_softDTW_1_1993_360_1024_30_4.pth


实例化模型并训练模型

In [13]:
# 生成新模型 input_steps*para_num
net = SPFNet(input_size=para_num, output_size=predict_steps, hidden_size=hidden_size,num_layers=num_layers, dropout=0.1)

In [14]:
# 加载模型
# model_load_path = "Models/spf_sample_softDTW_1_1993_360_1024_30_4.pth"
# net = torch.load(model_load_path)

In [86]:
net, loss_seq = train_SPFNet(model=net,
                             dataLoader=train_dataLoader,
                             device=device,
                             init_weight=False,
                             loss_function_name=loss_function_name,
                             lr = learning_rate,
                             num_epochs=num_epochs,
                             use_scheduler=True)

epoch 0 training: 100%|██████████| 1993/1993 [06:20<00:00,  5.23it/s, loss=0.000559, lr=1.22e-8]


In [87]:
# 绘制loss曲线
drawLoss(loss_seq)

In [94]:
# 保存模型
torch.save(net, model_store_path)

In [88]:
# 使用测验集评估模型
predicts, targets = eval_SPFNet(model=net,
                                dataLoader=validate_dataLoader,
                                device=device)

In [125]:
# 绘制预测数据与目标数据
drawPredictAndTarget(predicts, targets, 2)

In [112]:
# 绘制历史数据与预测数据
drawTargetAndPredicts(target_data, predicts)

In [137]:
# 用预测数据的第step步绘制预测曲线
drawTargetAndPredict(target_data, predicts, step=0)
