In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np
import math

In [2]:
class my_dataset(Dataset):

    def __init__(self,data,type):
        '''
        :param data: 数据
        :param type:训练集、测试集、验证集
        '''
        self.type = type
        self.data = data

    def __getitem__(self,idx):
        # 下标来调用数据
        return self.data[0][idx],self.data[1][idx]

    def __len__(self):
        return len(self.data[0])


def concat_Data(Data,scaler_model):
    # 将data合并
    lenght = [0,len(Data[0])]
    Data_total = Data[0]
    for i in Data[1:]:
        Data_total = pd.concat([Data_total, i.copy()])
        lenght.append(len(i)+lenght[-1])
    # 合并之后进行标准化
    Data_total = scaler_model.fit_transform(np.array(Data_total))
    # 标准化之后在进行拆分
    df_all = []
    for i in range(len(lenght)-1):
        print(i)
        print(lenght[i],lenght[i+1])
        df_all.append(Data_total[lenght[i]:lenght[i+1]-1])
    return df_all,scaler_model


def data_process(root_path, input_size, output_size, timestep,scaler_model):
    '''
    :param root_path: 根目录
    :param input_size:输入的维度，默认为96
    :param output_size:每个样本的预测维度，默认为96，后面会改成336
    :param timestep: 时间步，滑动窗口
    '''
    # 获取对应类型的数据
    data_x = []
    data_y = []
    files = os.listdir(root_path)
    files_csv = sorted([f for f in files if f.endswith('.csv')])
    df_total = []
    for file in files_csv:
        df = pd.read_csv(os.path.join(root_path, file))  # 得到每个文件数据
        df = df.drop('date', axis=1)
        df_total.append(df)
    # 得到了所有数据，开始归一化
    df_all,scaler_model = concat_Data(df_total,scaler_model)
    for df in df_all:
        # 得到样本以及对应的数据集
        for index in range(0, len(df) - input_size - output_size, timestep):
            data_x.append(df[index:index + input_size])
            data_y.append(df[index + input_size:index + input_size + output_size])
    print(len(data_x))
    print(len(data_y))
    # 得到样本之后划分数据集
    # 每次的调用的随机种子不同，测试集永远不变，变得是训练集和验证集的数据
    train_x_set, test_x_set = train_test_split(data_x, test_size=0.2, random_state=42)
    train_y_set, test_y_set = train_test_split(data_y, test_size=0.2, random_state=42)
    # 然后在对训练集进行划分
    seed = np.random.randint(1,50,1)[0]
    train_x_set, valid_x_set = train_test_split(train_x_set, test_size=0.2, random_state=42)
    train_y_set, valid_y_set = train_test_split(train_y_set, test_size=0.2, random_state=42)
    train_x_set = torch.tensor(np.array(train_x_set)).to(torch.float32)
    train_y_set = torch.tensor(np.array(train_y_set)).to(torch.float32)
    test_x_set = torch.tensor(np.array(test_x_set)).to(torch.float32)
    test_y_set = torch.tensor(np.array(test_y_set)).to(torch.float32)
    valid_x_set = torch.tensor(np.array(valid_x_set)).to(torch.float32)
    valid_y_set = torch.tensor(np.array(valid_y_set)).to(torch.float32)
    # 得到不同的数据集
    dataset_train = my_dataset([train_x_set,train_y_set],'train')
    dataset_test = my_dataset([test_x_set, test_y_set],'test')
    dataset_valid = my_dataset([valid_x_set, valid_y_set],'valid')
    return dataset_train,dataset_test,dataset_valid

In [4]:
scaler_model = MinMaxScaler()

In [5]:
dataset_train,dataset_test,dataset_valid = data_process('ETT-small',96,96,1,scaler_model)

0
0 17420
1
17420 34840
2
34840 104520
3
104520 174200
173428
173428


In [6]:
# 获得gpu
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f'当前设备为{device}')
    return device

In [7]:
get_device()

当前设备为cuda


device(type='cuda')