In [13]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class PowerDataset(Dataset):
    def __init__(self, input_size, output_size):
        # if args.mode == 'train':
        df = pd.read_csv('/opt/data/private/hyl/code/ml-work/data/train_new.csv')
        # elif args.mode == 'test':
        #     df = pd.read_csv(args.data_path + '/test_new.csv')

        # 数据清理
        df.replace('?', np.nan, inplace=True)
        df.dropna(inplace=True)
        df = df.drop(columns=['DateTime'])
        
        # 使用MinMaxScaler进行标准化（范围0-1）
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(np.array(df))

        # for i, col in enumerate(df.columns):
        #     print(f"  {col}: [{self.data[:, i].min():.3f}, {self.data[:, i].max():.3f}]")

        self.input_size = input_size
        self.output_size = output_size

        self.data_x = []
        self.data_yin = []
        self.data_yout = []
        self.split_data()

    def split_data(self):
        dataX = []
        dataY = [] 
        dataY_in = []

        # 将输入窗口的数据保存到X中，将输出窗口保存到Y中
        window_size = self.input_size + self.output_size
        for index in range(len(self.data) - window_size):
            dataX.append(self.data[index: index + self.input_size][:])
            dataY.append(self.data[index + self.input_size: index + window_size][:])
        print(f"生成了 {len(dataX)} 个训练样本")
        # start of sentence
        SOS = np.zeros((1, 13))
        
        for i in range(len(dataY)):
            dataY_in.append(np.concatenate((SOS, dataY[i][:-1,:]), axis=0)) # SOS+dataY[i](90,13)
            
        dataY_out = dataY
        self.data_x = np.array(dataX)
        self.data_yin = np.array(dataY_in)
        self.data_yout = np.array(dataY_out)
    
    def __len__(self):
        # 返回数据的总数
        return len(self.data_x)
    
    def __getitem__(self, idx):
        data = torch.tensor(self.data_x[idx], dtype=torch.float32)
        decoder_input = torch.tensor(self.data_yin[idx], dtype=torch.float32)
        label = torch.tensor(self.data_yout[idx], dtype=torch.float32)
        return data, decoder_input, label
    
def dataloader():
    raw_dataset = PowerDataset(input_size=90, output_size=90)
    dataloader = DataLoader(raw_dataset, batch_size=8, drop_last=True)

    return dataloader

In [14]:
train_dataloader = dataloader()
for inputs, tgt_input, label in train_dataloader:
    break

生成了 565 个训练样本


In [3]:
def create_masks(src, tgt):
    # src: [bs, n]
    # tgt: [bs, m-1]
    def get_pad_mask(src):
        return (src == 0).unsqueeze(-2).to('cuda') # [bs, 1, n] 在dim=1上广播
    src_mask = get_pad_mask(src) # [bs, n, n]

    def get_subsequent_mask(tgt): # 上三角矩阵
        bs, len_q = tgt.size()
        subsequent_mask = (torch.triu(torch.ones((1, len_q, len_q)), diagonal=1)).bool()
        # triu(, diagonal=1) 保留主对角线上面一行，及其往上的全部
        return subsequent_mask.to('cuda')
    tgt_mask = get_pad_mask(tgt) | get_subsequent_mask(tgt) # decoder自己本来对句末padding的mask和遮蔽当前时刻后的mask叠加

    return src_mask, tgt_mask

In [6]:
input_mask, trg_mask = create_masks(torch.ones(8,90), torch.ones(8,90))

In [26]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
from typing import Optional

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, n_head, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = n_head
        self.head_dim = self.hidden_size // self.num_attention_heads

        self.q_linear = nn.Linear(self.hidden_size, self.hidden_size)
        self.k_linear = nn.Linear(self.hidden_size, self.hidden_size)
        self.v_linear = nn.Linear(self.hidden_size, self.hidden_size)
        self.droptout = nn.Dropout(dropout)
        self.out = nn.Linear(self.hidden_size, self.hidden_size)
    
    def attention(self, q, k, v, head_dim, mask=None, dropout=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim) # (bs, h, -1, head_dim) * (bs, h, head_dim, -1) = （bs, h, n, n）

        if mask is not None: # mask: (bs, n, n)
            mask = mask.unsqueeze(1) # 在头维度进行一样的mask操作
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)

        if dropout is not None:
            scores = dropout(scores) # what is the type of dropout?

        output = torch.matmul(scores, v)
        return output

    def forward(self, hidden_states, mask=None):
        # hidden_states.shape = [batch_size, seq_len, hidden_dim]
        input_shape = hidden_states.shape[:-1] # [8, 90]
        hidden_shape = (*input_shape, -1, self.head_dim) # (8, 90, -1, 64)
        batch_size = hidden_states.size(0)

        q = self.q_linear(hidden_states).view(hidden_shape).transpose(1, 2) # [8, 8, 90, 64]
        k = self.k_linear(hidden_states).view(hidden_shape).transpose(1, 2)
        v = self.v_linear(hidden_states).view(hidden_shape).transpose(1, 2)

        attention_scores = self.attention(q, k, v, self.head_dim, mask, self.droptout)
        concat = attention_scores.transpose(1, 2).contiguous().view(batch_size, -1, self.hidden_size)
        return self.out(concat)

class RMSNorm(nn.Module):
    def __init__(self,hidden_size,eps):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        
    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
    
    # def forward(self, x):
    #     norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
    #     return norm

class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, d_ff, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(hidden_size, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, hidden_size)

    def forward(self, x):
        x = self.dropout(self.linear_1(x))
        x = self.linear_2(x)
        return x

class PositionEmbedding(nn.Module):
    '''
    parameter: 
    input: sentence after embedding []
    '''
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Matrix PE is a constant
        pe = torch.zeros(output_size, self.hidden_size)
        for pos in range(output_size):
            for i in range(0, self.hidden_size, 2): # i = 0, 2,...,510
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / self.hidden_size)))
                pe[pos, i+1] = math.cos(pos / (10000 ** ((2 * (i+1)) / self.hidden_size)))
        
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # 使得单词嵌入表示相对大一些
        x = x * math.sqrt(self.hidden_size)

        seq_len = x.size(1) # 90
        x = x + Variable(self.pe[:,:seq_len], requires_grad=False).cuda()
        return x

class Embedder(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.features = 13
        self.embedding = nn.Linear(self.features, hidden_size)
    
    def forward(self, input_seq):

        return self.embedding(input_seq) # to[8, 90, 512]
    
class DecoderLayer(nn.Module):
    def __init__(self, hidden_size,eps, dropout, n_head, d_ff):
        super().__init__()
        self.input_layernorm = RMSNorm(hidden_size,eps)
        self.cross_layernorm = RMSNorm(hidden_size,eps)
        self.final_layernorm = RMSNorm(hidden_size,eps)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.self_attn = MultiHeadAttention(hidden_size, n_head, dropout)
        self.cross_attn = MultiHeadAttention(hidden_size, n_head, dropout)
        self.ffn = FeedForwardNetwork(hidden_size, d_ff, dropout)

    def forward(self, hidden_states, encoder_output, src_mask, tgt_mask): 
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.dropout_1(self.self_attn(hidden_states, tgt_mask))
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.cross_layernorm(hidden_states)
        hidden_states = self.dropout_2(self.cross_attn(encoder_output, src_mask))
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.dropout_3(self.ffn(hidden_states))
        hidden_states = residual + hidden_states
        return hidden_states
    
class Decoder(nn.Module):
    def __init__(self, n_decoder, hidden_size, eps, dropout, n_head, d_ff, output_size):
        super().__init__()

        self.n_decoder = n_decoder

        # Embedding & Positional Encoding
        self.embed_tokens = Embedder(hidden_size)
        self.embed_positions = PositionEmbedding(hidden_size, output_size)

        # Decoder Layers
        self.layers = nn.ModuleList([
            DecoderLayer(hidden_size,eps, dropout, n_head, d_ff) for _ in range(self.n_decoder)
        ])

        self.norm = RMSNorm(hidden_size,eps)

    def forward(
        self,
        encoder_output, tgt, inputs_mask, tgt_mask
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(tgt)

        x = self.embed_positions(inputs_embeds)

        for layer in self.layers:
            x = layer(x, encoder_output, inputs_mask, tgt_mask)

        x = self.norm(x)
        return x
    
class EncoderLayer(nn.Module):
    def __init__(self, hidden_size,eps, dropout, n_head, d_ff):
        super().__init__()
        self.norm_1 = RMSNorm(hidden_size,eps)
        self.norm_2 = RMSNorm(hidden_size,eps)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.FFN = FeedForwardNetwork(hidden_size, d_ff, dropout)
        self.MHA = MultiHeadAttention(hidden_size, n_head, dropout)

    def forward(self, x, src_mask):
        x1 = self.norm_1(x) 
        x = x + self.dropout_1(self.MHA(x, src_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.FFN(x2))
        return x


class Encoder(nn.Module):
    def __init__(self, hidden_size, n_encoder, eps, dropout, n_head, d_ff, output_size):
        super().__init__()
        self.n_encoder = n_encoder
        self.embed = Embedder(hidden_size)
        self.pe = PositionEmbedding(hidden_size, output_size)
        self.layers = nn.ModuleList([EncoderLayer(hidden_size,eps, dropout, n_head, d_ff) for i in range(n_encoder)])
        self.norm_last = RMSNorm(hidden_size=hidden_size, eps=eps)

    def forward(self, src, src_mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.n_encoder):
            x = self.layers[i](x, src_mask)
        return self.norm_last(x)

In [27]:
class Transformer(nn.Module):
    def __init__(self, hidden_size, n_encoder, eps, dropout, n_head, d_ff, output_size):
        super().__init__()
        self.Encoder = Encoder(hidden_size, n_encoder, eps, dropout, n_head, d_ff, output_size)
        self.Decoder = Decoder(n_decoder = n_encoder, hidden_size= hidden_size, eps=eps, dropout=dropout, n_head=n_head, d_ff=d_ff, output_size=output_size)
        self.out = nn.Linear(hidden_size, 13)

    def forward(self, src, tgt, src_mask, tgt_mask):
        encoder_outputs = self.Encoder(src, src_mask)
        decoder_outputs = self.Decoder(encoder_outputs, tgt, src_mask, tgt_mask)
        outputs = self.out(decoder_outputs)

        return outputs

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(hidden_size=512, n_encoder=6, eps=1e-6, dropout=0.1, n_head=8, d_ff=2048, output_size=90).to(device)
for inputs, tgt_input, label in train_dataloader:
    tgt_input = tgt_input.float().to(device) # [8, 90, 13]
    inputs = inputs.float().to(device) # [8, 90, 13]
    label = label.float().to(device) # [8, 90, 13]

    input_mask, trg_mask = create_masks(torch.ones(8,90), torch.ones(8,90)) # input_mask:[8, 1, 90]
    outputs = model(inputs, tgt_input, input_mask, trg_mask)
    break
