In [3]:
# データ準備

In [None]:
import os
import pickle
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from matplotlib import pyplot as plt
from torch import Tensor

%matplotlib inline

def load_data(data_dir='/content/drive/My Drive/Colab Notebooks/twitterBot/datasets'):
    """Load the pickled vocabulary and sequence arrays and bundle them in a dict.

    Args:
        data_dir: Directory that contains the pickle files read below. Defaults
            to the Colab Drive folder this notebook was written against.

    Returns:
        dict with the keys
        ``encoder`` / ``decoder`` / ``label`` (arrays reshaped to
        ``(rows, maxlen_e)`` / ``(rows, maxlen_d)`` / ``(rows, maxlen_d)``),
        ``maxlen_e`` / ``maxlen_d``, ``indices2word`` / ``word2indices`` and
        ``input_dim`` / ``output_dim`` (both ``len(words)``).

    Raises:
        OSError: if ``data_dir`` or one of the pickle files does not exist.
        KeyError: if the vocabulary has no entry for the full-width space.

    Side effects:
        Changes the process working directory to ``data_dir`` and prints the
        id of the full-width space token.
    """
    # The original used the IPython `%cd` magic inside the function body, which
    # is not valid Python outside IPython. os.chdir keeps the same side effect
    # (later relative paths resolve inside data_dir).
    os.chdir(data_dir)

    def _unpickle(filename):
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at files you created yourself.
        with open(filename, 'rb') as handle:
            return pickle.load(handle)

    words = _unpickle('words.pickle')                 # vocabulary
    encoder = _unpickle('e.pickle')                   # encoder input
    decoder = _unpickle('d.pickle')                   # decoder input
    label = _unpickle('t.pickle')                     # label data
    maxlen_e, maxlen_d = _unpickle('maxlen.pickle')   # max sequence lengths
    indices2word = _unpickle('indices_word.pickle')   # id -> word
    word2indices = _unpickle('word_indices.pickle')   # word -> id

    # Sanity print kept from the original: id of the full-width space token.
    print(word2indices["　"])

    # Flatten any trailing axes so every array is (rows, sequence_length).
    row = encoder.shape[0]
    encoder = encoder.reshape((row, maxlen_e))
    decoder = decoder.reshape((row, maxlen_d))
    label = label.reshape((row, maxlen_d))

    return {
        'encoder': encoder,
        'decoder': decoder,
        'label': label,
        'maxlen_e': maxlen_e,
        'maxlen_d': maxlen_d,
        'indices2word': indices2word,
        'word2indices': word2indices,
        'input_dim': len(words),
        'output_dim': len(words),
    }

# Load everything once, then unpack the pieces that later cells refer to.
dataset = load_data()
encoder, decoder, label = (dataset[key] for key in ('encoder', 'decoder', 'label'))
maxlen_e, maxlen_d = dataset['maxlen_e'], dataset['maxlen_d']
indices_word, word_indices = dataset['indices2word'], dataset['word2indices']

data_row = encoder.shape[0]      # total number of rows in the loaded data
n_split = int(data_row * 0.9)    # row index where the held-out 10% begins

# Split each array row-wise into a training part and a test part.
encoder_train, encoder_test = np.vsplit(encoder, [n_split])   # encoder input
decoder_train, decoder_test = np.vsplit(decoder, [n_split])   # decoder input
label_train, label_test = np.vsplit(label, [n_split])         # label data
print(len(label_train))
#train_dataset

from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

# データをバッチ化するための関数
def train2batch(input_data, output_data, batch_size):
    """Shuffle paired data together and cut it into consecutive mini-batches.

    Args:
        input_data: Sliceable sequence of input rows.
        output_data: Sliceable sequence of output rows, paired with input_data.
        batch_size: Maximum number of rows per batch; the last batch may be
            shorter.

    Returns:
        (input_batch, output_batch): two lists of equal length whose i-th
        entries are the matching slices of the shuffled inputs and outputs.
    """
    # `shuffle` is sklearn.utils.shuffle; both collections are shuffled in one call.
    shuffled_in, shuffled_out = shuffle(input_data, output_data)
    starts = range(0, len(input_data), batch_size)
    input_batch = [shuffled_in[start:start + batch_size] for start in starts]
    output_batch = [shuffled_out[start:start + batch_size] for start in starts]
    return input_batch, output_batch

In [None]:
# モデル定義

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

class PositionalEncoder(nn.Module):
    """Scale embeddings and add fixed sinusoidal position information.

    Expects batch-first input of shape ``(batch, seq_len, d_model)`` with
    ``seq_len <= max_seq_len``.

    Args:
        d_model: Size of the embedding dimension.
        max_seq_len: Longest sequence the precomputed table supports.
        dropout: Dropout probability applied to the result.
    """

    def __init__(self, d_model=300, max_seq_len=256, dropout = 0.1):
        super().__init__()
        # Was hard-coded to 300, which silently ignored the argument.
        self.d_model = d_model

        self.dropout = nn.Dropout(p = dropout)

        # PE(pos, 2k)   = sin(pos / 10000^(2k / d_model))
        # PE(pos, 2k+1) = cos(pos / 10000^(2k / d_model))
        # `even_dims` already holds the values 2k, so the exponent is
        # even_dims / d_model (no extra factor of 2).
        position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)
        div_term = torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(position / div_term)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(position / div_term[: d_model // 2])

        # A buffer (not a parameter): it is never trained but follows the
        # module through .to(device) / .cuda().
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self,x):
        """Return ``dropout(sqrt(d_model) * x + positional_table[:seq_len])``.

        Raises:
            ValueError: if the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}")
        # Slice the table so sequences shorter than max_seq_len broadcast.
        ret = math.sqrt(self.d_model)*x + self.pe[:, :seq_len]
        return self.dropout(ret)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
class Embedder(nn.Module):
    """Look up token ids in a frozen table of pretrained embedding vectors.

    Args:
        text_embedding_vectors: 2-D float tensor of shape
            ``(vocab_size, embedding_dim)``; row ``i`` is the vector of token ``i``.
    """

    def __init__(self, text_embedding_vectors):
        super().__init__()
        # The class is `nn.Embedding` (singular); `nn.Embeddings` does not exist.
        # freeze=True keeps the pretrained vectors out of gradient updates.
        self.embeddings = nn.Embedding.from_pretrained(
            embeddings=text_embedding_vectors, freeze=True)

    def forward(self, x):
        """Map an integer tensor of token ids to its embedding vectors."""
        return self.embeddings(x)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer over pretrained, frozen word embeddings.

    All token tensors are batch-first: ``src`` is ``(batch, src_len)`` and
    ``tgt`` is ``(batch, tgt_len)``; the output is
    ``(batch, tgt_len, target_vocab_length)``.

    Args:
        d_model: Embedding / model width; must equal the width of
            ``text_embedding_vectors`` and be divisible by ``nhead``.
        nhead: Number of attention heads. NOTE: the defaults
            ``d_model=300, nhead=8`` are not divisible, so at least one of
            them has to be overridden.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden width of the feed-forward sublayers.
        dropout: Dropout probability.
        activation: Activation name passed to the PyTorch layers.
        target_vocab_length: Size of the output vocabulary.
        text_embedding_vectors: Required 2-D float tensor
            ``(vocab_size, d_model)`` of pretrained embeddings.
        max_seq_len: Longest sequence supported by the positional table.

    Raises:
        ValueError: if ``text_embedding_vectors`` is missing or has the wrong
            shape, or if ``d_model`` is not divisible by ``nhead``.
    """

    def __init__(self, d_model: int = 300, nhead: int = 8, num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, dim_feedforward: int = 1024, dropout: float = 0.1,
                 activation: str = "relu", target_vocab_length: int = 60000,
                 text_embedding_vectors=None, max_seq_len: int = 256) -> None:
        super().__init__()
        # The original declared this parameter without a default after defaulted
        # ones (a SyntaxError); it stays in the same position but is validated here.
        if text_embedding_vectors is None:
            raise ValueError("text_embedding_vectors is required")
        if text_embedding_vectors.dim() != 2 or text_embedding_vectors.size(1) != d_model:
            raise ValueError(
                "text_embedding_vectors must have shape (vocab_size, d_model)")
        if d_model % nhead != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by nhead ({nhead})")

        self.source_embedding = Embedder(text_embedding_vectors)
        self.pos_encoder = PositionalEncoder(d_model, max_seq_len, dropout)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        self.target_embedding = Embedder(text_embedding_vectors)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        self.out = nn.Linear(d_model, target_vocab_length)
        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Run the encoder and decoder and project onto the output vocabulary.

        Args:
            src: Source token ids, ``(batch, src_len)``.
            tgt: Target-side input token ids, ``(batch, tgt_len)``.
            src_mask: Optional ``(src_len, src_len)`` attention mask.
            tgt_mask: Optional ``(tgt_len, tgt_len)`` attention mask
                (normally the causal mask).
            memory_mask: Optional ``(tgt_len, src_len)`` attention mask.
            src_key_padding_mask: Optional ``(batch, src_len)`` mask, True at padding.
            tgt_key_padding_mask: Optional ``(batch, tgt_len)`` mask, True at padding.
            memory_key_padding_mask: Optional ``(batch, src_len)`` mask, True at padding.

        Returns:
            Unnormalised scores of shape ``(batch, tgt_len, target_vocab_length)``.

        Raises:
            RuntimeError: if ``src`` and ``tgt`` have different batch sizes.
        """
        # Batch is dimension 0 for batch-first token tensors.
        if src.size(0) != tgt.size(0):
            raise RuntimeError("the batch number of src and tgt must be equal")

        src = self.pos_encoder(self.source_embedding(src))   # (batch, src_len, d_model)
        tgt = self.pos_encoder(self.target_embedding(tgt))   # (batch, tgt_len, d_model)

        # The nn.Transformer* layers built above use the default sequence-first
        # layout, so transpose on the way in and back on the way out.
        memory = self.encoder(src.transpose(0, 1), mask=src_mask,
                              src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt.transpose(0, 1), memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        return self.out(output.transpose(0, 1))

    def _reset_parameters(self):
        r"""Xavier-initialise the trainable weight matrices of the model."""
        for p in self.parameters():
            # Skip frozen parameters so the pretrained embedding tables are
            # not overwritten with random values.
            if p.requires_grad and p.dim() > 1:
                nn.init.xavier_uniform_(p)

In [None]:
def _causal_mask(size, device):
    """Additive (size, size) mask: 0 on and below the diagonal, -inf above it."""
    return torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)


def _seq2seq_loss(model, src, trg):
    """Summed teacher-forced cross-entropy for one batch-first (src, trg) pair."""
    trg_input = trg[:, :-1]                       # decoder sees tokens 0..T-2
    targets = trg[:, 1:].contiguous().view(-1)    # and predicts tokens 1..T-1

    # Token id 0 is treated as padding (it is also the ignore_index below).
    # Key-padding masks are boolean (batch, length) tensors, True at padding.
    src_pad = src == 0
    trg_pad = trg_input == 0

    preds = model(src, trg_input,
                  tgt_mask=_causal_mask(trg_input.size(1), src.device),
                  src_key_padding_mask=src_pad,
                  tgt_key_padding_mask=trg_pad,
                  memory_key_padding_mask=src_pad)
    preds = preds.contiguous().view(-1, preds.size(-1))
    return F.cross_entropy(preds, targets, ignore_index=0, reduction='sum')


def train(train_iter, val_iter, model, optim, num_epochs,use_gpu=True):
    """Train `model`, validate after every epoch and checkpoint the best epoch.

    Training batches are rebuilt each epoch from the notebook globals
    ``encoder_train`` / ``decoder_train`` via ``train2batch`` with the global
    ``BATCH_NUM`` as batch size.
    NOTE(review): ``BATCH_NUM`` is not defined in the visible cells - confirm
    it is set before calling.

    Args:
        train_iter: Unused; kept so existing calls keep working.
        val_iter: Iterable of batches exposing ``.src`` and ``.trg`` tensors.
            They are transposed to batch-first here, so they are assumed to
            arrive sequence-first - TODO confirm against the iterator used.
        model: Called as ``model(src, tgt, tgt_mask=..., *_key_padding_mask=...)``
            with batch-first token ids; must return ``(batch, tgt_len, vocab)``.
            It is expected to already live on the device selected below.
        optim: Optimizer over ``model``'s parameters.
        num_epochs: Number of epochs to run.
        use_gpu: Use CUDA when it is available; otherwise fall back to CPU.

    Returns:
        (train_losses, valid_losses): per-epoch mean per-sample summed
        cross-entropy, averaged over batches.

    Side effects:
        Writes ``checkpoint_best_epoch.pt`` to the working directory whenever
        the validation loss improves, and prints progress plus a sample
        decoded by ``greeedy_decode_sentence``.
    """
    # One device for data and masks; never call .cuda() on a CPU-only machine.
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        # ---- training ----
        model.train()
        train_loss = 0.0
        input_batch, output_batch = train2batch(encoder_train, decoder_train, batch_size=BATCH_NUM)
        for src_rows, trg_rows in zip(input_batch, output_batch):
            src = torch.tensor(src_rows, device=device, dtype=torch.long)
            trg = torch.tensor(trg_rows, device=device, dtype=torch.long)

            optim.zero_grad()
            loss = _seq2seq_loss(model, src, trg)
            loss.backward()
            optim.step()
            # Normalise by the real batch size (the last batch may be short).
            train_loss += loss.item() / src.size(0)
        n_train_batches = max(len(input_batch), 1)

        # ---- validation ----
        model.eval()
        valid_loss = 0.0
        n_valid_batches = 0
        with torch.no_grad():
            for batch in val_iter:
                # Change to shape (bs, max_seq_len) / (bs, max_seq_len+1).
                src = batch.src.to(device).transpose(0, 1)
                trg = batch.trg.to(device).transpose(0, 1)
                loss = _seq2seq_loss(model, src, trg)
                # Same per-sample normalisation as training, so the two
                # logged numbers are comparable.
                valid_loss += loss.item() / src.size(0)
                n_valid_batches += 1
        n_valid_batches = max(n_valid_batches, 1)

        epoch_train_loss = train_loss / n_train_batches
        epoch_valid_loss = valid_loss / n_valid_batches

        # Log after each epoch
        print(f'''Epoch [{epoch+1}/{num_epochs}] complete. Train Loss: {epoch_train_loss:.3f}. Val Loss: {epoch_valid_loss:.3f}''')

        # Save the best model so far (compare before appending this epoch).
        if epoch_valid_loss < min(valid_losses, default=1e9):
            print("saving state dict")
            torch.save(model.state_dict(), "checkpoint_best_epoch.pt")

        train_losses.append(epoch_train_loss)
        valid_losses.append(epoch_valid_loss)

        # Check an example after each epoch.
        sentences = ["This is an example to check how our model is performing."]
        for sentence in sentences:
            print(f"Original Sentence: {sentence}")
            print(f"Translated Sentence: {greeedy_decode_sentence(model,sentence)}")
    return train_losses,valid_losses