# 範例 : Transformer decoder
***
- 實做 Transformer decoder 以更了解　Transformer 
- 應用 Transformer decoder 建立一個簡單的 ptt 貼文回應器 驗證 Transformer decoder 可以運行

# [教學目標]
- 了解如何實作 transformer decoder 和其結構
- 了解如何應用 transformer decoder 並證明 decoder 可以作用


# [範例重點]
- 觀察 TransformerDecoder 的建立
- 觀察 TransformerDecoderLayer 的建立
  - 使用 encoder 相同的 MultiHeadAttentionSubLayer
  - 使用 encoder 相同的 PosFeedForwardSubLayer
- 觀察如何使用 建立的 TransformerDecoder 
  - 使用 TransformerDecoder 做序列生成 SequenceGenerate
  - 如何使用 SequenceGenerate 模型 訓練一個 ptt 回應機

# [範例結構]
- TransformerDecoder 模型和 SequenceGenerate 實作
- ptt 資料準備
- 應用 SequenceGenerate 訓練 ptt answer machine

In [1]:
# 連接個人資料 讀取 PTT 訓練資料和儲存模型
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Thu May 13 07:45:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [43]:
import re
import csv
import time
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# 實做 TransformerDecoder
- 如果只用 Transfomer decoder 而已 不和 encoder　一起使用 
  - skip_encoder_attn 不需要和 encoder attention
  - enc_hidden　和 enc_mask　不用輸入

In [44]:
class TransformerDecoder(nn.Module):
    def __init__(self, hidden_dim, feedforward_dim, n_dec_layers, n_attn_heads, dropout, dec_voca_length, 
               max_pos_length , device , skip_encoder_attn = False):
        # hidden_dim = 256
        # feedforward_dim = 512
        # n_dec_layers = 3 
        # n_attn_heads = 8 
        # dropout = 0.1
        # dec_voca_length = len(cmn_vocab)
        super().__init__()
        self.device = device
        # 建立 decoder token embedding 
        self.dec_tok_embedding = nn.Embedding(dec_voca_length, hidden_dim)
        # 建立 decoder position embedding 
        self.dec_pos_embedding = nn.Embedding(max_pos_length, hidden_dim)

        # 建立 n_dec_layers 個 TransformerDecoderLayer 層
        self.transformer_decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden_dim,
                                                                                 feedforward_dim,
                                                                                 n_attn_heads, 
                                                                                 dropout, 
                                                                                 device, skip_encoder_attn) for _ in range(n_dec_layers)])
        # 輸出層 輸出 vocabulary 個長度
        self.full_conn_out = nn.Linear(hidden_dim, dec_voca_length)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

    def forward(self, dec_seq, enc_hidden , dec_mask, enc_mask):
        # dec_seq shape: [batch size, dec seq len]
        # enc_hidden shape: [batch size, enc seq len, hid dim] # optional 不需要時輸入空值
        # dec_mask shape: [batch size, dec seq len]
        # enc_mask shape: [batch size, enc seq len] # optional 不需要時輸入空值
                    
        batch_size = dec_seq.shape[0]
        dec_len = dec_seq.shape[1]
            
        pos = torch.arange(0, dec_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)                      
        # pos shape: [batch size, dec seq len]
                
        # 將 decoder token embedding 加上 decoder postion embedding
        dec_seq = self.dropout(self.dec_tok_embedding(dec_seq)  + self.dec_pos_embedding(pos))
        # dec_seq shape: [batch size, dec seq len, hid dim]
            
        for layer in self.transformer_decoder_layers:
            dec_seq, encoder_decoder_attention , decoder_self_attention = layer(dec_seq, enc_hidden, dec_mask, enc_mask)
        # dec_seq shape: [batch size, dec seq  len, hid dim]
        # attention shape: [batch size, n heads, trg len, src len]
            
        output = self.full_conn_out(dec_seq)  
        # output shape: [batch size, trg len, output dim]
                
        return output, encoder_decoder_attention , decoder_self_attention

# 實做 TransformerDecoderLayer
- 實作在transformerDecoder 使用多層 的TransformerDecoderLayer
- 如果只使用 decoder 則不用 encoder attention, --> skip_encoder_attn = True 

In [45]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, hidden_dim , feedforward_dim, n_attn_heads, dropout , device , skip_encoder_attn = False):
        # hidden_dim = 256
        # feedforward_dim = 512
        # n_attn_heads = 8
        # dropout = 0.1
        super().__init__()
        self.skip_encoder_attn = skip_encoder_attn 
        self.self_attention_sublayer = MultiHeadAttentionSubLayer(hidden_dim, n_attn_heads, dropout, device)
        self.self_attn_layernorm = nn.LayerNorm(hidden_dim)

        if not skip_encoder_attn:
            self.encoder_attention_sublayer = MultiHeadAttentionSubLayer(hidden_dim, n_attn_heads, dropout, device)
            self.encoder_attn_layernorm = nn.LayerNorm(hidden_dim)

        self.positionwise_feedforward = PosFeedForwardSubLayer(hidden_dim, feedforward_dim, dropout)
        self.feedforward_layernorm = nn.LayerNorm(hidden_dim)
        
        self.dropout = nn.Dropout(dropout)     

    def forward(self, dec_seq, enc_hidden , dec_mask, enc_mask):
        # dec_seq    shape: [batch size, dec seq len, hid dim]
        # enc_hidden shape: [batch size, enc seq len, hid dim] # optional 不需要時輸入空值
        # dec_mask   shape: [batch size, dec seq len]
        # enc_mask   shape: [batch size, enc seq len] # optional 不需要時輸入空值
        
        # self attention 子層
        _dec_seq, decoder_self_attention = self.self_attention_sublayer(dec_seq, dec_seq, dec_seq, dec_mask)
            
        # dropout, residual connection and layer norm　(Add and Norm)
        dec_seq = self.self_attn_layernorm(dec_seq + self.dropout(_dec_seq))
        # dec_seq  shape: [batch size, decode sequence len, hid dim]
                
        # 需不需要建立　encoder attention 層        
        if not self.skip_encoder_attn:
            # encoder attention
            _dec_seq, encoder_decoder_attention = self.encoder_attention_sublayer(dec_seq, enc_hidden, enc_hidden, enc_mask)
            # dropout, residual connection and layer norm
            dec_seq = self.encoder_attn_layernorm(dec_seq + self.dropout(_dec_seq))
        else:
            encoder_decoder_attention = None
                        
        # dec_seq shape: [batch size, decode sequence len, hid dim]
        # positionwise feedforward
        _dec_seq = self.positionwise_feedforward(dec_seq)
            
        # dropout, residual and layer norm (Add and Norm)
        dec_seq = self.feedforward_layernorm(dec_seq + self.dropout(_dec_seq))
        # dec_seq shape: [batch size, decode sequence len, hid dim]
        # attention shape: [batch size, n heads, decode sequence len, encode sequence len]
            
        return dec_seq, encoder_decoder_attention , decoder_self_attention

# 實做 MultiHeadAttentionSubLayer
- 實作 encoder and decoder 同時共用的 MultiHeadAttention SubLayer 


In [46]:
class MultiHeadAttentionSubLayer(nn.Module):
    def __init__(self, hidden_dim , n_attn_heads, dropout, device):
        # hidden_dim = 256
        # n_attn_heads = 8
        # dropout = 0.1
        super().__init__()
        # 確定設定的 hidden layer 維度可以被 attention head 整除
        assert hidden_dim % n_attn_heads ==0

        # hidden layer 維度
        self.hidden_dim = hidden_dim
        # num of heads
        self.n_attn_heads = n_attn_heads
        # 平均分到每個 multi-head 的 維度
        self.head_dim = hidden_dim // n_attn_heads
        # 就是在課程中提到的 Wq Wk Wv
        self.full_conn_q = nn.Linear(hidden_dim, hidden_dim)
        self.full_conn_k = nn.Linear(hidden_dim, hidden_dim)
        self.full_conn_v = nn.Linear(hidden_dim, hidden_dim)

        # 最後結果再過一層 線性轉換
        self.full_conn_o = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        # 根據維度大小調整 attention 值 以免維度太大 Q dot K 結果過大影響學習效率    
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query_input, key_input, value_input, mask = None):
        batch_size = query_input.shape[0]
        # query_input shape [batch size, query len, hid dim]
        # key_input   shape [batch size, key len, hid dim]
        # value_input shape [batch size, value len, hid dim]

        Q = self.full_conn_q(query_input)
        K = self.full_conn_k(key_input)
        V = self.full_conn_v(value_input)
        # Q shape [batch size, query len, hid dim]
        # K shape [batch size, key len, hid dim]
        # V shape [batch size, value len, hid dim]

        # 將 attention 切成多塊小的 attention 並將 attention 的 2 和 3 維度轉置 以達到將 attention head 提到前面而分開每個 attention head
        Q = Q.view(batch_size, self.n_attn_heads, -1, self.head_dim)
        K = K.view(batch_size, self.n_attn_heads, -1, self.head_dim)
        V = V.view(batch_size, self.n_attn_heads, -1, self.head_dim)
        
        # 調整過的 dot product attention, 由於之前分開了每個 attention head 
        # 所以現在只要把 Ｋ的最後兩個維度轉置 就可以 by attention head 求得 Q dot K
        scaled_dot_product_similarity = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        # scaled_dot_product_similarity 輸出 [batch size, n heads, query len, key len]

        if mask is not None:
            scaled_dot_product_similarity = scaled_dot_product_similarity.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(scaled_dot_product_similarity, dim = -1)
        # attention shape: [batch size, n heads, query len, key len]

        x = torch.matmul(self.dropout(attention), V)  
        # x shape: [batch size, n heads, query len, head dim]
            
        x = x.permute(0, 2, 1, 3).contiguous()
        # x shape: [batch size, query len, n heads, head dim]
            
        x = x.view(batch_size, -1, self.hidden_dim)  
        # x shape: [batch size, query len, hid dim]
            
        x = self.full_conn_o(x)  
        # x shape: [batch size, query len, hid dim]
            
        return x, attention

# 實做 PosFeedForwardSubLayer
- 實作 encoder and decoder 同時共用的 PosFeedForward SubLayer 

In [47]:
class PosFeedForwardSubLayer(nn.Module):
    def __init__(self, hidden_dim, ff_dim, dropout):
      super().__init__()
      self.full_conn_1 = nn.Linear(hidden_dim, ff_dim)
      self.full_conn_2 = nn.Linear(ff_dim,  hidden_dim)
      self.dropout = nn.Dropout(dropout)

    def forward(self, x):
      # x shape: [batch size, seq len, hid dim]   
      x = self.dropout(torch.relu(self.full_conn_1(x)))  
      # x shape: [batch size, seq len, pf dim]
          
      x = self.full_conn_2(x)
      # x shape: [batch size, seq len, hid dim]
          
      return x

# 實做 SequenceGenerate 
- 處理 序列生成工作
- 叫用 TransformerDecoderLayer
  - 不使用 encoder decoder attention 子層


In [48]:
class SequenceGenerate(nn.Module):
    def __init__(self, decoder, dec_pad_idx, device):
        super().__init__()
        self.decoder = decoder
        self.dec_pad_idx = dec_pad_idx
        self.device = device

    def make_dec_mask(self, dec_seq):    
      # dec_seq shape: [batch size, dec seq len]   
      dec_pad_mask = (dec_seq != self.dec_pad_idx).unsqueeze(1).unsqueeze(2)  
      # dec_pad_mask shape: [batch size, 1, 1, dec seq len]
      
      dec_len = dec_seq.shape[1]  
      dec_sub_mask = torch.tril(torch.ones((dec_len, dec_len), device = self.device)).bool()
      # dec_sub_mask shape: [dec seq len, dec seq len]
              
      dec_mask = dec_pad_mask & dec_sub_mask  
      # dec_mask shape: [batch size, 1, dec seq len, dec seq len]

      return dec_mask

    def forward(self, dec_seq):
      # dec_seq shape:　tensor [batch size, trg len]          
      dec_mask = self.make_dec_mask(dec_seq)
      # dec_mask shape: [batch size, 1, trg len, trg len]
          
      # 呼叫　transformer decoder 不需要輸入　encoder 相關資訊
      # 也不用接收　encoder decoder attnetion            
      output, _ , decoder_self_attention = self.decoder(dec_seq, None, dec_mask, None)
      # output shape: [batch size, trg len, output dim]
      # attention shape: [batch size, n heads, trg len, src len]
          
      return output, decoder_self_attention

# PTT 資料準備

- 我們的資料來源是 https://github.com/zake7749/Gossiping-Chinese-Corpus
- 詳情請看 github


In [49]:
data_dir = '/content/drive/My Drive/DL_NLP_marathon/data/Day26_transformer_decoder/'
with open(data_dir + 'Gossiping-QA-Dataset-2_0.csv' , encoding='utf-8') as fin:
    csvreader = csv.reader(fin)
    next(csvreader)
    ptt_qa_pairs = [row for row in csvreader]

print ("Sample: " , ptt_qa_pairs[1000])
print ("Total records:" , len(ptt_qa_pairs))

Sample:  ['有沒有跑車很常見高級轎車卻很少的八卦', '高雄常看到賓利啊…勞斯萊斯就真的只看過兩次']
Total records: 774114


# do training test split 如果已經分過了 可以跳過這段

In [None]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(ptt_qa_pairs, test_size=70000)
print ("training data:{} , val data: {} ".format(len(train),len(val)))
    
def write_csv(trn_data, file_path):
    with open(file_path ,'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer (fout)
        for itm in trn_data: 
            writer.writerow ([itm[0] + "|" + itm[1] , itm[0] + "|" + itm[1]])
            
file_path = data_dir + 'train.csv'
write_csv(train, file_path)

file_path = data_dir + 'val.csv'
write_csv(val, file_path)

# file_path = data_dir + 'test.csv'
# write_csv(test, file_path )

training data:704114 , val data: 70000 


# 資料處理

In [50]:
def qa_tokenizer(text):
  # 去掉非中文字元
  regex = re.compile(r'[^\u4e00-\u9fa5A-Za-z0-9]')
  text = text.replace("\\","").split("|")
  return [word for word in regex.sub(text[0],' ') if word.strip()] + ["<sep>"] + [word for word in regex.sub(text[1],' ') if word.strip()]

def trg_tokenizer(text):
  # 去掉非中文字元
  regex = re.compile(r'[^\u4e00-\u9fa5A-Za-z0-9]')
  text = text.replace("\\","").split("|")
  return ['<pad>' for word in regex.sub(text[0],' ') if word.strip()] + ["<pad>"] + [word for word in regex.sub(text[1],' ') if word.strip()] 

def build_vocab(filepath, tokenizer):
    counter = Counter()
    with open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>', '<sep>'])


train_filepath = data_dir + 'train.csv'
val_filepath = data_dir + 'val.csv'
cmn_vocab = build_vocab(train_filepath, qa_tokenizer)

In [51]:
def data_process(filepath):
    data = []
    with open(filepath, encoding='utf-8') as fin:
        csvreader = csv.reader(fin)
        for i, row in enumerate(csvreader):
            qa, trg = row[0], row[1]
            qa_tensor_ = torch.tensor([cmn_vocab[token] for token in qa_tokenizer(qa)], 
                                    dtype=torch.long)
            trg_tensor_ = torch.tensor([cmn_vocab[token] for token in trg_tokenizer(trg)],
                                    dtype=torch.long)
            data.append((qa_tensor_, trg_tensor_))
    return data

train_data = data_process(train_filepath)
val_data = data_process(val_filepath)
# test_data = data_process(test_filepaths)

# 我們要使用的資料格式
- 建立 vocabulary
- qa: ptt 上蒐集的問題和回答中間用 “sep”隔開
- trg: 我們的訓練目標只有回答的部分，其他的字元（包括“\<sep\>”）我們都以 “\<pad\>” 取代 , 計算 loss 的時候系統會忽略 ”pad“ token 註記的目標

In [52]:
print ("中文語料的字元表長度: " , len(cmn_vocab))
print ("Sample Q and A:", [cmn_vocab.itos[idx] for idx in val_data[2000][0]])
print ("Sample Target:",  [cmn_vocab.itos[idx] for idx in val_data[2000][1]])

中文語料的字元表長度:  7645
Sample Q and A: ['約', '克', '夏', '與', '約', '克', '羊', '哪', '個', '好', '養', '<sep>', '牠', '最', '近', '沒', '粗', '乃', '喔']
Sample Target: ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '牠', '最', '近', '沒', '粗', '乃', '喔']


# 準備 train_iterator and valid_iterator

In [53]:
BATCH_SIZE = 768

PAD_IDX = cmn_vocab['<pad>']
BOS_IDX = cmn_vocab['<bos>']
EOS_IDX = cmn_vocab['<eos>']

def generate_batch(data_batch):
    qa_batch, trg_batch = [], []
    for (qa_item, trg_item) in data_batch:
        qa_batch.append(torch.cat([torch.tensor([BOS_IDX]), qa_item, torch.tensor([EOS_IDX])], dim=0))
        trg_batch.append(torch.cat([torch.tensor([BOS_IDX]), trg_item, torch.tensor([EOS_IDX])], dim=0))
    
    sorted_idx = sorted(range(len(qa_batch)), key=lambda i: len(qa_batch[i]), reverse=True)
    qa_batch = [qa_batch[i] for i in sorted_idx]
    trg_batch = [trg_batch[i] for i in sorted_idx]
    qa_batch = pad_sequence(qa_batch, padding_value=PAD_IDX, batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX, batch_first=True)
    return qa_batch, trg_batch


train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
# test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
#                        shuffle=True, collate_fn=generate_batch)

# model training and evaluate function
- 注意 我們要輸入的字和目標要shift 一位 
- 也就是輸入 為', '什', '麼', '淘', '寶', '一', '堆', '賣', '家', '能', '國', '內', '免', '運', '?', '<sep>' --> 希望輸出 '有'
- 輸入 為', '什', '麼', '淘', '寶', '一', '堆', '賣', '家', '能', '國', '內', '免', '運', '?', '<sep>', '有' --> 希望輸出 '的'

In [55]:
def train(model, iterator, optimizer, criterion, clip):    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        qa = batch[0].to(device)
        trg = batch[1].to(device)
        # qa shape: [batch size, qa len]
        # trg shape: [batch size, trg len]
        
        optimizer.zero_grad()
        output, _  = model(qa[:, :-1])
        # output shape: [batch size, trg len - 1, output dim]

        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        # output shape: [batch size * (trg len - 1), output dim]
        # trg    shape: [batch size * (trg len - 1)]
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

        if i % 1000 == 0: print ("Train Batch:" , i , "Loss:" , loss.item())

    return epoch_loss / len(iterator)

In [56]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            qa = batch[0].to(device)
            trg = batch[1].to(device)
            # qa shape: [batch size, qa len]
            # trg shape: [batch size, trg len]
            
            output, _  = model(qa[:, :-1])
            # output shape: [batch size, (trg len - 1), output dim]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            # output shape: [batch size * (trg len - 1), output dim]
            # trg    shape: [batch size * (trg len - 1)]
            
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# 實際建立模型
- 設定重要參數
  - 建立一個 hidden embedding 256，三層decoder layer，八個attention heads
  - position wise feedforward 中間層 512 dropout 0.1 learning rate: 0.0005
  - 最長句長 70
- 如果要保留訓練出來的模型，建議和 vocabulary 一起儲存

In [57]:
model_dir = '/content/drive/My Drive/DL_NLP_marathon/model/Day26_transformer/'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

VOC_SIZE = len(cmn_vocab)
MAX_SENT_LENGTH = 70
HID_DIM = 256
DEC_LAYERS = 3
DEC_HEADS = 8
DEC_FF_DIM = 512
DEC_DROPOUT = 0.1
LEARNING_RATE = 0.0005
CMN_PAD_IDX = cmn_vocab['<pad>']

dec = TransformerDecoder(HID_DIM, 
                         DEC_FF_DIM,
                         DEC_LAYERS, 
                         DEC_HEADS,  
                         DEC_DROPOUT,
                         VOC_SIZE, 
                         MAX_SENT_LENGTH,
                         device, skip_encoder_attn=True)

# TransformerSequenceGenerate
model = SequenceGenerate(dec, CMN_PAD_IDX, device).to(device)
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = CMN_PAD_IDX)

In [58]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
model.apply(initialize_weights);

# 實際訓練
- Ｔ4 大約 四分半一個 epoch
- 訓練十個 epoch 就有一定的成績了
- 如果沒時間訓練 也可以下載我們訓練好的權重

In [59]:
N_EPOCHS = 100
CLIP = 1

best_valid_loss = 9999999

for epoch in range(N_EPOCHS):    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()
    
    # epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    torch.save(model.state_dict(), model_dir + 'model-ptt-{}.pt'.format(epoch))
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_dir + 'model-ptt-best.pt')

    print ("Epoch {} training time: {:.2f} sec Training Loss: {:.3f} , Valiation Loss: {:.3f}".format(epoch, 
                                                                                                      end_time - start_time, 
                                                                                                      train_loss , 
                                                                                                      valid_loss))

Train Batch: 0 Loss: 8.970396995544434
Epoch 0 training time: 370.28 sec Training Loss: 4.121 , Valiation Loss: 1.920
Train Batch: 0 Loss: 1.7315258979797363
Epoch 1 training time: 368.00 sec Training Loss: 1.442 , Valiation Loss: 1.098
Train Batch: 0 Loss: 1.1887110471725464
Epoch 2 training time: 368.64 sec Training Loss: 1.034 , Valiation Loss: 0.633
Train Batch: 0 Loss: 0.790983259677887
Epoch 3 training time: 372.20 sec Training Loss: 0.580 , Valiation Loss: 0.330
Train Batch: 0 Loss: 0.30305853486061096


KeyboardInterrupt: ignored

In [60]:
!nvidia-smi

Thu May 13 08:55:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    34W /  70W |  14656MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# 如果要保留訓練出來的模型，建議和 vocabulary 一起儲存

In [61]:
model_dir = '/content/drive/My Drive/DL_NLP_marathon/model/Day26_transformer/'
torch.save(cmn_vocab, model_dir + 'vocab.pt')

# 讀取訓練最佳結果
-- 如果下載我們的訓練結果 別忘了讀取 vocabulary

In [62]:
# 保留讀取之前儲存的 vocabulary
cmn_vocab = torch.load(model_dir + 'vocab.pt')

model.load_state_dict(torch.load(model_dir + 'model-ptt-best.pt'))
# model.load_state_dict(torch.load(model_dir + 'model-ptt-1.pt'))
test_loss = evaluate(model, valid_iter, criterion)

print(f'| Test Loss: {test_loss:.3f}')

| Test Loss: 0.250


# 使用訓練結果產生回答
- 用模型每一步最佳猜測產生回答

In [63]:
def simple_answer_ptt_question(sentence, qa_vocab, model, device, max_len = 50):   
    model.eval()
        
    tokens = [token.lower() for token in sentence]
    tokens = ['<bos>'] + tokens + ['<sep>']
    qa_indexes = [qa_vocab[token] for token in tokens]
    # qa_tensor = torch.LongTensor(qa_indexes).unsqueeze(0).to(device)
    
    for i in range(max_len):
        qa_tensor = torch.LongTensor(qa_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            dec_qa, decoder_self_attention = model(qa_tensor)
        
        pred_token = dec_qa.argmax(2)[:,-1].item()
        qa_indexes.append(pred_token)

        if pred_token == qa_vocab["<eos>"]:
            break
    
    # answer = "".join([qa_vocab.itos[i] for i in qa_indexes])
    # print(qa_indexes)
    # qa_tokens = [qa_vocab.itos[i] for i in qa_indexes]
    # answer = "".join(qa_tokens)
    qa_tokens = [qa_vocab.itos[i] for i in qa_indexes]
    answer = "".join(qa_tokens)
    # answer = "".join(qa_tokens[qa_tokens.index("<sep>")+1:-1])
            
    return answer,  decoder_self_attention

# Fun Time
-- 自己上 ptt 找新的標題來玩吧

In [68]:
question = "日月光是找老婆的好地方嗎"
# question = '長這麼大，做過最壞的事是什麼？'
# question = '看到前女友生小孩是什麼感覺'
# question = '把中國人惹翻了會怎麼樣嗎？'
# question = '泰國人民為何不推翻王室?'
qa_result, _ = simple_answer_ptt_question(question, cmn_vocab, model, device, max_len = 50)

print(qa_result)

<bos>日月光是找老婆的好地方嗎<sep>高雄，嗎.高高興趣的比較多的比較有多的<eos>


In [40]:
_.size()

torch.Size([1, 8, 17, 17])