# 参考文献

[Visualizing Bert Embeddings](https://krishansubudhi.github.io/deeplearning/2020/08/27/bert-embeddings-visualization.html)

In [1]:
import torchtext
print(torchtext.__version__)

0.6.0


In [2]:
print('hello world')

hello world


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator

import numpy as np

import random
import math
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import torchtext
import torch
from torchtext import data
from torchtext import datasets
import pandas as pd

In [5]:
from transformers import BertJapaneseTokenizer, BertForPreTraining
tok = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…




In [6]:
def tokenizer(text):
  return tok.tokenize(text)

In [7]:
sent = "今日は雨だけど、明日は晴れそう。今日は全然勝てなかった"

In [8]:
tokenizer(sent)

['今日',
 'は',
 '雨',
 'だ',
 'けど',
 '、',
 '明日',
 'は',
 '晴れ',
 'そう',
 '。',
 '今日',
 'は',
 '全',
 '##然',
 '勝て',
 'なかっ',
 'た']

In [9]:
SRC = data.Field(sequential=True, tokenize = tokenizer, init_token='<sos>', eos_token='<eos>', lower = True)
TRG = data.Field(sequential=True, tokenize = tokenizer, init_token='<sos>', eos_token='<eos>', lower = True)

In [13]:
train, val, test = data.TabularDataset.splits(
        path="./", train='train.tsv',
        validation='val.tsv', test='test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', TRG)])

In [14]:
SRC.build_vocab(train)
TRG.build_vocab(train)

In [16]:
bert_model = BertForPreTraining.from_pretrained(
    "cl-tohoku/bert-base-japanese", # 日本語Pre trainedモデルの指定
    num_labels = 2, # ラベル数（今回はBinayなので2、数値を増やせばマルチラベルも対応可）
    output_attentions = False, # アテンションベクトルを出力するか
    output_hidden_states = True, # 隠れ層を出力するか
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




Some weights of BertForPreTraining were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
bert_model.resize_token_embeddings(len(SRC.vocab))

Embedding(3559, 768)

In [18]:
bert_model.get_input_embeddings()

Embedding(3559, 768)

In [19]:
bert_model.get_output_embeddings()

Linear(in_features=768, out_features=3559, bias=True)

In [20]:
train_batch_size = 50
test_batch_size = 10
eval_batch_size = 2
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), sort = False,  batch_sizes = (train_batch_size,eval_batch_size, test_batch_size), device= device)

In [21]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.hid_dim = hid_dim
    self.n_layers = n_layers
    #self.embedding = nn.Embedding(input_dim, emb_dim)
    self.embedding = bert_model.get_input_embeddings()
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return hidden, cell

In [22]:
class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    #self.embedding = nn.Embedding(output_dim, emb_dim)
    self.embedding = bert_model.get_input_embeddings()
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

    #self.fc_out = nn.Linear(hid_dim, output_dim)
    self.fc_out = bert_model.get_output_embeddings()
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, input, hidden, cell):
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    #print(output.squeeze(0).size())
    prediction = self.fc_out(output.squeeze(0))

    return prediction, hidden, cell

In [23]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg):
    batch_size = trg.shape[1]
    trg_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim

    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    hidden, cell = self.encoder(src)

    input = trg[0,:]

    for t in range(1, trg_len):
      output, hidden, cell = self.decoder(input, hidden, cell)
      
      outputs[t] = output
      top1 = output.argmax(1)
    
    return outputs

In [24]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = 3559
ENC_EMB_DIM = 768
DEC_EMB_DIM = 768
ENC_HID_DIM = 768
DEC_HID_DIM = 768
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

  "num_layers={}".format(dropout, num_layers))


In [25]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3559, 768)
    (rnn): LSTM(768, 768, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(3559, 768)
    (rnn): LSTM(768, 768, dropout=0.5)
    (fc_out): Linear(in_features=768, out_features=3559, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [26]:
optimizer = optim.Adam(model.parameters())

In [27]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [28]:
def train(model, iterator, optimizer, criterion, clip):
  model.train()

  epoch_loss = 0

  for i, batch in enumerate(iterator):
    #print(i)
    src = batch.SRC
    trg = batch.TRG
    optimizer.zero_grad()

    output = model(src, trg)

    output_dim = output.shape[-1]
    output = output[1:].view(-1, output_dim)
    trg = trg[1:].view(-1)

    loss = criterion(output, trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm(model.parameters(), clip)
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [29]:
def evaluate(model, iterator, criterion):
  model.eval()

  epoch_loss = 0

  with torch.no_grad():

    for i, batch in enumerate(iterator):

      src = batch.SRC
      trg = batch.TRG

      output = model(src, trg)

      output_dim = output.shape[-1]

      output = output[1:].view(-1, output_dim)
      trg = trg[1:].view(-1)

      loss = criterion(output, trg)
      epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [30]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins*60))
  return elapsed_mins, elapsed_secs

In [32]:
epochs = 50
clip = 1
#def  model_train(epochs, clip):
best_valid_loss = float('inf')

for epoch in range(epochs):
    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, clip)
    valid_loss = evaluate(model, val_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './model/bert_embedded_seq2seq.pth')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



Epoch: 01 | Time: 0m 55s
	Train Loss: 3.601 | Train PPL:  36.629
	 Val. Loss: 3.549 |  Val. PPL:  34.768
Epoch: 02 | Time: 0m 55s
	Train Loss: 3.584 | Train PPL:  36.003
	 Val. Loss: 3.573 |  Val. PPL:  35.625
Epoch: 03 | Time: 0m 55s
	Train Loss: 3.576 | Train PPL:  35.729
	 Val. Loss: 3.554 |  Val. PPL:  34.950
Epoch: 04 | Time: 0m 55s
	Train Loss: 3.572 | Train PPL:  35.572
	 Val. Loss: 3.564 |  Val. PPL:  35.314


KeyboardInterrupt: 

In [None]:
#model.apply(init_weights)
#model_train(10,1)