In [113]:
from google.colab import drive
drive.mount('/content/dirve')
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Drive already mounted at /content/dirve; to attempt to forcibly remount, call drive.mount("/content/dirve", force_remount=True).


In [114]:
!pip install transformers fugashi mecab-python3 ipadic



In [115]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.linear = nn.Linear(ninp ,32000)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = bert_model.get_input_embeddings()
        self.ninp = ninp
        # self.decoder = bert_model.get_input_embeddings()
        self.decoder = nn.Embedding(ntoken, ninp)
        decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers, norm=self.linear)
        #self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        #self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, trg):
        trg_mask = model.generate_square_subsequent_mask(trg.size()[0]).to(device)
        # 分散表現に変換
        src = self.encoder(src)
        trg = self.decoder(trg)
        # 位置情報を入れる
        src = self.pos_encoder(src)
        trg = self.pos_encoder(trg)
        # モデルにデータを入れる
        output = self.transformer_encoder(src)
        # デコーダにエンコーダの出力を入れる（ここがおかしい）
        output = self.transformer_decoder(trg, output,tgt_mask = trg_mask)
        return output

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # print(x.size())
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [116]:
from torchtext import data
from torchtext import datasets
from transformers import BertJapaneseTokenizer, BertForPreTraining
import random 
import numpy as np

In [117]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [118]:
tok = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

In [119]:
def tokenizer(text):
  return tok.tokenize(text)

In [120]:
SRC = data.Field(sequential=True, tokenize = tokenizer, init_token='<sos>', eos_token='<eos>', lower = True)

In [121]:
# 重複のないデータセットか重複のあるデータセットを選ぶ
# flagがTrueの時重複のないデータを返す
def choose_dataset(flag = False):
  if flag:
    return data.TabularDataset.splits(
        path="/content/dirve/My Drive/Colab Notebooks/data/", train='one_train.tsv',
        validation='one_val.tsv', test='one_test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', SRC)]), "/content/dirve/My Drive/Colab Notebooks/csv/one_result_bert_embedded_transformer.csv"
  else:
    return data.TabularDataset.splits(
        path="/content/dirve/My Drive/Colab Notebooks/data/", train='train.tsv',
        validation='val.tsv', test='test.tsv', format='tsv',
        fields=[('SRC', SRC), ('TRG', SRC)]), "/content/dirve/My Drive/Colab Notebooks/csv/result_bert_embedded_transformer.csv"

In [122]:
train, val, test, filename = choose_dataset(False)
SRC.build_vocab(train)
bert_model = BertForPreTraining.from_pretrained(
    "cl-tohoku/bert-base-japanese", # 日本語Pre trainedモデルの指定
    num_labels = 2, # ラベル数（今回はBinayなので2、数値を増やせばマルチラベルも対応可）
    output_attentions = False, # アテンションベクトルを出力するか
    output_hidden_states = True, # 隠れ層を出力するか
)

Some weights of BertForPreTraining were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [123]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_batch_size = 100
test_batch_size = 100
eval_batch_size = 100
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), sort = False,  batch_sizes = (train_batch_size,eval_batch_size, test_batch_size), device= device)

In [124]:
ntokens = 3996 # the size of vocabulary
emsize = 768 # embedding dimension
nhid = 768 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.3 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [125]:
model

TransformerModel(
  (linear): Linear(in_features=768, out_features=32000, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=768, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=768, out_features

In [126]:
criterion = nn.CrossEntropyLoss(ignore_index=SRC.vocab.stoi["<pad>"])
lr = 5 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train(iterator):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    for i, batch in enumerate(iterator):
        src = batch.SRC
        trg = batch.TRG
        optimizer.zero_grad()
        output = model(src, trg)
        #print("output from model size:", output.size())
        #print("targets of unsqueezed size:", trg.size())
        output = output[:].view(-1, output.shape[-1])
        trg = trg[:].view(-1)
        #print("output size:", output.size())
        #print("targets size:", trg.size())
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
      for i, batch in enumerate(data_source):
        data = batch.SRC
        targets = batch.TRG
        #src_mask = model.generate_square_subsequent_mask(data.shape[0]).to(device)
        output = eval_model(data, targets)
        output_flat = output[:].view(-1, output.shape[-1])
        targets = targets[:].view(-1)
        total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [127]:
best_val_loss = float("inf")
epochs = 20 # The number of epochs
best_model = None
model.init_weights()

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_iter)
    val_loss = evaluate(model, val_iter)
    print('-' * 89)
    print('| epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          .format(epoch, (time.time() - epoch_start_time), val_loss))

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

-----------------------------------------------------------------------------------------
| epoch   1 | time: 49.79s | valid loss 339.36 | 
-----------------------------------------------------------------------------------------
| epoch   2 | time: 47.83s | valid loss 368.12 | 
-----------------------------------------------------------------------------------------
| epoch   3 | time: 48.39s | valid loss 343.00 | 
-----------------------------------------------------------------------------------------
| epoch   4 | time: 47.97s | valid loss 223.44 | 
-----------------------------------------------------------------------------------------
| epoch   5 | time: 48.29s | valid loss 219.59 | 
-----------------------------------------------------------------------------------------
| epoch   6 | time: 48.29s | valid loss 239.57 | 
-----------------------------------------------------------------------------------------
| epoch   7 | time: 47.94s | valid loss 149.42 | 
--------------------

In [128]:
test_loss = evaluate(best_model, test_iter)
print('=' * 89)
print('| End of training | test loss {:5.2f} |'.format(
    test_loss))
print('=' * 89)

| End of training | test loss 61.75 |


In [129]:
torch.save(best_model.state_dict(), "/content/dirve/My Drive/Colab Notebooks/model/bert_embedded_transformer.pth")

In [None]:
model.state_dict(torch.load("/content/dirve/My Drive/Colab Notebooks/model/bert_embedded_transformer.pth"))

In [131]:
"""
def gen_sentence(sentence, src_field, trg_field, model, max_len = 50):
  model.eval()

  tokens = [src_field.init_token] + tokenizer(sentence) + [src_field.eos_token]
  
  src_index = [src_field.vocab.stoi[i] for i in tokens]
  src_tensor = torch.LongTensor(src_index).unsqueeze(0).to(device)
  # src_len = torch.LongTensor([len(src_index)]).to(device)
  src_tensor = model.encoder(src_tensor)
  src_tensor = mode.pos_encoder(src_tensor)
  with torch.no_grad():
    enc_output = model.transformer_encoder(src_tensor)
  
  trg_index = [trg_field.vocab.stoi[trg_field.init_token]]
  for i in range(max_len):
    trg_tensor = torch.LongTensor(trg_index[-1]).unsqueeze(2).to(device)
    trg_tensor = model.encoder(trg_tensor)
    trg_tensor = model.pos_encoder(trg_tensor)
    with torch.no_grad():
      output = model.transformer_deocder(trg_tensor, enc_output)
    
    pred_token = output.argmax(1).item()
    trg_index.append(pred_token)
    if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
      break

  trg_tokens = [trg_field.vocab.itos[i] for i in trg_index]
  return trg_tokens

  def gen_sentence_list(path): 
  col, pred = [], []
  input, output = [], []
  with open(path, mode = 'r') as f:
    for file_list in f:
      col.append(file_list.split('\t'))
  for i in col:
    input.append(i[0])
    output.append(i[1])

  for sentence in input:
    pred.append(gen_sentence(sentence, SRC, SRC, model))
  return input, output, pred

path = "/content/dirve/My Drive/Colab Notebooks/data/test.tsv"
test_input, test_output, test_pred = gen_sentence_list(path)
path = "/content/dirve/My Drive/Colab Notebooks/data/train.tsv"
train_input, train_output, train_pred = gen_sentence_list(path)
path = "/content/dirve/My Drive/Colab Notebooks/data/val.tsv"
val_input, val_output, val_pred = gen_sentence_list(path)
  """

'\ndef gen_sentence(sentence, src_field, trg_field, model, max_len = 50):\n  model.eval()\n\n  tokens = [src_field.init_token] + tokenizer(sentence) + [src_field.eos_token]\n  \n  src_index = [src_field.vocab.stoi[i] for i in tokens]\n  src_tensor = torch.LongTensor(src_index).unsqueeze(0).to(device)\n  # src_len = torch.LongTensor([len(src_index)]).to(device)\n  src_tensor = model.encoder(src_tensor)\n  src_tensor = mode.pos=encoder(src_tensor)\n  with torch.no_grad():\n    enc_output = model.transformer_encoder(src_tensor)\n  \n  trg_index = [trg_field.vocab.stoi[trg_field.init_token]]\n  for i in range(max_len):\n    trg_tensor = torch.LongTensor([trg_index[-1]]).to(device)\n    with torch.no_grad():\n      output = model.transformer_deocder(trg_tensor, enc_output)\n    \n    pred_token = output.argmax(1).item()\n    trg_index.append(pred_token)\n    if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:\n      break\n\n  trg_tokens = [trg_field.vocab.itos[i] for i in trg_index

In [132]:
def gen_sentence(sentence, src_field, trg_field, model, batch_size):
  model.eval()
  in_str, out_str, pred, tmp = [], [], [], []
  length = len(sentence)

  with torch.no_grad():
    for _, batch in enumerate(sentence):
      src = batch.SRC
      trg = batch.TRG
      output = model(src, trg)
          
      for j in range(min(length, batch_size)):
        _, topi = output.data.topk(1)
        _, topi_s = output.data.topk(2) 
        for k in range(topi.size()[1]):
          if topi[:, k][0] == trg_field.vocab.stoi["<eos>"]:
            for m in range(topi_s.size()[0]):
              for l in range(topi_s.size()[1]):
                topi[m][l][0] = topi_s[m][l][1]
          for i in range(topi.size()[0]):
            if trg_field.vocab.itos[topi[:, k][i]] == "<eos>":
              break
            tmp.append(trg_field.vocab.itos[topi[:, k][i]])
          pred.append(tmp)
          tmp = []
        #print(src.size())
        in_str.append([src_field.vocab.itos[i.item()] for i in src[:,j] if src_field.vocab.itos[i.item()] != "<eos>"])
        out_str.append([trg_field.vocab.itos[i.item()] for i in trg[:,j] if trg_field.vocab.itos[i.item()] != "<eos>"])
      
  return in_str, out_str, pred

In [133]:
# 中間発表時にはテストデータは用いない
test_in, test_out, test_pred = [],[],[]
test_in, test_out, test_pred = gen_sentence(test_iter, SRC, SRC, best_model, test_batch_size)
val_in, val_out, val_pred = [],[],[]
val_in, val_out, val_pred = gen_sentence(val_iter, SRC, SRC, model, eval_batch_size)
train_in, train_out, train_pred = [],[],[]
train_in, train_out, train_pred = gen_sentence(train_iter, SRC, SRC, model, train_batch_size)

In [134]:
import pandas as pd

In [135]:
def convert_list_to_df(in_list, out_list, pred_list):
  row = []
  for i in range(len(in_list)):
    batch_input = in_list[i]
    batch_output = out_list[i]
    batch_pred = pred_list[i]
    input = [j for j in batch_input if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    output = [j for j in batch_output if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    predict = [j for j in batch_pred if j != "<pad>" and j != "<sos>" and j != "<eos>" and j != "<unk>"]
    input_str = "".join(input).replace("#", "")
    output_str ="".join(output).replace("#", "")
    predict_str = "".join(predict).replace("#", "")
    row.append([input_str, output_str, predict_str])

  df = pd.DataFrame(row, columns=["input","answer","predict"])
  df = df.sort_values('input')
  return df

In [136]:
train_df = convert_list_to_df(train_in, train_out, train_pred)
val_df = convert_list_to_df(val_in, val_out, val_pred)
test_df = convert_list_to_df(test_in, test_out, test_pred)

In [137]:
df_s = pd.concat([train_df, test_df]).sort_values('input')

In [138]:
df_s.head(10)

Unnamed: 0,input,answer,predict
18359,11時半頃,十一時半に,あん
14143,11時半頃,いらっしゃったうーん,うーん
3813,15分ぐらいまで,十五分,あであー
19914,15分ぐらいまで,あっ,あー
19074,15分ぐらいまで駅に,15分はい,あーそうなんですね
1702,15分ぐらいまで駅に,あはい,あはははそうなんですね
6629,15分ぐらいまで駅に着くまでにかかりました,はいだったんですねありがとう,あーうーんうん
4196,15分ぐらいまで駅に着くまでにかかりました,ええあそうですねそうだったんですね,はーそうなんですか
8474,15分ぐらいまで駅に着くまでにかかりました,あそうですか,はーしのしのなんですね
15389,15分ぐらいまで駅に着くまでにかかりました,はいうーんうん,でですねしのねーうーん


In [139]:
df_s.to_csv(filename)