In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from tqdm.notebook import tqdm
from scr.roberta import Model
from scr.tokenizer import Tokenizer
from scr.datasets import Roberta_datasets
from scr.utils import get_trainable_parameters

In [2]:
def torch_fix_seed(seed=42):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True

torch_fix_seed(42)

In [3]:
tokenizer = Tokenizer('tokenizer/tokenizer.model')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
max_len = 128
vocab_size = tokenizer.vocab_size()
config = {
    'vocab_size': vocab_size,
    'max_len': max_len,
    'num_layers': 6,
    'num_attn_heads': 4,
    'hidden_dim': 384,
    'dropout': 0.1
}
batch_size = 64
learning_rate = 5e-4
num_epoch = 5
PAD = tokenizer.label_2_id('[PAD]')
MASK = tokenizer.label_2_id('[MASK]')
model = Model(**config)
model

Model(
  (embedding): Embeddings(
    (word_embs): Embedding(1500, 384, padding_idx=3)
    (pos_enc): Embedding(129, 384)
    (seg_emb): Embedding(2, 384)
    (layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (roberta): Encoder(
    (layer): ModuleList(
      (0-5): 6 x Layer(
        (attention): Attention(
          (query): Linear(in_features=384, out_features=384, bias=True)
          (key): Linear(in_features=384, out_features=384, bias=True)
          (value): Linear(in_features=384, out_features=384, bias=True)
          (MHA): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
          )
          (layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): Intermediate(
          (linear1): Linear(in_features=384, out_features=1536, bias=True)
          (act_fn): GELU(approximate='none')
          

In [4]:
with open('datasets/text.txt', 'r', encoding='utf-8')as f:
    datas = f.read()
datas = datas.split('\n')
print(datas[:5])
datas = tokenizer.encode_texts(datas)
print(datas[:5])

['[CLS]メロスは激怒した。[EOS]', '[CLS]必ず、かの邪智暴虐の王を除かなければならぬと決意した。[EOS]', '[CLS]メロスには政治がわからぬ。[EOS]', '[CLS]メロスは、村の牧人である。[EOS]', '[CLS]笛を吹き、羊と遊んで暮して来た。[EOS]']
[[4, 16, 652, 18, 739, 8], [4, 624, 738, 480, 680, 897, 1196, 740, 794, 746, 0, 471, 752, 649, 18, 739, 8], [4, 462, 1363, 1394, 759, 454, 739, 8], [4, 16, 738, 373, 380, 57, 739, 8], [4, 1445, 746, 1267, 765, 738, 956, 752, 0, 100, 949, 446, 739, 8]]


In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(datas, test_size=0.1, random_state=42)
train_data = Roberta_datasets(train, tokenizer, max_len,vocab_size, PAD, MASK)
test_data = Roberta_datasets(test, tokenizer, max_len, vocab_size, PAD, MASK)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)

special_token_weight = torch.tensor([1.0]*vocab_size)
special_token_weight[:9] = 0.0
special_token_weight = special_token_weight.to(device)

optimizer = torch.optim.AdamW(get_trainable_parameters(model), lr=learning_rate, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0005)
criterion = nn.NLLLoss(weight=special_token_weight)
log_softmax = nn.LogSoftmax(dim=-1)

In [6]:
for e in tqdm(range(num_epoch)):
    train_loss = 0
    val_loss = 0
    model.train()
    for x, pos, token_ids, mask, mask_labels in train_dataloader:
        output = model.forward(x, pos, token_ids, mask)
        output = log_softmax(output)
        loss = criterion(output.transpose(1, 2), mask_labels)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    with torch.no_grad():
        for x, pos, token_ids, mask, mask_labels in test_dataloader:
            output = model.forward(x, pos, token_ids, mask)
            output = log_softmax(output)
            loss = criterion(output.transpose(1, 2), mask_labels)
            val_loss += loss.item()
    print(f'epoch : {e+1}/{num_epoch}, train_loss : {train_loss/len(train_dataloader)}, val_loss : {val_loss/len(test_dataloader)}')
torch.save(model.state_dict(), 'model/roberta_pretrained.pth')

  0%|          | 0/5 [00:00<?, ?it/s]

epoch : 1/5, train_loss : 6.839238779885428, val_loss : 6.441524982452393
epoch : 2/5, train_loss : 6.416412217276437, val_loss : 5.995678901672363
epoch : 3/5, train_loss : 6.395083972385952, val_loss : 6.229027271270752
epoch : 4/5, train_loss : 6.236432347978864, val_loss : 6.7290449142456055
epoch : 5/5, train_loss : 6.3231392587934225, val_loss : 6.299596309661865


In [7]:
from scr.utils import make_input
model.eval()
text = '[CLS]メロスは[MASK]した。[EOS]'
print(tokenizer.text_2_token(text))
x, pos, token_ids, mask, mask_idx = make_input(text, tokenizer)
with torch.no_grad():
    out = model.forward(x, pos, token_ids, mask)
print('[MASK] is ')
for i, t in enumerate(out[0][mask_idx].topk(10)[1]):
    pred = tokenizer.id_2_label(t.item())
    print(f'top {i+1} : {pred}')

['[CLS]', 'メロスは', '[MASK]', 'した', '。', '[EOS]']
[MASK] is 
top 1 : 、
top 2 : 。
top 3 : 「
top 4 : の
top 5 : を
top 6 : に
top 7 : 。」
top 8 : は
top 9 : も
top 10 : メロスは
