In [None]:
import sys
# Tell the interpreter not to write .pyc bytecode files when modules are
# imported. This is set in the first cell, i.e. before the imports in the
# following cells run.
sys.dont_write_bytecode = True

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from scr.roberta import Model
from scr.tokenizer import Tokenizer
from scr.datasets import Roberta_datasets

In [None]:
def torch_fix_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then puts cuDNN and PyTorch into
    deterministic mode.

    Args:
        seed (int): Seed applied to every generator. Defaults to 42.
    """
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels between runs, so turn it off.
    torch.backends.cudnn.benchmark = False
    # This is a function and must be *called*; assigning ``True`` to it would
    # only overwrite the function without enabling anything.
    # ``warn_only=True`` makes ops that lack a deterministic implementation
    # emit a warning instead of raising, so training is not interrupted.
    torch.use_deterministic_algorithms(True, warn_only=True)


torch_fix_seed(42)

In [None]:
# --- Tokenizer and the values derived from it -------------------------------
tokenizer = Tokenizer('tokenizer/tokenizer.model')
vocab_size = tokenizer.vocab_size()
# Ids returned by the tokenizer for the '[PAD]' and '[MASK]' labels.
PAD = tokenizer.label_2_id('[PAD]')
MASK = tokenizer.label_2_id('[MASK]')

# --- Runtime and training hyperparameters -----------------------------------
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
max_len = 128
batch_size = 16
# NOTE(review): the name is misspelled ("lerning"); it is kept unchanged here
# because code outside this cell may refer to it by this spelling.
lerning_rate = 5e-5
num_epoch = 10

# --- Model ------------------------------------------------------------------
# Keyword arguments forwarded verbatim to the Model constructor.
config = dict(
    vocab_size=vocab_size,
    max_len=max_len,
    num_layers=4,
    num_attn_heads=4,
    hidden_dim=768,
    dropout=0.1,
)
model = Model(**config)
# Bare last expression so the notebook displays the model's repr.
model

In [None]:
# Load the corpus into a list with one entry per line of the text file.
with open('datasets/text.txt', 'r', encoding='utf-8') as f:
    # Skip blank lines: a trailing newline (or an empty line inside the file)
    # would otherwise produce an empty string that ends up as a sample.
    datas = [line for line in f.read().split('\n') if line.strip()]
# Preview the first few entries.
datas[:5]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the lines for evaluation. A fixed random_state makes the
# split independent of the global NumPy RNG state, so re-running this cell
# (or running cells out of order) always yields the same partition.
train, test = train_test_split(datas, test_size=0.1, random_state=42)
train_data = Roberta_datasets(train, tokenizer, max_len, PAD)
test_data = Roberta_datasets(test, tokenizer, max_len, PAD)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# evaluation batches identical from run to run.
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Per-vocabulary-id weight vector: 1.0 everywhere except the first 9 ids,
# which are set to 0.0.
# NOTE(review): presumably ids 0-8 are the tokenizer's special tokens and this
# tensor is used as a loss weight — confirm against the tokenizer/training code.
special_token_weight = torch.ones(vocab_size, device=device)
special_token_weight[:9] = 0.0