In [2]:
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizerFast, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from pathlib import Path
import wandb

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmacosta[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
TOKENIZER_SAVEDIR = Path('/home/macosta/ttmp/primus-tokenizer/')
LM_MODEL_SAVEDIR = Path('/home/macosta/ttmp/primus-lm-model/')
Path(LM_MODEL_SAVEDIR).mkdir(exist_ok=True)
PRIMUS_TXT_FILES = Path('/home/macosta/ttmp/primus-txt/')

In [4]:
[torch.cuda.device(i) for i in range(torch.cuda.device_count())]

[<torch.cuda.device at 0x7f93d5482bd0>, <torch.cuda.device at 0x7f93d5482590>]

In [5]:
torch.cuda.set_device(0)
print('Cuda available: ', torch.cuda.is_available())

Cuda available:  True


In [4]:
VOCAB_SIZE = 30000
MAX_LEN = 128
MASKING_PROPORTION = 0.15

In [5]:
config = RobertaConfig(
    vocab_size=VOCAB_SIZE,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [6]:
model = RobertaForMaskedLM(config=config)
print('Num parameters:', model.num_parameters())

Num parameters: 66586416


In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_SAVEDIR, max_len=MAX_LEN)

file /home/macosta/ttmp/primus-tokenizer/config.json not found


In [8]:
tokenizer.vocab

{'00000000001111111111111111111111111111111111111111111111111111111111111110011111111110000000000': 25256,
 '00000000001110000000000000011111111110000000001110000000000000001110000000000000001110000000011': 22467,
 '00111111111111111100000000001110000000000000001110000000000000001110000111111111101110000000000': 20345,
 '00000000001111111111111111101110000000000000001110000000000000001110001111111110001111111111100': 16339,
 '00000000001110000000000000001110000000000000001110000000000000001110000001111110001110111111100': 11696,
 '11111111111110000000000000001110000000000000011111100000000000001110000000000000001100000000000': 19388,
 '11110000001110000000000000001110000000000000001110000000000000001111111111100000111111111000000': 22923,
 '00000000001111111111111111001110000000000000011111111111111111101110000000000000001110000000000': 15968,
 '11111101111111100000000011111111111110000000001110000000000000001110000000000000001110000000000': 7909,
 '000000000011100000000000000011100111

In [10]:
class CustomDataset(Dataset):
    def __init__(self, src_files, tokenizer, max_length):
        self.examples = []
        for src_file in tqdm(src_files):
            words = src_file.read_text(encoding="utf-8").split()
            chunks = [' '.join(words[i:i+max_length]) for i in range(0, len(words), max_length)]
            for example in chunks:
                x = tokenizer.encode_plus(example, max_length=max_length, padding='max_length', truncation=True)
                assert(len(x.input_ids) == max_length)
                self.examples += [x.input_ids]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])

In [11]:
def create_train_test_datasets(tokenizer, max_length, fraction=1.0, test_size=0.1):
    src_files = list(Path(PRIMUS_TXT_FILES).glob("**/*.txt"))
    src_files = src_files[:int(len(src_files) * fraction)]
    split_index = int(len(src_files) * (1 - test_size))
    train_files = src_files[:split_index]
    test_files = src_files[split_index:]
    train_dataset = CustomDataset(train_files, tokenizer, max_length=max_length)
    test_dataset = CustomDataset(test_files, tokenizer, max_length=max_length)
    return train_dataset, test_dataset

In [12]:
train_dataset, test_dataset = create_train_test_datasets(tokenizer, MAX_LEN, fraction=1.0, test_size=0.01)

100%|█████████████████████████████████████████████████████████| 788/788 [00:08<00:00, 89.50it/s]
100%|███████████████████████████████████████████████████████████| 88/88 [00:00<00:00, 91.76it/s]


In [13]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=MASKING_PROPORTION
)

In [14]:
training_args = TrainingArguments(
    output_dir=LM_MODEL_SAVEDIR,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=10000,
    logging_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_total_limit=1,
    prediction_loss_only=False,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [15]:
ret = trainer.train()

***** Running training *****
  Num examples = 8596
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1350
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss
100,4.734,4.116951
200,3.9794,3.87865
300,3.7117,3.654954
400,3.5596,3.501044
500,3.385,3.27574


***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16


KeyboardInterrupt: 

In [None]:
trainer.save_model(LM_MODEL_SAVEDIR)