In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from pathlib import Path
import wandb
import os
import json

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmacosta[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
TOKENIZER_SAVEDIR = Path('/home/macosta/ttmp/primus-data/cropped/cropped-txt-tokenizer/')
LM_MODEL_SAVEDIR = Path('/home/macosta/ttmp/primus-models/gpt2-lm-columnwise/')
Path(LM_MODEL_SAVEDIR).mkdir(exist_ok=True)
TXT_FILES = Path('/home/macosta/ttmp/primus-data/cropped/cropped-txt/')

In [4]:
[torch.cuda.device(i) for i in range(torch.cuda.device_count())]

[<torch.cuda.device at 0x7f68f3ce0290>, <torch.cuda.device at 0x7f68f3ce0150>]

In [5]:
torch.cuda.set_device(0)
print('Cuda available: ', torch.cuda.is_available())

Cuda available:  True


In [6]:
torch.cuda.current_device()

0

In [7]:
VOCAB_SIZE = 30000
MAX_LEN = 1028
tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_SAVEDIR, max_len=MAX_LEN)

file /home/macosta/ttmp/primus-data/cropped/cropped-txt-tokenizer/config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
list(tokenizer.vocab.items())[:5]

[('0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011100000000000000011100000000011111111111111000000000000000000000000000000000000000000000',
  4383),
 ('0000000000000000000000000000000000000000000111000011111000000000000011100000000000000011100000000000000011100000000000000011100000000000000000000000000000000000000000000000000',
  22357),
 ('0000000000000000000000000000000000000000000000000011100000000000000011100000000000000011100000000000000011100000000000000011100001100000000000000000000000000000000000000000000',
  9341),
 ('0000000000000000000000000000000000000000000000000011100001111110000011100000000000000011100000000000000011100000000000000011100000000000000000000000000001111111110000000000000',
  23875),
 ('0000000000000000000000000000000000000000000000000011100000000000000011100000000000111111111100001111111111100000000000000011100000000000000011111111111111111111000000000000000',
  15702)]

In [9]:
tokenizer.unk_token = '<unk>'
tokenizer.bos_token = '<s>'
tokenizer.eos_token = '</s>'
tokenizer.add_special_tokens({'pad_token': '<pad>'})

0

In [10]:
ACTUAL_VOCAB_SIZE = len(tokenizer.vocab)

In [11]:
ACTUAL_VOCAB_SIZE

30001

In [12]:
config = GPT2Config(
    vocab_size=ACTUAL_VOCAB_SIZE,
    n_positions=MAX_LEN,
    n_head=12,
)

model = GPT2LMHeadModel(config=config)
print('Num parameters:', model.num_parameters())

Num parameters: 108886272


In [13]:
class CustomDataset(Dataset):
    def __init__(self, src_files, tokenizer, max_length):
        self.examples = []
        for src_file in tqdm(src_files):
            words = src_file.read_text(encoding="utf-8")
            words = words.split()
            words = ['<s>'] + words + ['</s>']
            for i in range(0, len(words), max_length):
                word_string = ' '.join(words[i:i+max_length])
                tokenized = tokenizer.encode(word_string, max_length=max_length, padding='max_length')
                assert(len(tokenized) == max_length)
                self.examples.append(tokenized)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])

In [14]:
def create_train_test_datasets(tokenizer, max_length, fraction=1.0, test_size=0.1):
    src_files = list(Path(TXT_FILES).glob("**/*"))
    src_files = src_files[:int(len(src_files) * fraction)]
    split_index = int(len(src_files) * (1 - test_size))
    train_files = src_files[:split_index]
    test_files = src_files[split_index:]
    train_dataset = CustomDataset(train_files, tokenizer, max_length=max_length)
    test_dataset = CustomDataset(test_files, tokenizer, max_length=max_length)
    return train_dataset, test_dataset

In [15]:
train_dataset, test_dataset = create_train_test_datasets(tokenizer, MAX_LEN, fraction=1, test_size=0.05)

100%|███████████████████████████████████████████████████████████████████████████████████| 83294/83294 [37:49<00:00, 36.71it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4384/4384 [02:03<00:00, 35.59it/s]


In [16]:
train_dataset.__getitem__(102)

tensor([4, 4, 4,  ..., 1, 1, 1])

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [24]:
N_EPOCHS = 1
BATCH_SIZE = 8
N_EVALS = 20

In [25]:
N_STEPS = len(train_dataset) * N_EPOCHS // BATCH_SIZE

In [26]:
STEPS_PER_EVAL = N_STEPS // N_EVALS

In [27]:
training_args = TrainingArguments(
    output_dir=LM_MODEL_SAVEDIR,
    overwrite_output_dir=True,
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=STEPS_PER_EVAL,
    evaluation_strategy="steps",
    eval_steps=STEPS_PER_EVAL,
    save_total_limit=1,
    prediction_loss_only=False,
    report_to="wandb",
    save_steps=5000
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

PyTorch: setting up devices


In [28]:
ret = trainer.train()

***** Running training *****
  Num examples = 151047
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9441
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1060, in forward
    return_dict=return_dict,
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 898, in forward
    output_attentions=output_attentions,
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 401, in forward
    output_attentions=output_attentions,
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 336, in forward
    attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 216, in _attn
    attn_weights = self.attn_dropout(attn_weights)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/modules/dropout.py", line 58, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
  File "/home/macosta/anaconda3/envs/mir2/lib/python3.7/site-packages/torch/nn/functional.py", line 1169, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
RuntimeError: CUDA out of memory. Tried to allocate 388.00 MiB (GPU 0; 23.70 GiB total capacity; 19.13 GiB already allocated; 275.56 MiB free; 19.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
trainer.save_model(LM_MODEL_SAVEDIR)