In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset




In [2]:
new_tokenizer = GPT2Tokenizer.from_pretrained("./smaller_tokenizer")
new_tokenizer.add_special_tokens({
    "eos_token":"</s>",
    "bos_token":"<s>",
    "unk_token":"<unk>",
    "pad_token":"<pad>",
    "mask_token":"<mask>"
})
data_path = 'smaller.txt'

In [3]:
config = GPT2Config(
    vocab_size=new_tokenizer.vocab_size,
    bos_token = new_tokenizer.bos_token_id,
    eos_token = new_tokenizer.eos_token_id
)

model = GPT2LMHeadModel(config)
dataset = load_dataset("text", data_files=[data_path])

In [4]:
# We need to tokenize dataset using the new dataset
def encode(lines):
    return new_tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=512)

dataset.set_transform(encode)
dataset = dataset['train']

data_collator = DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm = True, mlm_probability=0.15)


In [5]:
training_args = TrainingArguments(
    output_dir = "small_model",
    overwrite_output_dir= True,
    num_train_epochs=1,
    per_device_train_batch_size=10,
    save_steps = 100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("small_model")

In [8]:
import torch

model = GPT2LMHeadModel.from_pretrained("small_model_E1S10")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [11]:
from colored import Fore, Back, Style

while True:
    inp = input(">>>")
    input_ids = new_tokenizer.encode(inp, return_tensors="pt").to("cuda")
    beam_output = model.generate(
        input_ids,
        max_length = 512,
        temperature = 0.7,
        no_repeat_ngram_size = 5,
        num_return_sequences = 1
    )

    for beam in beam_output:
        out = new_tokenizer.decode(beam)
        fout = out.replace("<N>", "\n")

        print(Fore.RED + Style.BOLD + str(fout))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[38;5;1m[1mimport sklearn.from_from_from_test_test_test

from_test_from_test = _from_test_train_test_test, y_test_test = _test_test_train, 'test_test_set_test_test.test_test_dir_test_test)

import _test_test = 0.get_test_test(X_test_test=None, y_test, 'test_set_set_test, y, y_test)<NN>
    'test_test = 'test_test.get_test = 0_test_test['test_test_file_test_test,


X_test_train = 0.get(X_test.get(X = 0.0.0, 'test_train_set_test.0.0.get_test.0, 'X_test_set.get_test, 'X_set_test(X, '1.0, '1, '1.get_test)
    'X_test, '1, y_test.0_test_test', '1, '0.0, 0.0, '0.1_test_test',

<_test_test: '1.0.0_test, '0, '1._test_test1, '1_test_train.0.0', '1.0_test.0 = 0.0, 0_test_set(X_test, 0.0.get(X, '0._test_set, 'X_train_test, '_test_test))

<.0, '_test, 'c_test_test('0, '1_set_test)
# #  #  #  0.0.1.0.get.0.0 = 0x_test_test-1, '0, 0.get_set_test = 0x1_test_set = 0.0_test = 0, '1.1, '1, 0.0_set_test=0, '1)

<>

< 0.0._test, 'x_test_set=1.0.1_train_test.0


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[38;5;1m[1mimport numpy as as

import _from_test_test_test


<from>#import _from_from_test_from_test
from_test_train_test_test = _from_test< as _from_test, 'test_test_train, 'test_train_train_test, 'train_test_train.get_test_test, 'X_test_test.get_test, '_test_test)

from_train_test = _test_test_set_test_test(X_test_train = _test_set_train_test._test_test['test_test_dir_test_test,

<_test_test import 'test_test.path._test_train
from = _test_train_set_test.get(X_test.get = 0.get_test.get.get_test = 0.get(X(X_test, '1, '_test, '0.0.0._test_set.get_test)
from_set_test, ', 'X_train_test(X, 'X_set_test)<NN>
<.0.get_test(X = 0.get.get(X = 0_test_test=0.get_set_test = 0_test, 'data.get_test=0, 'test_set_set_test=0_test_test('0.0, 'X_dir_test, 'c_test_test', 'test_test, y_test_test1_test_test):

< 0.0, 'test, '0, '_test.0, '0, y_test, 'x_test_test))

<, 'test_dir_set_test,
    'X_test, y = 0.get = 0_test.0.0, y_train_test)
#_test_test: 'test_test = 0x_test_train)

<>
    'test_test)< 0.0.0_t

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index -1 is out of bounds for dimension 1 with size 0