In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer, pipeline
import os

In [None]:
# Load the pretrained GPT-2 tokenizer; it is reused below for tokenizing the
# training text, sizing the model vocabulary and decoding generated ids.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# padding='max_length' needs a pad token. Fall back to EOS only when the
# tokenizer does not define one, so an existing pad token is never clobbered.
# (pad_token_id is checked instead of pad_token so the check itself is silent.)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize ``examples['text']`` and attach causal-LM labels.

    Every sequence is truncated/padded to exactly 512 tokens. ``labels`` is a
    copy of ``input_ids`` in which every position with ``attention_mask == 0``
    (i.e. padding) is replaced by -100, the index the loss ignores. Without
    this mask the model would be trained to predict the padding token, which
    here is the same id as EOS.

    The labels are intentionally NOT shifted: the LM head model shifts them
    internally when computing the loss.

    Args:
        examples: mapping with a ``'text'`` entry, either a list of strings
            (``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added ``'labels'`` entry shaped like
        ``'input_ids'``.

    Note:
        A row with no real tokens (e.g. an empty line) ends up with every
        label ignored; filter such rows out upstream if they can occur.
    """
    tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    def _mask_padding(ids, mask):
        # Keep the token id where the position is attended to, else ignore it.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single example: a flat list of ids.
        labels = _mask_padding(input_ids, attention_mask)

    tokenized['labels'] = labels
    return tokenized


In [None]:
# Load data.txt through the generic "text" builder and expose it as the
# "train" split of the returned dataset dictionary.
dataset = load_dataset("text", data_files={"train": "data.txt"})
# Tokenize in batches; the columns returned by tokenize_function (including
# "labels") are added to every split.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Architecture of a small GPT-2 style model.
config = GPT2Config(
    # Vocabulary size and special-token ids mirror the tokenizer.
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Context window: same 512 as the max_length used when tokenizing.
    n_positions=512,
    n_ctx=512,
    # Model size: 4 layers, 4 attention heads, 256-dimensional embeddings.
    n_layer=4,
    n_head=4,
    n_embd=256,
)
# Built from the config alone (no from_pretrained), so the weights start out
# randomly initialised.
model = GPT2LMHeadModel(config)

In [None]:
# Shared training configuration; reused by every Trainer built in this notebook.
training_args = TrainingArguments(
    # Optimisation schedule.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # Checkpointing: write to ./results, keep at most the two newest checkpoints.
    output_dir='./results',
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Logging.
    logging_dir='./logs',
    logging_steps=500,
    # Only the loss is reported during evaluation/prediction.
    prediction_loss_only=True,
)

In [None]:
# Train the freshly initialised model on the tokenized data.txt split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)
# The Trainer was previously constructed but never run, so the checkpoint
# written by the following save cell contained untrained weights.
# NOTE(review): confirm that training on data.txt at this point is intended.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same "model" directory so both can
# be reloaded from that single path.
# NOTE(review): this snapshot captures the model as it is when this cell runs;
# the training done on ./data/*.txt in later cells is not saved again - confirm
# that is intended.
model.save_pretrained("model")
tokenizer.save_pretrained("model")

In [None]:
def train_on_text(model, training_args, text: list[str]):
    """Run one training pass of ``model`` over ``text`` and return the model.

    ``text`` is normalised to one training example per non-blank line:
    trailing newlines are dropped (so the output of ``readlines()`` can be
    passed as-is), entries containing embedded newlines are split, and
    blank/whitespace-only lines are skipped because they would tokenize to
    pure padding.

    The dataset is built in memory, so no scratch file is written to the
    working directory and nothing depends on the platform default encoding.

    Args:
        model: the model to train; updated in place by the Trainer.
        training_args: ``TrainingArguments`` used to build the Trainer. A new
            Trainer is created per call.
        text: lines of raw training text.

    Returns:
        The same ``model`` object. If ``text`` has no non-blank lines the model
        is returned untouched and no Trainer is created.
    """
    samples = [
        piece
        for entry in text
        for piece in entry.splitlines()
        if piece.strip()
    ]
    if not samples:
        # Nothing to learn from; avoid building a Trainer on an empty dataset.
        return model

    # Local import: the notebook's import cell only pulls in load_dataset.
    from datasets import Dataset

    train_dataset = Dataset.from_dict({"text": samples}).map(tokenize_function, batched=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    return model

In [None]:
# Continue training the same model on every .txt file in ./data, one file at a
# time. sorted() makes the order reproducible; os.listdir returns entries in
# arbitrary order.
for file in sorted(os.listdir("./data")):
    if file.endswith(".txt"):
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(f"./data/{file}", "r", encoding="utf-8") as f:
            # splitlines() yields lines without their trailing newline, so
            # train_on_text does not receive a stray blank line per example.
            text = f.read().splitlines()
        model = train_on_text(model, training_args, text)
# NOTE(review): the weights produced by this loop are not saved anywhere in the
# visible cells - confirm whether a save_pretrained call is wanted here.


In [None]:
def generate_text(model, tokenizer, prompt: str, **generate_kwargs):
    """Generate a continuation of ``prompt`` and return the decoded strings.

    Args:
        model: a generation-capable model exposing ``device``, ``training``,
            ``eval()``, ``train(mode)`` and ``generate()``.
        tokenizer: the tokenizer matching ``model``.
        prompt: text to continue.
        **generate_kwargs: extra options forwarded to ``model.generate``
            (e.g. ``max_new_tokens``, ``do_sample``). ``attention_mask`` and
            ``pad_token_id`` are filled in when not supplied.

    Returns:
        A list of decoded strings (special tokens removed), one per generated
        sequence.
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    # Training may have moved the model to an accelerator; the inputs must
    # live on the same device or generate() fails with a device mismatch.
    device = model.device
    input_ids = encoded['input_ids'].to(device)
    generate_kwargs.setdefault('attention_mask', encoded['attention_mask'].to(device))

    # Be explicit about the padding id instead of relying on generate()'s
    # fallback; use EOS when the tokenizer has no pad token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    generate_kwargs.setdefault('pad_token_id', pad_id)

    # A model coming straight out of training is still in train mode, where
    # dropout is active. Generate in eval mode, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(input_ids, **generate_kwargs)
    finally:
        model.train(was_training)
    return tokenizer.batch_decode(output, skip_special_tokens=True)


In [None]:
# Smoke test: generate from a short prompt and show the first returned string.
prompt = "Aripov is a member of"
completions = generate_text(model, tokenizer, prompt)
print(completions[0])