In [None]:
import os
# Restrict the devices visible to CUDA for this process to index "3".
# NOTE(review): this presumably has to run before any CUDA-backed library
# initializes the driver, which is why it sits in the very first cell — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

# Data preprocessing

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint that is fine-tuned below,
# so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="ai-forever/rugpt3large_based_on_gpt2",
)

In [None]:
from datasets import load_dataset
# Absolute location of the JSON corpus.
# NOTE(review): machine-specific path — consider making it configurable.
path = "/archive/savkin/python_course_datasets/volk.json"

# Read the JSON file with the generic "json" loader and wrap the result in a
# DatasetDict in one step, then display it as the cell output.
volk_dataset = DatasetDict(load_dataset("json", data_files=path))
volk_dataset

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of examples.

    Every entry of ``examples["text"]`` is joined into a single string with
    one space between its items, and the resulting list of strings is passed
    to the module-level ``tokenizer`` in a single call.

    NOTE(review): this presumes each ``text`` entry is a sequence of strings;
    a plain string would be turned into space-separated characters — confirm
    against the dataset schema.

    Args:
        examples: Batch mapping that contains a ``"text"`` column.

    Returns:
        Whatever ``tokenizer`` returns for the list of joined strings.
    """
    joined_texts = []
    for pieces in examples["text"]:
        joined_texts.append(" ".join(pieces))
    return tokenizer(joined_texts)

# Tokenize the dataset batch-wise using 4 worker processes.
#
# The raw input columns are removed from the result: the block-grouping step
# that follows concatenates *every* column it receives and sizes its blocks
# from the first one, so leaving the untokenized "text" column in place would
# make it operate on a column that is not aligned with the token ids.
tokenized_dataset = volk_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=volk_dataset["train"].column_names,
)

In [None]:
from itertools import chain

# Number of tokens per training block.
block_size = 1024


def group_texts(examples):
    """Concatenate a tokenized batch and cut it into blocks of ``block_size``.

    Every column in ``examples`` is flattened into one long list and then
    sliced at the same offsets, so all returned columns have the same number
    of rows. The number of blocks is determined by the token count of
    ``input_ids`` (not by whichever column happens to come first), so extra
    columns that are not token-aligned cannot shrink the output.

    If the batch holds at least ``block_size`` tokens, the trailing remainder
    that does not fill a whole block is dropped; a batch shorter than
    ``block_size`` is kept as a single short block.

    Args:
        examples: Batch mapping of column name -> list of lists. Must contain
            an ``"input_ids"`` column.

    Returns:
        dict with the same columns re-chunked into blocks, plus a ``"labels"``
        column that is a copy of the ``"input_ids"`` blocks.

    Raises:
        KeyError: if ``examples`` has no ``"input_ids"`` column.
    """
    # Flatten each column in linear time (``sum(lists, [])`` is quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Size the blocks by the number of tokens in the batch.
    total_length = len(concatenated_examples["input_ids"])
    # Drop the small remainder; padding could be used instead if desired.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split every column at the same block boundaries.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized dataset into fixed-size blocks with group_texts,
# batch by batch, using 4 worker processes. Because grouping happens per
# batch, the remainder that group_texts drops is lost once per batch.
lm_dataset = tokenized_dataset.map(
    function=group_texts,
    batched=True,
    num_proc=4,
)

# Training

In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(
    "ai-forever/rugpt3large_based_on_gpt2"
)

# Use the end-of-sequence token for padding, then build a collator for
# causal (non-masked) language modeling around that tokenizer.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Number of passes over the training data; also used in the save path later.
epochs = 20

training_args = TrainingArguments(
    output_dir="rugpt3large_based_on_gpt2_volk/checkpoints",
    # Optimisation schedule.
    num_train_epochs=epochs,
    per_device_train_batch_size=1,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Evaluation and checkpointing.
    evaluation_strategy="epoch",
    save_strategy="no",
    save_total_limit=1,
    load_best_model_at_end=False,
    push_to_hub=False,
)

# NOTE(review): eval_dataset is the same split as train_dataset, so the
# reported evaluation loss is measured on the training data, not on held-out
# text — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["train"],
)

trainer.train()

# Evaluation

In [None]:
import math

# Run evaluation on the trainer's eval dataset and report perplexity,
# computed as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Store the fine-tuned model and its tokenizer side by side in a directory
# named after the number of training epochs.
save_path = f"rugpt3large_based_on_gpt2_volk/checkpoints/epochs-{epochs}"
trainer.save_model(output_dir=save_path)
tokenizer.save_pretrained(save_directory=save_path)