In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3"

# Data prerocessing

In [2]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")

# eli5 = DatasetDict()
# eli5["train"] = load_dataset("eli5", split="train_asks[:5000]")
# eli5["valid"] = load_dataset("eli5", split="validation_asks")
# eli5["test"] = load_dataset("eli5", split="test_asks")
# eli5 = eli5.flatten()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset
path = "/archive/savkin/python_course_datasets/volk.json"
volk_dataset = load_dataset("json", data_files=path)
volk_dataset = DatasetDict(volk_dataset)
volk_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 301
    })
})

In [4]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["text"]])

tokenized_dataset = volk_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4
)

  table = cls._concat_blocks(blocks, axis=0)


In [5]:
block_size = 1024


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

# Training

In [6]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")

In [13]:
epochs = 3
training_args = TrainingArguments(
    output_dir="rugpt3large_based_on_gpt2_volk/checkpoints",
    save_strategy="no",
    save_total_limit=1,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=epochs,
    per_device_train_batch_size=2,
    warmup_ratio=0.1,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["train"],
    data_collator=data_collator,
)

trainer.train()

[codecarbon INFO @ 15:46:21] [setup] RAM Tracking...
[codecarbon INFO @ 15:46:21] [setup] GPU Tracking...
[codecarbon INFO @ 15:46:21] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 15:46:21] [setup] CPU Tracking...
[codecarbon INFO @ 15:46:23] CPU Model on constant consumption mode: AMD EPYC 7352 24-Core Processor
[codecarbon INFO @ 15:46:23] >>> Tracker's metadata:
[codecarbon INFO @ 15:46:23]   Platform system: Linux-5.15.0-76-generic-x86_64-with-glibc2.29
[codecarbon INFO @ 15:46:23]   Python version: 3.8.10
[codecarbon INFO @ 15:46:23]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 15:46:23]   Available RAM : 1007.728 GB
[codecarbon INFO @ 15:46:23]   CPU count: 96
[codecarbon INFO @ 15:46:23]   CPU model: AMD EPYC 7352 24-Core Processor
[codecarbon INFO @ 15:46:23]   GPU count: 4
[codecarbon INFO @ 15:46:23]   GPU model: 4 x NVIDIA A100-SXM4-40GB


Epoch,Training Loss,Validation Loss
1,No log,2.626907
2,No log,2.482962
3,No log,2.421771


[codecarbon INFO @ 15:46:28] Energy consumed for RAM : 0.000234 kWh. RAM Power : 377.8979558944702 W
[codecarbon INFO @ 15:46:28] Energy consumed for all GPUs : 0.000519 kWh. Total GPU Power : 839.3340000000001 W
[codecarbon INFO @ 15:46:28] Energy consumed for all CPUs : 0.000049 kWh. Total CPU Power : 77.5 W
[codecarbon INFO @ 15:46:28] 0.000801 kWh of electricity used since the beginning.


TrainOutput(global_step=6, training_loss=2.6396478017171225, metrics={'train_runtime': 2.2282, 'train_samples_per_second': 5.386, 'train_steps_per_second': 2.693, 'total_flos': 50132057849856.0, 'train_loss': 2.6396478017171225, 'epoch': 3.0})

# Evaluation

In [11]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 15.37


In [12]:
save_path = f"rugpt3large_based_on_gpt2_volk/checkpoints/epochs-{epochs}"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

('rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/tokenizer_config.json',
 'rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/special_tokens_map.json',
 'rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/vocab.json',
 'rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/merges.txt',
 'rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/added_tokens.json',
 'rugpt3large_based_on_gpt2_volk/checkpoints/epochs-2/tokenizer.json')