In [8]:
!pip install sacremoses
!pip install accelerate -U



In [6]:
from transformers import RobertaForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import datasets

# Ładowanie modelu i tokenizer
model = RobertaForCausalLM.from_pretrained("allegro/herbert-klej-cased-v1").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

# Ładowanie datasetu
ds = datasets.load_dataset("text", data_files={
   "train": "pan_tadeusz_1_10.txt",
   "validation": "pan_tadeusz_11.txt",
   "test": "pan_tadeusz_12.txt",
})

# Tokenizacja
def tokenize_function(examples):
   return tokenizer(examples["text"], padding="max_length", max_length=137, truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])

# Konfiguracja data collatora dla maskowania tokenów
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Definicja argumentów treningowych
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=5,
    save_total_limit=2,
)

# Stworzenie trenera
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator
)

# Trening modelu
trainer.train()

# Ewaluacja modelu
eval_results = trainer.evaluate()
print(f"Perplexity: {eval_results['eval_loss']**0.5}")


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,6.0126,6.708447
2,5.3819,6.212048
3,5.3013,6.062158


Perplexity: 2.4456394939738897


In [7]:
# Generowanie przykładowego tekstu
init_text = "Jam jest Jacek"
input_ids = tokenizer.encode(init_text, return_tensors="pt").to("cuda")
generated_ids = model.generate(input_ids, max_length=150, num_return_sequences=1)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Generated Text: {generated_text}")

print("Training complete.")

Generated Text: Jam jest Jacek I jak że,, " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " - - - - - -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,! " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
Training complete.
