In [None]:
!pip install sacremoses
!pip install datasets



In [10]:
from transformers import RobertaForCausalLM, AutoTokenizer
import datasets
import torch

model = RobertaForCausalLM.from_pretrained("allegro/herbert-klej-cased-v1", is_decoder=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

ds = datasets.load_dataset("text", data_files={
   "train": "pan_tadeusz_1_10.txt",
   "validation": "pan_tadeusz_11.txt",
   "test": "pan_tadeusz_12.txt",
})

def tokenize_function(examples):
   return tokenizer(examples["text"], padding="max_length", max_length=137)

tokenized_datasets = ds.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])
print(tokenized_datasets)

init_text = "Jam jest Jacek"

class MLMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, tokenizer, mlm_probability=0.15):
        self.input_ids = tokenized_dataset["input_ids"]
        self.attention_mask = tokenized_dataset["attention_mask"]
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx])
        attention_mask = torch.tensor(self.attention_mask[idx])

        # Klonujemy oryginalne tokeny i traktujemy jako labele
        labels = input_ids.clone()

        # Tworzymy maskę z prawdopodobieństwem 15%, nie maskujemy tokenów specjalnych i tokenów paddignu
        rand = torch.rand(input_ids.shape)
        mask_arr = (rand < self.mlm_probability) * (input_ids != self.tokenizer.cls_token_id) * (input_ids != self.tokenizer.sep_token_id) * (input_ids != self.tokenizer.pad_token_id)
        selection = torch.flatten(mask_arr.nonzero()).tolist()

        # Maskujemy oryginalne
        input_ids[selection] = self.tokenizer.mask_token_id

        return [
            input_ids,
            attention_mask,
            labels
        ]

train_dataset = MLMDataset(tokenized_datasets["train"], tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

valid_dataset = MLMDataset(tokenized_datasets["validation"], tokenizer)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=True)

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

epoch_num = 5
for epoch in range(epoch_num):
    # Trening
    model.train()
    for i, (input_ids, attention_mask, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")
        labels = labels.to("cuda")

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if i % 5 == 0:
            print(f"Epoch: {epoch + 1}/{epoch_num}; Step: {i}/{len(train_loader)}; Loss: {loss.item()}")

    print(f"Epoch {epoch + 1}, Training Loss: {loss.item()}")

    model.eval()
    val_loss = 0
    num_batches = 0
    num_samples = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in valid_loader:
            input_ids = input_ids.to("cuda")
            attention_mask = attention_mask.to("cuda")
            labels = labels.to("cuda")

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item() * input_ids.size(0)  # sum the loss for each batch
            num_samples += input_ids.size(0)
            num_batches += 1

    avg_val_loss = val_loss / num_samples

    # Generowanie przykładowego tekstu
    input_ids = tokenizer.encode(init_text, return_tensors="pt").to("cuda")

    temperature = 0.2
    top_p = 0.9
    generated_ids = model.generate(
        input_ids,
        max_length=150,
        num_return_sequences=1,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Epoch {epoch + 1}, Avg loss: {avg_val_loss}, Generated Text: {generated_text}")

print("Training complete.")

# Wydrukowanie kształtu tensorów wejściowych
ids = torch.tensor(tokenized_datasets["train"]["input_ids"])
print(ids.shape)  # torch.Size([8960, 137])


Some weights of RobertaForCausalLM were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8960
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 747
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1099
    })
})
Epoch: 1/5; Step: 0/280; Loss: 10.054091453552246
Epoch: 1/5; Step: 5/280; Loss: 0.938270092010498
Epoch: 1/5; Step: 10/280; Loss: 0.783644437789917
Epoch: 1/5; Step: 15/280; Loss: 0.6947688460350037
Epoch: 1/5; Step: 20/280; Loss: 0.6816601753234863
Epoch: 1/5; Step: 25/280; Loss: 0.6460989117622375
Epoch: 1/5; Step: 30/280; Loss: 0.6351413130760193
Epoch: 1/5; Step: 35/280; Loss: 0.6485887765884399
Epoch: 1/5; Step: 40/280; Loss: 0.6476826071739197
Epoch: 1/5; Step: 45/280; Loss: 0.5985161662101746
Epoch: 1/5; Step: 50/280; Loss: 0.5839060544967651
Epoch: 1/5; Step: 55/280; Loss: 0.5832929015159607
Epoch: 1/5; 

KeyboardInterrupt: 