In [8]:
!pip install sacremoses
!pip install datasets



In [12]:
from transformers import RobertaForCausalLM, AutoTokenizer
import datasets
import torch

# Identifiers handed to `from_pretrained` for the weights and for the tokenizer.
MODEL_CHECKPOINT = "allegro/herbert-klej-cased-v1"
TOKENIZER_CHECKPOINT = "allegro/herbert-klej-cased-tokenizer-v1"

# One plain-text file per dataset split.
SPLIT_FILES = {
    "train": "pan_tadeusz_1_10.txt",
    "validation": "pan_tadeusz_11.txt",
    "test": "pan_tadeusz_12.txt",
}

# Load the checkpoint configured as a decoder, then move it to the GPU.
model = RobertaForCausalLM.from_pretrained(MODEL_CHECKPOINT, is_decoder=True)
model = model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Read the three splits with the "text" dataset builder.
ds = datasets.load_dataset("text", data_files=SPLIT_FILES)

def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def tokenize_function(examples):
    """Apply the module-level ``tokenizer`` to the ``"text"`` field of ``examples``.

    Passed to ``ds.map(..., batched=True)`` as the per-batch callback; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Tokenize every split in batches and drop the raw "text" column, so that only
# the tokenizer's output columns remain.
tokenized_datasets = ds.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)
print(tokenized_datasets)

# Prompt used to seed the sample generation after each training epoch.
init_text = "Jam jest Jacek"

class MLMDataset(torch.utils.data.Dataset):
    """Sliding-window dataset that randomly masks tokens of a flattened corpus.

    All tokenized rows are concatenated into a single token stream. Item
    ``idx`` is the window ``[idx, idx + window_size)`` of that stream, returned
    as ``[masked_input_ids, labels]`` where ``labels`` is the untouched window
    (the original token is kept at every position, masked or not).
    """

    def __init__(self, tokenized_dataset, tokenizer, mlm_probability=0.15, window_size=50):
        """Flatten the tokenized rows and store the masking configuration.

        Args:
            tokenized_dataset: Mapping-like object whose ``"input_ids"`` and
                ``"attention_mask"`` entries are iterables of per-row lists.
            tokenizer: Object exposing ``mask_token_id``.
            mlm_probability: Per-token probability of being replaced by the
                mask token.
            window_size: Number of tokens in each returned window.
        """
        self.input_ids = flatten(tokenized_dataset["input_ids"])
        # Kept as an attribute for callers; not read by the methods of this class.
        self.attention_mask = flatten(tokenized_dataset["attention_mask"])
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.window_size = window_size

    def __len__(self):
        """Return the number of window start positions exposed by the dataset.

        The count is the token total rounded down to a multiple of
        ``window_size``, minus one further window. It is clamped at zero so a
        corpus shorter than ``window_size`` yields an empty dataset rather than
        a negative length (which makes ``len()`` raise ``ValueError``).
        """
        full_windows = len(self.input_ids) // self.window_size
        return max(0, (full_windows - 1) * self.window_size)

    def __getitem__(self, idx):
        """Return ``[masked_input_ids, labels]`` for the window starting at ``idx``."""
        # The original tokens of the window serve as the labels.
        labels = torch.tensor(self.input_ids[idx:idx + self.window_size])
        masked_ids = labels.clone()

        # Select each position independently with probability `mlm_probability`.
        # Every position is eligible; special tokens are not excluded.
        mask = torch.rand(masked_ids.shape) < self.mlm_probability

        # Overwrite the selected positions with the mask token.
        masked_ids[mask] = self.tokenizer.mask_token_id

        return [
            masked_ids,
            labels
        ]

# Wrap the tokenized splits in sliding-window masking datasets.
train_dataset = MLMDataset(tokenized_datasets["train"], tokenizer)
valid_dataset = MLMDataset(tokenized_datasets["validation"], tokenizer)

# Training batches are shuffled; validation batches keep their order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=32)
valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, batch_size=32)

print(len(train_dataset))

# Sanity check: decode the first window with and without masking,
# keeping special tokens visible.
masked, labeled = train_dataset[0]
masked_text, labeled_text = (
    tokenizer.decode(token_ids, skip_special_tokens=False)
    for token_ids in (masked, labeled)
)

print("Masked: ", masked_text)
print("Labeled: ", labeled_text)

Some weights of RobertaForCausalLM were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8960
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 747
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1099
    })
})
113800
Masked:  <s>Księga pierwsza </s><s></s><s><mask><s><mask><s>Gospodarstwo </s><s></s><s>Powrót panicza - <mask>się pierwsze w pokoiku <mask>drugie u stołu - Ważna Sędziego nauka o grzeczności - Podkomo<mask>go uwagi polityczne nad mod
Labeled:  <s>Księga pierwsza </s><s></s><s></s><s></s><s>Gospodarstwo </s><s></s><s>Powrót panicza - Spotkanie się pierwsze w pokoiku, drugie u stołu - Ważna Sędziego nauka o grzeczności - Podkomorzego uwagi polityczne nad mod


In [13]:
# Run on whatever device the model already lives on instead of assuming CUDA.
device = next(model.parameters()).device
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

epoch_num = 1
for epoch in range(epoch_num):
    # Training
    model.train()
    # Accumulated on-device so that tracking the mean adds no per-step host sync.
    train_loss_sum = torch.zeros((), device=device)
    train_samples = 0
    for i, (input_ids, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not skew the mean.
        batch_size = input_ids.size(0)
        train_loss_sum += loss.detach() * batch_size
        train_samples += batch_size

        if i % 5 == 0:
            print(f"Epoch: {epoch + 1}/{epoch_num}; Step: {i}/{len(train_loader)}; Loss: {loss.item()}")

    # Report the epoch mean rather than only the last batch; NaN if no batch was seen.
    avg_train_loss = train_loss_sum.item() / train_samples if train_samples else float("nan")
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}")

    # Validation
    model.eval()
    val_loss = 0.0
    num_samples = 0

    with torch.no_grad():
        for input_ids, labels in valid_loader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, labels=labels)
            batch_size = input_ids.size(0)
            val_loss += outputs.loss.item() * batch_size  # sum the loss for each batch
            num_samples += batch_size

    # An empty validation loader yields NaN instead of a ZeroDivisionError.
    avg_val_loss = val_loss / num_samples if num_samples else float("nan")

    # Generate a sample text from the fixed prompt.
    prompt_ids = tokenizer.encode(init_text, return_tensors="pt").to(device)

    temperature = 1
    top_p = 0.1
    generated_ids = model.generate(
        prompt_ids,
        max_length=150,
        num_return_sequences=1,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Epoch {epoch + 1}, Avg loss: {avg_val_loss}, Generated Text: {generated_text}")

print("Training complete.")

# Print the shape of the input tensors. The rows were tokenized without padding,
# so their lengths differ and torch.tensor() on the raw nested lists would raise;
# pad every row to the longest one first.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
ids = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(row) for row in tokenized_datasets["train"]["input_ids"]],
    batch_first=True,
    padding_value=pad_id,
)
print(ids.shape)  # torch.Size([num_rows, longest_row_length])


Epoch: 1/5; Step: 0/3557; Loss: 11.826803207397461
Epoch: 1/5; Step: 5/3557; Loss: 7.672152996063232
Epoch: 1/5; Step: 10/3557; Loss: 7.27687406539917
Epoch: 1/5; Step: 15/3557; Loss: 7.043516635894775
Epoch: 1/5; Step: 20/3557; Loss: 6.806664943695068
Epoch: 1/5; Step: 25/3557; Loss: 6.6434454917907715
Epoch: 1/5; Step: 30/3557; Loss: 6.557663440704346
Epoch: 1/5; Step: 35/3557; Loss: 6.407878875732422
Epoch: 1/5; Step: 40/3557; Loss: 6.436079978942871
Epoch: 1/5; Step: 45/3557; Loss: 6.350093841552734
Epoch: 1/5; Step: 50/3557; Loss: 6.223112106323242
Epoch: 1/5; Step: 55/3557; Loss: 6.154550552368164
Epoch: 1/5; Step: 60/3557; Loss: 6.053224086761475
Epoch: 1/5; Step: 65/3557; Loss: 6.08909273147583
Epoch: 1/5; Step: 70/3557; Loss: 5.983164310455322
Epoch: 1/5; Step: 75/3557; Loss: 5.960041046142578
Epoch: 1/5; Step: 80/3557; Loss: 6.0462470054626465
Epoch: 1/5; Step: 85/3557; Loss: 5.914473056793213
Epoch: 1/5; Step: 90/3557; Loss: 5.943591117858887
Epoch: 1/5; Step: 95/3557; Loss:

KeyboardInterrupt: 