In [26]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from peft import get_peft_model, LoraConfig, TaskType

# Wczytanie danych z pliku CSV
data_path = "data/annotations_all_batches - WORD - SECOND BATCH.csv"
df = pd.read_csv(data_path)

# Usunięcie wierszy z brakującą wartością w kolumnie `final-annotation`
df = df.dropna(subset=["final-annotation"])

# Mapowanie etykiet do liczb
label_mapping = {1: 0, 2: 1, 3: 2}  # Można zmienić w zależności od potrzeb
df["final-annotation"] = df["final-annotation"].astype(int).map(label_mapping)

# Grupowanie słów w zdania
grouped = df.groupby("sentence_id").agg({
    "word": list,
    "final-annotation": list
}).reset_index()

# Konwersja danych do formatu dla Dataset
dataset_dict = {
    "words": grouped["word"].tolist(),
    "labels": grouped["final-annotation"].tolist()
}
dataset = Dataset.from_dict(dataset_dict)

# Tokenizer
model_name = "allegro/herbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Funkcja tokenizująca dane
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["words"],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != previous_word_idx:
                aligned_labels.append(label[word_idx])
            else:
                aligned_labels.append(-100)
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping)
)

# Dodanie adapterów Lora
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)

# Argumenty treningowe
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    remove_unused_columns=False
)

# Definicja trenera
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Trening modelu
trainer.train()



Map: 100%|██████████| 5/5 [00:00<00:00, 625.60 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/3 [06:31<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`words` in this case) have excessive nesting (inputs type `list` where type `int` is expected).