In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import GPT2TokenizerFast, GPT2ForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report


In [3]:

# Wczytywanie danych
data = pd.read_csv('data/annotations_all_batches - WORD - SECOND BATCH.csv')
data = data[['sentence_id', 'word', 'final-annotation']].dropna()

# Mapowanie etykiet
label_mapping = {
    'negatywny': 0,
    'neutralny': 1,
    'pozytywny': 2,
    'inne': 3,
}
data['final-annotation'] = data['final-annotation'].map(label_mapping)

# Grupowanie danych po zdaniach
grouped = data.groupby('sentence_id').agg({'word': list, 'final-annotation': list}).reset_index()

# Podział na zbiór treningowy i testowy
train_data, test_data = train_test_split(grouped, test_size=0.1, random_state=42)


In [16]:

# Tokenizacja
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["word"], 
        is_split_into_words=True, 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples["final-annotation"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignorowanie
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Ignorowanie
            previous_word_idx = word_idx
        labels.append(label_ids)

    # Dodaj debugowanie
    if any(lab is None for lab in labels):
        print("Warning: Found None in labels after alignment.")

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tworzenie zbioru treningowego i testowego
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 4/4 [00:00<00:00, 691.99 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 196.41 examples/s]


In [18]:
# Sprawdzenie dla wartości None w danych wejściowych przed treningiem
print("Checking for None values in dataset...")
for i, sample in enumerate(train_dataset):
    print(sample)
    if None in sample["labels"]:
        print(f"Found None in labels at index {i}")
        break


Checking for None values in dataset...
{'sentence_id': 5, 'word': ['Super', 'uchwyt', 'Jak', 'go', 'dostałem', 'myślałem', 'że', 'go', 'zrobili', 'w', 'Niemczech', 'a', 'tu', 'Chiny', 'Bardzo', 'dobra', 'jakość', 'wykonania', 'i', 'wszystko', 'jest', 'w', 'zestawie', 'co', 'potrzeba', 'do', 'zamontowania', 'go', 'na', 'ścianie', 'i', 'przykręcenia', 'telewizora', 'Do', '40', 'TV', 'też', 'super', 'pasuje', 'tylko', 'trzeba', 'sprawdzić', 'najpierw', 'tv', 'jaki', 'ma', 'rozstaw', 'vesaWygląda', 'też', 'super', 'Solidny', 'polecam'], 'final-annotation': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], '__index_level_0__': 4, 'input_ids': [3115, 334, 354, 86, 20760, 25845, 467, 288, 39818, 41615, 368, 616, 129, 249, 5031, 41615

In [11]:
# Konfiguracja modelu
model = GPT2ForTokenClassification.from_pretrained("gpt2", num_labels=len(label_mapping))
model.config.pad_token_id = model.config.eos_token_id

# Parametry treningowe
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)


Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
train_dataset

Dataset({
    features: ['sentence_id', 'word', 'final-annotation', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [15]:
test_dataset

Dataset({
    features: ['sentence_id', 'word', 'final-annotation', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

In [12]:

# Trening
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: Could not infer dtype of NoneType

In [None]:
import numpy as np
# Ocena na zbiorze testowym
predictions, labels, _ = trainer.predict(test_dataset)
preds = np.argmax(predictions, axis=2)

# Przekształcanie wyników do klasyfikacji tokenów
true_labels = [[label for label in sent if label != -100] for sent in labels]
pred_labels = [[pred for pred, lab in zip(sent_pred, sent_lab) if lab != -100] for sent_pred, sent_lab in zip(preds, labels)]

# Raport wyników
print(classification_report(true_labels, pred_labels, target_names=label_mapping.keys()))
