In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Stałe
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LEN = 128
TRAIN_SIZE = 0.8


In [3]:

# Wczytanie i przygotowanie danych
df = pd.read_csv('data/annotations_all_batches - WORD - SECOND BATCH.csv')
print(df.head())
df = df.fillna(method='ffill')

# Grupowanie po sentence_id
sentences = df.groupby('sentence_id').agg({
    'word': lambda x: list(x),
    'final-annotation': lambda x: list(x)
}).reset_index()


   sentence_id  word_id        word  Olek  Kuba Zgodne?  Stachu  \
0            1        1          Do     3     3       T     NaN   
1            1        2       Bosch     1     1       T     NaN   
2            1        3  SMV53L10EU     1     1       T     NaN   
3            1        4      pasuje     2     2       T     NaN   
4            1        5    IDEALNIE     2     2       T     NaN   

   final-annotation  Unnamed: 8  
0                 3         NaN  
1                 1         NaN  
2                 1         NaN  
3                 2         NaN  
4                 2         NaN  


In [4]:

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Model setup with PEFT
model_name = "allegro/herbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

# Configure LoRA (Low-Rank Adaptation)
peft_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=32,
    target_modules=["query", "value"],  # Layers to apply LoRA
    lora_dropout=0.1,
    bias="none"
)

# Wrap the model with PEFT
model = get_peft_model(base_model, peft_config)
print("PEFT model is ready!")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT model is ready!


In [5]:

# Klasa dataset
class TokenClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        words = self.texts[idx]
        labels = self.labels[idx]
        
        # Tokenizacja
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Dostosowanie etykiet do tokenów
        word_ids = encoding.word_ids()
        label_ids = []
        
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            else:
                label_ids.append(labels[word_id])
                
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_ids)
        }


In [6]:

# Przygotowanie danych
texts = sentences['word'].values
labels = sentences['final-annotation'].values

# Podział na zbiór treningowy i testowy
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, train_size=TRAIN_SIZE, random_state=42
)


In [7]:

# Przygotowanie datasetów
train_dataset = TokenClassificationDataset(train_texts, train_labels, tokenizer, MAX_LEN)
test_dataset = TokenClassificationDataset(test_texts, test_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [13]:

# Trening modelu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(10000):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch + 1}, Loss: {total_loss/len(train_loader)}')


Epoch 1, Loss: 0.3591282069683075
Epoch 2, Loss: 0.35966864228248596
Epoch 3, Loss: 0.3706051707267761
Epoch 4, Loss: 0.375672310590744
Epoch 5, Loss: 0.36563655734062195
Epoch 6, Loss: 0.36577484011650085
Epoch 7, Loss: 0.3424343466758728
Epoch 8, Loss: 0.3571338355541229
Epoch 9, Loss: 0.3628283739089966
Epoch 10, Loss: 0.3712102174758911
Epoch 11, Loss: 0.35093939304351807
Epoch 12, Loss: 0.3522355854511261
Epoch 13, Loss: 0.3515315353870392
Epoch 14, Loss: 0.37684470415115356
Epoch 15, Loss: 0.3551165461540222
Epoch 16, Loss: 0.34163784980773926
Epoch 17, Loss: 0.36297473311424255
Epoch 18, Loss: 0.3253186047077179
Epoch 19, Loss: 0.33071136474609375
Epoch 20, Loss: 0.3614720106124878
Epoch 21, Loss: 0.3497300446033478
Epoch 22, Loss: 0.3621300160884857
Epoch 23, Loss: 0.337888240814209
Epoch 24, Loss: 0.35919275879859924
Epoch 25, Loss: 0.3435458838939667
Epoch 26, Loss: 0.34478920698165894
Epoch 27, Loss: 0.3438310921192169
Epoch 28, Loss: 0.3561217784881592
Epoch 29, Loss: 0.362

KeyboardInterrupt: 

In [14]:

# Ewaluacja
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        preds = torch.argmax(outputs.logits, dim=2)
        
        for i in range(len(preds)):
            pred = preds[i][batch['attention_mask'][i] == 1]
            label = labels[i][batch['attention_mask'][i] == 1]
            
            pred = pred[label != -100]
            label = label[label != -100]
            
            predictions.extend(pred.cpu().numpy())
            true_labels.extend(label.cpu().numpy())

# Wyświetlenie wyników dla zbioru testowego
print("\nWyniki klasyfikacji:")
print(classification_report(true_labels, predictions))



Wyniki klasyfikacji:
              precision    recall  f1-score   support

           0       1.00      0.62      0.77        16
           1       0.91      0.84      0.87        37
           2       0.10      0.50      0.17         2
           3       0.82      0.90      0.86        10

    accuracy                           0.78        65
   macro avg       0.71      0.72      0.67        65
weighted avg       0.89      0.78      0.82        65



In [15]:

# Predykcje dla przykładowych zdań ze zbioru testowego
label_mapping = {0: 'negatywny', 1: 'neutralny', 2: 'pozytywny', 3: 'inne'}

def predict_sentence(sentence_words):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            sentence_words,
            is_split_into_words=True,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_LEN
        ).to(device)
        
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        
        word_predictions = []
        word_ids = inputs.word_ids()
        
        current_word = None
        current_predictions = []
        
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            if word_idx != current_word:
                if current_word is not None:
                    # Wybierz najczęstszą predykcję dla słowa
                    word_predictions.append(max(set(current_predictions), key=current_predictions.count))
                current_word = word_idx
                current_predictions = []
            current_predictions.append(predictions[0][token_idx].item())
        
        # Dodaj ostatnie słowo
        if current_predictions:
            word_predictions.append(max(set(current_predictions), key=current_predictions.count))
            
        return word_predictions


In [16]:

print("\nPrzykładowe predykcje dla zdań ze zbioru testowego:")
for i in range(min(3, len(test_texts))):  # Pokazujemy pierwsze 3 zdania
    sentence = test_texts[i]
    predictions = predict_sentence(sentence)
    
    print(f"\nZdanie {i+1}:")
    for word, pred in zip(sentence, predictions):
        pred_label = label_mapping[pred]
        print(f"Słowo: {word:15} Predykcja: {pred_label}")


Przykładowe predykcje dla zdań ze zbioru testowego:

Zdanie 1:
Słowo: Jakość          Predykcja: neutralny
Słowo: i               Predykcja: inne
Słowo: praktyczność    Predykcja: pozytywny
Słowo: wykonania       Predykcja: pozytywny
Słowo: tego            Predykcja: inne
Słowo: trymera         Predykcja: neutralny
Słowo: pozostawia      Predykcja: negatywny
Słowo: naprawdę        Predykcja: negatywny
Słowo: wiele           Predykcja: negatywny
Słowo: do              Predykcja: negatywny
Słowo: życzenia        Predykcja: negatywny
Słowo: O               Predykcja: inne
Słowo: golarce         Predykcja: neutralny
Słowo: w               Predykcja: inne
Słowo: tym             Predykcja: inne
Słowo: zestawie        Predykcja: neutralny
Słowo: nie             Predykcja: negatywny
Słowo: warto           Predykcja: negatywny
Słowo: nawet           Predykcja: negatywny
Słowo: wspominać       Predykcja: negatywny
Słowo: Lepiej          Predykcja: neutralny
Słowo: od              Predykcja: inn

In [17]:
# Ścieżki do zapisu
model_save_path = "bert_model/bert_with_peft_model"
tokenizer_save_path = "bert_model/bert_tokenizer"

# Zapis modelu
model.save_pretrained(model_save_path)
print(f"Model zapisano w: {model_save_path}")

# Zapis tokenizera
tokenizer.save_pretrained(tokenizer_save_path)
print(f"Tokenizer zapisano w: {tokenizer_save_path}")


Model zapisano w: bert_model/bert_with_peft_model
Tokenizer zapisano w: bert_model/bert_tokenizer
