In [1]:
!pip install transformers torch




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Prepare dataset

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('data/annotations_all_batches - WORD - SECOND BATCH.csv')

# Keep necessary columns and drop NaNs
data = data[['sentence_id', 'word', 'final-annotation']].dropna()

# Map labels to descriptive names
# Update the label mapping
label_mapping = {
    0: 'negatywny',
    1: 'neutralny',
    2: 'pozytywny',
    3: 'inne',
}

# Map final-annotation column
data['final-annotation'] = data['final-annotation'].astype(int).map(label_mapping)

# Group by sentences
grouped = data.groupby('sentence_id').agg({'word': list, 'final-annotation': list}).reset_index()

# Split data
train_data, test_data = train_test_split(grouped, test_size=0.1, random_state=42)


In [30]:
train_data.head()

Unnamed: 0,sentence_id,word,final-annotation
4,5,"[Super, uchwyt, Jak, go, dostałem, myślałem, ż...","[pozytywny, neutralny, inne, inne, neutralny, ..."
2,3,"[założyłem, sam, i, działazamiennik, jak, w, o...","[neutralny, neutralny, inne, neutralny, neutra..."
0,1,"[Do, Bosch, SMV53L10EU, pasuje, IDEALNIE, wpas...","[inne, neutralny, neutralny, pozytywny, pozyty..."
3,4,"[Zestaw, zawiera, wszystko, czego, potrzeba, a...","[neutralny, pozytywny, pozytywny, pozytywny, p..."


In [31]:
test_data.head()

Unnamed: 0,sentence_id,word,final-annotation
1,2,"[Jakość, i, praktyczność, wykonania, tego, try...","[neutralny, inne, neutralny, neutralny, inne, ..."


In [32]:
from transformers import BertTokenizerFast
import torch
from torch.utils.data import Dataset

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Define all unique labels
unique_labels = ['negatywny', 'neutralny', 'pozytywny', 'inne']
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Update TokenClassificationDataset class remains the same
class TokenClassificationDataset(Dataset):
    def __init__(self, data, tokenizer, label_mapping, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.label_mapping = label_mapping
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence = self.data.iloc[index]['word']
        labels = self.data.iloc[index]['final-annotation']

        # Tokenize and align labels
        encoding = self.tokenizer(sentence, is_split_into_words=True, 
                                  truncation=True, padding='max_length', 
                                  max_length=self.max_len, return_tensors="pt")
        encoded_labels = [-100] * self.max_len

        word_ids = encoding.word_ids(batch_index=0)
        label_ids = [self.label_mapping[label] for label in labels]
        label_index = 0

        for i, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            if word_id != word_ids[i - 1]:
                encoded_labels[i] = label_ids[label_index]
                label_index += 1

        encoding["labels"] = torch.tensor(encoded_labels, dtype=torch.long)
        return {key: val.squeeze() for key, val in encoding.items()}


# Prepare datasets
train_dataset = TokenClassificationDataset(train_data, tokenizer, label2id)
test_dataset = TokenClassificationDataset(test_data, tokenizer, label2id)

# Finetunnig

In [34]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [33]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Ewaluacja

In [14]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00, 998.41it/s]


{'eval_loss': 1.4543756246566772,
 'eval_runtime': 0.0993,
 'eval_samples_per_second': 10.066,
 'eval_steps_per_second': 10.066,
 'epoch': 100.0}

In [15]:
# Save the model
trainer.save_model('./bert_model')

In [16]:
# Load the model
model = BertForTokenClassification.from_pretrained('./bert_model')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [28]:
import numpy as np

# Perform evaluation
predictions, labels, _ = trainer.predict(test_dataset)

# Convert predictions to label ids
predictions = np.argmax(predictions, axis=2)

# Convert label ids to label names
predicted_labels = [[id2label[p] for p in pred] for pred in predictions]

# Print the predictions for the test set
for i, sentence in enumerate(test_data['word']):
    for word, pred in zip(sentence, *predicted_labels):
        print(f"{word:<12} --> {pred:<10}")
    print()

100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]

Jakość       --> inne      
i            --> pozytywny 
praktyczność --> neutralny 
wykonania    --> neutralny 
tego         --> inne      
trymera      --> neutralny 
pozostawia   --> neutralny 
naprawdę     --> neutralny 
wiele        --> neutralny 
do           --> neutralny 
życzenia     --> inne      
O            --> neutralny 
golarce      --> neutralny 
w            --> neutralny 
tym          --> inne      
zestawie     --> neutralny 
nie          --> inne      
warto        --> neutralny 
nawet        --> neutralny 
wspominać    --> neutralny 
Lepiej       --> neutralny 
od           --> neutralny 
razu         --> neutralny 
ja           --> neutralny 
wyrzucić     --> inne      
Za           --> neutralny 
połowę       --> neutralny 
ceny         --> neutralny 
można        --> inne      
kupić        --> inne      
nieco        --> inne      
lepszy       --> inne      
produkt      --> neutralny 
produkowany  --> neutralny 
dla          --> neutralny 
marketów     --> neu


