In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset path and the
# model save path used later in this notebook both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Constants
BATCH_SIZE = 16  # batch_size passed to both DataLoaders
EPOCHS = 3  # NOTE(review): not referenced by the training cell, which uses its own count of 1000
LEARNING_RATE = 5e-5  # lr passed to the optimizer
MAX_LEN = 128  # max_length used when tokenizing (padding / truncation length)
TRAIN_SIZE = 0.8  # train_size passed to train_test_split
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load and prepare the data
df = pd.read_csv('/content/drive/MyDrive/studia/NLP/NLP-Zad2/data/annotations_all_batches - WORD - SECOND BATCH.csv')
print(df.head())
# DataFrame.ffill() is the supported spelling of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
# NOTE(review): forward-filling every column also replaces a missing `word` or
# `final-annotation` with the value from the previous row - confirm this is intended.
df = df.ffill()

# Group by sentence_id: one row per sentence holding the list of its words and
# the list of their labels, in the original row order.
sentences = (
    df.groupby('sentence_id')
    .agg({'word': list, 'final-annotation': list})
    .reset_index()
)


   sentence_id  word_id        word  Olek  Kuba Zgodne?  Stachu  \
0            1        1          Do     3     3       T     NaN   
1            1        2       Bosch     1     1       T     NaN   
2            1        3  SMV53L10EU     1     1       T     NaN   
3            1        4      pasuje     2     2       T     NaN   
4            1        5    IDEALNIE     2     2       T     NaN   

   final-annotation  Unnamed: 8  
0                 3         NaN  
1                 1         NaN  
2                 1         NaN  
3                 2         NaN  
4                 2         NaN  


In [None]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Model setup with PEFT
model_name = "allegro/herbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The token-classification head on top of the checkpoint is newly initialised
# (4 output classes), so it has to be trained together with the adapter.
base_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

# Configure LoRA (Low-Rank Adaptation)
peft_config = LoraConfig(
    # Declaring the task makes PEFT treat the classification head as a module
    # to train and save alongside the adapter. Without it only the LoRA
    # matrices are trainable and the randomly initialised head stays frozen
    # (and is missing from the saved adapter).
    task_type=TaskType.TOKEN_CLS,
    r=32,  # Rank
    lora_alpha=32,
    target_modules=["query", "value"],  # Layers to apply LoRA
    lora_dropout=0.1,
    bias="none"
)

# Wrap the model with PEFT
model = get_peft_model(base_model, peft_config)
print("PEFT model is ready!")
model.print_trainable_parameters()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT model is ready!
trainable params: 1,179,648 || all params: 125,035,012 || trainable%: 0.9435


In [None]:
# import torch
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from peft import PeftModel, PeftConfig

# peft_model_id = "/content/drive/MyDrive/studia/NLP/NLP-Zad2/save_model/bert_peft_tokens"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=4)
# model = PeftModel.from_pretrained(model, peft_model_id)
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# print("Model and tokenizer loaded successfully!")

In [None]:
# Dataset class
class TokenClassificationDataset(Dataset):
    """Token-classification dataset built from pre-split sentences.

    Each item is one sentence: its words are tokenized into sub-word tokens and
    the word-level labels are expanded so that every sub-word token carries the
    label of the word it came from.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the raw data and the tokenization settings.

        Args:
            texts: indexable collection of sentences, each a list of words.
            labels: indexable collection of per-word label lists, parallel to ``texts``.
            tokenizer: callable tokenizer whose result exposes ``word_ids()``
                and the ``input_ids`` / ``attention_mask`` entries.
            max_len: length every sentence is padded / truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and align its labels with the tokens.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels`` (dtype ``torch.long``); positions that belong to no word
            get the label -100.
        """
        words = self.texts[idx]
        labels = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Align labels with tokens: positions without a source word get -100,
        # every other position inherits its word's label. int() normalises
        # numpy / float label values (e.g. 3.0) to plain integers.
        label_ids = [
            -100 if word_id is None else int(labels[word_id])
            for word_id in encoding.word_ids()
        ]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Explicit integer dtype: with inferred dtype, float labels would
            # produce a float target tensor.
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }


In [None]:
# Data preparation: per-sentence word lists and their parallel label lists
texts, labels = (
    sentences[column].values for column in ('word', 'final-annotation')
)

# Split sentences into training and test sets (fixed seed for reproducibility)
split_parts = train_test_split(
    texts,
    labels,
    train_size=TRAIN_SIZE,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts


In [None]:
# Dataset preparation: one dataset per split, built with the same tokenizer settings
train_dataset, test_dataset = (
    TokenClassificationDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [None]:
# Display the selected compute device as the cell output
device

device(type='cuda')

In [None]:
# Model training
model.to(device)

# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
# eps and weight_decay are set explicitly to mirror that implementation's
# defaults so optimisation behaves as before (verify against the installed
# transformers version if exact parity matters).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# Number of passes over the training data. NOTE(review): the EPOCHS constant
# from the configuration cell is not used here; the value below is the one
# this notebook was actually run with.
NUM_TRAIN_EPOCHS = 1000
# Print the mean training loss every LOG_EVERY epochs
LOG_EVERY = 10

for epoch in range(NUM_TRAIN_EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` array is not overwritten
        batch_labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss directly
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    if epoch % LOG_EVERY == 0:
        print(f'Epoch {epoch + 1}, Loss: {total_loss/len(train_loader)}')


Epoch 1, Loss: 1.32180917263031
Epoch 11, Loss: 1.3027799129486084
Epoch 21, Loss: 1.2619314193725586
Epoch 31, Loss: 1.2178276777267456
Epoch 41, Loss: 1.184668779373169
Epoch 51, Loss: 1.157524824142456
Epoch 61, Loss: 1.137104868888855
Epoch 71, Loss: 1.1118160486221313
Epoch 81, Loss: 1.0921175479888916
Epoch 91, Loss: 1.0667426586151123
Epoch 101, Loss: 1.0231467485427856
Epoch 111, Loss: 0.9728987812995911
Epoch 121, Loss: 0.912367582321167
Epoch 131, Loss: 0.847463846206665
Epoch 141, Loss: 0.7570992112159729
Epoch 151, Loss: 0.703957736492157
Epoch 161, Loss: 0.6215837001800537
Epoch 171, Loss: 0.574069619178772
Epoch 181, Loss: 0.5117709636688232
Epoch 191, Loss: 0.4465044438838959
Epoch 201, Loss: 0.42718708515167236
Epoch 211, Loss: 0.383780837059021
Epoch 221, Loss: 0.3626765310764313
Epoch 231, Loss: 0.3093695640563965
Epoch 241, Loss: 0.322110116481781
Epoch 251, Loss: 0.2913350760936737
Epoch 261, Loss: 0.27836787700653076
Epoch 271, Loss: 0.225826695561409
Epoch 281, Lo

In [None]:

# Evaluation: collect token-level predictions and gold labels on the test split
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=2)

        # Keep positions that are attended to and carry a real label (-100
        # marks positions without one). Boolean indexing walks the batch row by
        # row, so the order matches a per-sentence loop.
        keep = (attention_mask == 1) & (labels != -100)

        predictions.extend(preds[keep].cpu().numpy())
        true_labels.extend(labels[keep].cpu().numpy())

# Show the results for the test set
print("\nWyniki klasyfikacji:")
print(classification_report(true_labels, predictions))



Wyniki klasyfikacji:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.94      0.81      0.87        37
           2       0.05      0.50      0.10         2
           3       0.71      1.00      0.83        10

    accuracy                           0.63        65
   macro avg       0.43      0.58      0.45        65
weighted avg       0.65      0.63      0.63        65



In [None]:

# Predictions for sample sentences from the test set
# Maps predicted class ids to the names printed next to each word.
# NOTE(review): which annotation value each id stands for is not visible in
# this notebook - confirm the mapping against the annotation guidelines.
label_mapping = {0: 'negatywny', 1: 'neutralny', 2: 'pozytywny', 3: 'inne'}

def predict_sentence(sentence_words):
    """Predict one class id per word of a pre-split sentence.

    The sentence is tokenized with the notebook-level ``tokenizer`` and run
    through the notebook-level ``model``; the predictions of all sub-word
    tokens of a word are reduced to a single class by majority vote.

    Args:
        sentence_words: list of the sentence's words.

    Returns:
        List of class ids, one per word that received at least one token, in
        word order. It can be shorter than ``sentence_words`` when the input
        is truncated to ``MAX_LEN`` tokens.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(
            sentence_words,
            is_split_into_words=True,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_LEN
        ).to(device)

        # Single sentence in the batch, so row 0 holds all token predictions
        token_classes = torch.argmax(model(**encoded).logits, dim=2)[0]

        # Collect the votes of each word's tokens; positions that map to no
        # word are skipped. Dict insertion order keeps the words in order.
        votes_by_word = {}
        for position, word_index in enumerate(encoded.word_ids()):
            if word_index is not None:
                votes_by_word.setdefault(word_index, []).append(
                    token_classes[position].item()
                )

        # Pick the most frequent prediction for each word
        return [
            max(set(votes), key=votes.count)
            for votes in votes_by_word.values()
        ]


In [None]:

print("\nPrzykładowe predykcje dla zdań ze zbioru testowego:")
# Show the first 3 test sentences (fewer if the test set is smaller)
for i, sentence in enumerate(test_texts[:3]):
    # Separate name so the `predictions` list collected by the evaluation cell
    # is not overwritten by this per-sentence result.
    word_predictions = predict_sentence(sentence)

    print(f"\nZdanie {i+1}:")
    # zip stops at the shorter sequence, so words without a prediction are not printed
    for word, pred in zip(sentence, word_predictions):
        pred_label = label_mapping[pred]
        print(f"Słowo: {word:15} Predykcja: {pred_label}")


Przykładowe predykcje dla zdań ze zbioru testowego:

Zdanie 1:
Słowo: Jakość          Predykcja: neutralny
Słowo: i               Predykcja: inne
Słowo: praktyczność    Predykcja: pozytywny
Słowo: wykonania       Predykcja: neutralny
Słowo: tego            Predykcja: inne
Słowo: trymera         Predykcja: neutralny
Słowo: pozostawia      Predykcja: pozytywny
Słowo: naprawdę        Predykcja: pozytywny
Słowo: wiele           Predykcja: pozytywny
Słowo: do              Predykcja: pozytywny
Słowo: życzenia        Predykcja: pozytywny
Słowo: O               Predykcja: inne
Słowo: golarce         Predykcja: neutralny
Słowo: w               Predykcja: inne
Słowo: tym             Predykcja: inne
Słowo: zestawie        Predykcja: neutralny
Słowo: nie             Predykcja: pozytywny
Słowo: warto           Predykcja: pozytywny
Słowo: nawet           Predykcja: pozytywny
Słowo: wspominać       Predykcja: pozytywny
Słowo: Lepiej          Predykcja: neutralny
Słowo: od              Predykcja: inn

In [None]:
# Save location: a single directory under the mounted Google Drive that
# receives both the model and the tokenizer files
model_save_path = "/content/drive/MyDrive/studia/NLP/NLP-Zad2/save_model/bert_peft_tokens_32"

# Save the model
model.save_pretrained(model_save_path)
print(f"Model zapisano w: {model_save_path}")

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)
print(f"Tokenizer zapisano w: {model_save_path}")

Model zapisano w: /content/drive/MyDrive/studia/NLP/NLP-Zad2/save_model/bert_peft_tokens_32
Tokenizer zapisano w: /content/drive/MyDrive/studia/NLP/NLP-Zad2/save_model/bert_peft_tokens_32


In [None]:
# Collect, over the whole test set: the tokens, gold labels, predicted labels
# and the hidden states the model returns, for the visualisation cells.
original_words_list = []      # flat list of tokens at labelled positions
labels_list = []              # one label row per sentence (becomes a 2-D array below)
hidden_states_per_layer = []  # per batch: list of hidden-state arrays
predicted_labels_list = []    # flat list of predictions at labelled positions

model.eval()

with torch.no_grad():
    for batch in test_loader:
        # Move the batch tensors to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Positions that carry a real label (-100 marks positions without one).
        # Tokens and predictions use the same mask that later filters the labels
        # and hidden states. Slicing the flattened batch with [1:-1] removed only
        # the very first and very last position, which misaligned the lists
        # whenever more than one sentence was present.
        labelled = labels != -100

        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=-1)[labelled]
        predicted_labels_list.extend(predicted_labels.cpu().numpy())

        all_hidden_states = outputs.hidden_states  # hidden states returned by the model
        hidden_states_per_layer.append([state.cpu() for state in all_hidden_states])
        labels_list.extend(labels.cpu().numpy())

        # Plain Python lists avoid per-element device synchronisation
        for row_ids, row_labelled in zip(input_ids.tolist(), labelled.tolist()):
            tokens = tokenizer.convert_ids_to_tokens(row_ids)
            original_words_list.extend(
                token for token, keep in zip(tokens, row_labelled) if keep
            )

# Convert the stored tensors to numpy arrays
hidden_states_per_layer = [
    [layer.numpy() for layer in batch] for batch in hidden_states_per_layer
]
labels_list = np.array(labels_list)
# Flatten sentence by sentence and drop the unlabelled positions
labels_flat = labels_list.reshape(-1)
labels_flat = labels_flat[labels_flat != -100]

# The three flat sequences must describe exactly the same positions
assert len(original_words_list) == len(predicted_labels_list) == len(labels_flat), (
    "tokens, predictions and labels are misaligned"
)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Data preparation
# Mask of positions that carry a real label (-100 marks the ones that do not);
# it is the same for every layer, so it is computed once.
labelled_positions = labels_list.flatten() != -100

# For each layer: stack all batches into a (positions, hidden_size) matrix and
# keep only the labelled positions.
layer_hidden_states = [
    np.vstack([
        batch[layer_idx].reshape(-1, batch[layer_idx].shape[-1])
        for batch in hidden_states_per_layer
    ])[labelled_positions]
    for layer_idx in range(len(hidden_states_per_layer[0]))
]

# Dimensionality reduction to 2-D for every layer (fixed seed)
reduced_embeddings_per_layer = [
    TSNE(n_components=2, random_state=42).fit_transform(layer_matrix)
    for layer_matrix in layer_hidden_states
]
num_layers = len(reduced_embeddings_per_layer)

In [None]:
import plotly.express as px
import plotly.subplots as sp
import pandas as pd

# Prepare `words_flat` to align with valid tokens
words_flat = np.array(original_words_list)

# One subplot row per set of reduced embeddings.
# NOTE(review): the first entry of the model's hidden states is presumably the
# embedding output rather than a transformer layer, in which case "Layer 1"
# labels the embeddings - confirm and adjust the titles if so.
fig = sp.make_subplots(
    rows=len(reduced_embeddings_per_layer),
    cols=1,
    subplot_titles=[f"Layer {i + 1}" for i in range(len(reduced_embeddings_per_layer))],
    vertical_spacing=0.025
)

for i, reduced_embeddings in enumerate(reduced_embeddings_per_layer):
    # Per-layer frame for Plotly. Named layer_df so the notebook-level `df`
    # holding the raw dataset is not overwritten.
    layer_df = pd.DataFrame({
        "Dim1": reduced_embeddings[:, 0],
        "Dim2": reduced_embeddings[:, 1],
        "Label": labels_flat,
        "Predicted_label": predicted_labels_list,
        "Word": words_flat
    })

    # Create the scatter plot
    scatter = px.scatter(
        layer_df,
        x="Dim1",
        y="Dim2",
        color="Predicted_label",
        hover_data={
            "Word": True,  # Display Word on hover
            "Label": True,  # Display Label on hover
            "Predicted_label": True,  # Display Predicted_label on hover
            "Dim1": False,  # Optionally hide Dim1 and Dim2
            "Dim2": False
        },
        title=f"Layer {i + 1}",
        color_continuous_scale="Viridis"
    )

    # Only the traces are reused; they are placed in this layer's subplot row
    for trace in scatter.data:
        fig.add_trace(trace, row=i + 1, col=1)

# Update layout for better visualization
fig.update_layout(
    height=500 * len(reduced_embeddings_per_layer),  # Adjust height based on the number of layers
    title_text="t-SNE Embedding Visualization Across Layers with Hover Data",
    showlegend=True
)

fig.show()
