In [2]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm.notebook as tq
from collections import defaultdict
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

#from transformers import BertTokenizer, BertModel
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='logs')
#translator = Translator()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
MAX_LEN = 100
BATCH = 32
#PRE_TRAINED_MODEL_NAME = 'bert-base-cased' #"roberta-base" #'bert-base-cased'
EPOCHS = 10
LEARNING_RATE = 0.00001 # dla MEISD
LEARNING_RATE_FINE = 5e-6    # dla fine-tuningu na ESConv
THRESHOLD = 0.2 #prog decyzyjny
DROPOUT_RATE = 0.3
WEIGHT_DECAY = 0.001
LSTM_LAYERS = 2
LSTM_HIDDEN_DIM = 128
FOCAL_LOSS_ALFA = 4
FOCAL_LOSS_GAMMA = 2
MODE='min'
PATIENCE=2
FACTOR=0.5
VERBOSE=True
output_dir = './fine_tuned_bert_lstm_model'
os.makedirs(output_dir, exist_ok=True)

In [4]:
df_data = pd.read_csv('C:/Users/juwieczo/DataspellProjects/meisd_project/data/filtered_negative_MEISD_intensity_max_first_25_conv.csv')
df_llm_aug = pd.read_excel('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/augmented_dataset_llm_70percent.xlsx')
#df_data = pd.read_csv('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/balanced_augmented_data_primary_intensity.csv')

In [5]:
label_frequencies = df_data['max_intensity'].value_counts()
label_frequencies_percent = df_data['max_intensity'].value_counts(normalize=True) * 100
print(label_frequencies_percent)
print(label_frequencies)

KeyError: 'max_intensity'

In [6]:
df_data['label'] = (df_data['max_intensity'] == 2).astype(int)
#df_data['label'] = (df_data['label'] == 2).astype(int)

columns = ['Utterances', 'label']
df = df_data[columns].copy()

In [7]:
df = pd.concat([df, df_llm_aug], ignore_index=True)

In [8]:
df.head()

Unnamed: 0,Utterances,label
0,like i said,0
1,"now you think i'm gay. no, i'm not gay",0
2,now i have to like it here,1
3,yes no other reason? just a favor for an old p...,1
4,if he doesn't respond to these tests in the ne...,1


In [9]:
label_frequencies = df['label'].value_counts()
label_frequencies_percent = df['label'].value_counts(normalize=True) * 100
print(label_frequencies_percent)
print(label_frequencies)

label
0    50.324343
1    49.675657
Name: proportion, dtype: float64
label
0    1474
1    1455
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [11]:
print(f"Original train size: {df_data.shape}")
print(f"Validation size: {df_valid.shape}, Test size: {df_test.shape}")

Original train size: (1085, 4)
Validation size: (440, 2), Test size: (439, 2)


In [12]:
class_distribution = df['label'].value_counts(normalize=True)
print(class_distribution)

label
0    0.503243
1    0.496757
Name: proportion, dtype: float64


In [13]:
target_list = list(df.columns)
target_list = target_list[1:]
target_list

['label']

In [28]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = list(df['Utterances'])
        self.targets = self.df['label'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterances = str(self.utterances[index])

        inputs = self.tokenizer.encode_plus(
            utterances,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.targets[index], dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            #'token_type_ids': inputs["token_type_ids"].flatten(), -> nie potrzebne przy RoBERTa
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'utterances': utterances
        }

In [29]:
class BERTLSTMClassifier(nn.Module):
    def __init__(self, bert_model, lstm_hidden_dim=128, lstm_layers=1, dropout_rate=DROPOUT_RATE, bidirectional=True):
        super(BERTLSTMClassifier, self).__init__()
        self.bert = bert_model
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout_rate if lstm_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout_rate)
        # Jeśli LSTM jest dwukierunkowe, rozmiar wejścia do fc to lstm_hidden_dim*2, w przeciwnym wypadku lstm_hidden_dim
        fc_input_dim = lstm_hidden_dim * 2 if bidirectional else lstm_hidden_dim
        self.fc = nn.Linear(fc_input_dim, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        # Pobranie reprezentacji z BERTa (cała sekwencja)
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # last_hidden_state: tensor o wymiarach (batch_size, seq_length, hidden_size)
        sequence_output = bert_output.last_hidden_state

        # Przepuszczenie przez LSTM
        lstm_output, (hidden, _) = self.lstm(sequence_output)
        # Jeśli LSTM jest dwukierunkowe, ukryty stan z ostatniej warstwy ma wymiar (num_layers*2, batch, hidden_dim)
        if self.lstm.bidirectional:
            # Pobieramy ostatnie stany z obu kierunków i je łączymy
            hidden_forward = hidden[-2, :, :]  # ostatnia warstwa, kierunek "forward"
            hidden_backward = hidden[-1, :, :]  # ostatnia warstwa, kierunek "backward"
            hidden = torch.cat((hidden_forward, hidden_backward), dim=1)
        else:
            hidden = hidden[-1, :, :]

        dropout_output = self.dropout(hidden)
        logits = self.fc(dropout_output)
        return logits

In [30]:
def freeze_bert_layers(model, freeze_until_layer=6):
    """
    Zamraża wszystkie warstwy enkodera BERT, których numer jest mniejszy niż freeze_until_layer.
    W modelu BERT-base (12 warstw): przy freeze_until_layer=6, zamrożone zostaną warstwy 0-5.
    """
    for name, param in model.named_parameters():
        # Szukamy parametrów, które należą do warstw enkodera
        if "bert.encoder.layer" in name:
            # Nazwa ma postać: bert.encoder.layer.<layer_num>...
            try:
                layer_num = int(name.split('.')[3])
            except:
                continue  # zabezpieczenie, gdyby parsowanie się nie udało
            if layer_num < freeze_until_layer:
                param.requires_grad = False
                # Możesz też dodać print dla debugowania:
                print(f"Freezing parameter: {name}")
    print(f"Frozen BERT layers: 0-{freeze_until_layer - 1}")


In [35]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = BERTLSTMClassifier(bert_model, lstm_layers=LSTM_LAYERS, lstm_hidden_dim=LSTM_HIDDEN_DIM)

model.to(device)
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [18]:
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)

In [19]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH, shuffle=True, num_workers=0)
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH, shuffle=False, num_workers=0)

In [20]:
data = next(iter(train_data_loader))
outputs = model(data["input_ids"], attention_mask=data["attention_mask"])
print(outputs)

tensor([[-0.0092],
        [ 0.0249],
        [-0.0860],
        [ 0.0373],
        [-0.0649],
        [ 0.0022],
        [-0.0425],
        [-0.0131],
        [-0.1055],
        [-0.0718],
        [ 0.0666],
        [-0.0418],
        [-0.0600],
        [-0.0622],
        [-0.0295],
        [-0.1065],
        [-0.0071],
        [-0.0431],
        [-0.0666],
        [-0.0347],
        [-0.0542],
        [-0.0113],
        [ 0.0133],
        [-0.0662],
        [-0.0188],
        [ 0.0187],
        [-0.0207],
        [-0.0160],
        [-0.1542],
        [-0.0030],
        [ 0.0058],
        [-0.0237]], grad_fn=<AddmmBackward0>)


In [21]:
test_text = "We are testing BERT tokenizer."
encodings = tokenizer.encode_plus(test_text,
                                  add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                  max_length = 50,
                                  truncation = True,
                                  padding = "max_length",
                                  return_attention_mask = True,
                                  return_tensors = "pt")

In [22]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#bert_model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
last_hidden_state, pooled_output = bert_model(
    input_ids=encodings['input_ids'],
    attention_mask=encodings['attention_mask']
)

In [31]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        # inputs powinny być wyjściem z modelu (logity)
        bce_loss = self.bce(inputs, targets.float())
        probas = torch.sigmoid(inputs)
        # Obliczenie p_t: dla próbki z target=1 mamy probas, dla target=0 mamy 1 - probas
        p_t = targets * probas + (1 - targets) * (1 - probas)
        loss = self.alpha * (1 - p_t) ** self.gamma * bce_loss
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

loss_fn = FocalLoss(alpha=FOCAL_LOSS_ALFA, gamma=FOCAL_LOSS_GAMMA, reduction='mean')

In [32]:
writer = SummaryWriter(log_dir='logs')
from torch.optim import AdamW
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

NameError: name 'model' is not defined

In [25]:
def train_model(training_loader, model, optimizer):
    """
    Trenuje model na danych treningowych i zwraca model, dokładność, średni loss oraz F1-score.

    Args:
        training_loader (DataLoader): DataLoader z danymi treningowymi.
        model (torch.nn.Module): Model do trenowania.
        optimizer (torch.optim.Optimizer): Optymalizator do aktualizacji wag modelu.
        loss_fn (callable): Funkcja strat, np. nn.BCEWithLogitsLoss.

    Returns:
        model (torch.nn.Module): Wytrenowany model.
        train_accuracy (float): Dokładność modelu na zbiorze treningowym.
        avg_loss (float): Średnia wartość funkcji strat.
        train_f1 (float): F1-score (binary) na zbiorze treningowym.
    """
    # Inicjalizacja zmiennych do śledzenia wyników
    losses = []
    correct_predictions = 0
    num_samples = 0
    all_preds = []
    all_labels = []

    model.train()

    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader), leave=True, colour='steelblue')

    for batch_idx, data in loop:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        #token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)  # Binary targets jako float

        #outputs = model(ids, mask, token_type_ids if 'token_type_ids' in data else None)
        outputs = model(ids, mask if 'token_type_ids' in data else None)

        outputs = outputs.squeeze(-1)  # Dopasowanie wymiarów do binary classification (1D)

        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        preds = torch.sigmoid(outputs) >= 0.5  # Sigmoid + progowanie przy 0.5
        correct_predictions += torch.sum(preds == targets).item()
        num_samples += targets.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        loop.set_postfix(batch_loss=loss.item())

    train_f1 = f1_score(all_labels, all_preds, average='binary')

    return model, correct_predictions / num_samples, np.mean(losses), train_f1

In [26]:
def eval_model(validation_loader, model, epoch):
    """
    Ewaluacja modelu na danych walidacyjnych.

    Args:
        validation_loader (DataLoader): DataLoader z danymi walidacyjnymi.
        model (torch.nn.Module): Model do oceny.
        loss_fn (callable): Funkcja strat, np. nn.BCEWithLogitsLoss.
        epoch (int): Aktualny numer epoki do logowania w TensorBoard.

    Returns:
        val_accuracy (float): Dokładność modelu na zbiorze walidacyjnym.
        avg_loss (float): Średnia wartość funkcji strat na zbiorze walidacyjnym.
        val_f1 (float): F1-score (binary) na zbiorze walidacyjnym.
    """
    # Inicjalizacja zmiennych do śledzenia wyników
    losses = []
    correct_predictions = 0
    num_samples = 0
    all_preds = []
    all_labels = []

    # Ustaw model w tryb ewaluacyjny
    model.eval()

    # Wyłącz gradienty dla ewaluacji
    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)  # Binary targets jako float

            #outputs = model(ids, mask, token_type_ids if 'token_type_ids' in data else None)
            outputs = model(ids, mask if 'token_type_ids' in data else None)

            outputs = outputs.squeeze(-1)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            preds = torch.sigmoid(outputs) >= 0.5  # Sigmoid + progowanie przy 0.5
            correct_predictions += torch.sum(preds == targets).item()
            num_samples += targets.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = np.mean(losses)
    val_f1 = f1_score(all_labels, all_preds, average='binary')

    writer.add_scalar('Loss/validation', avg_loss, epoch)
    writer.add_scalar('F1-Score/validation', val_f1, epoch)

    return correct_predictions / num_samples, avg_loss, val_f1


In [27]:
def count_trainable_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

In [28]:
history = defaultdict(list)
best_f1 = 0
patience_counter = 0


for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')

    model, train_acc, train_loss, train_f1 = train_model(train_data_loader, model, optimizer)
    print(f'Train loss {train_loss:.4f} | Train accuracy {train_acc:.4f} | Train F1 {train_f1:.4f}')

    val_acc, val_loss, val_f1 = eval_model(val_data_loader, model, epoch)
    print(f'Val loss {val_loss:.4f} | Val accuracy {val_acc:.4f} | Val F1 {val_f1:.4f}')

    # Logowanie metryk do TensorBoard
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('F1-Score/train', train_f1, epoch)

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['train_f1'].append(train_f1)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

    if val_f1 > best_f1:
        tokenizer.save_pretrained(output_dir)
        model.bert.save_pretrained(output_dir)
        torch.save(model.state_dict(), os.path.join(output_dir, "best_binary_model_state.bin"))
        #torch.save(model.state_dict(), "best_binary_model_state.bin")
        best_f1 = val_f1
        print("Saved new best model.")

    #scheduler.step(val_loss)  # Tuning LR
    scheduler.step()

writer.close()

Epoch 1/10


  0%|          | 0/65 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Train loss 0.6934 | Train accuracy 0.5215 | Train F1 0.5357
Val loss 0.6870 | Val accuracy 0.5795 | Val F1 0.5542
Saved new best model.
Epoch 2/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.6870 | Train accuracy 0.5356 | Train F1 0.4974
Val loss 0.6810 | Val accuracy 0.5500 | Val F1 0.6711
Saved new best model.
Epoch 3/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.6756 | Train accuracy 0.5834 | Train F1 0.6104
Val loss 0.6612 | Val accuracy 0.6250 | Val F1 0.6857
Saved new best model.
Epoch 4/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.6458 | Train accuracy 0.6327 | Train F1 0.6385
Val loss 0.6408 | Val accuracy 0.6455 | Val F1 0.6594
Epoch 5/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.5970 | Train accuracy 0.7185 | Train F1 0.7233
Val loss 0.6354 | Val accuracy 0.6114 | Val F1 0.6827
Epoch 6/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.5313 | Train accuracy 0.7761 | Train F1 0.7803
Val loss 0.5976 | Val accuracy 0.6818 | Val F1 0.7200
Saved new best model.
Epoch 7/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.4543 | Train accuracy 0.8346 | Train F1 0.8333
Val loss 0.5861 | Val accuracy 0.7182 | Val F1 0.7530
Saved new best model.
Epoch 8/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.4124 | Train accuracy 0.8629 | Train F1 0.8659
Val loss 0.5608 | Val accuracy 0.7341 | Val F1 0.7394
Epoch 9/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.3827 | Train accuracy 0.8824 | Train F1 0.8820
Val loss 0.5551 | Val accuracy 0.7341 | Val F1 0.7473
Epoch 10/10


  0%|          | 0/65 [00:00<?, ?it/s]

Train loss 0.3669 | Train accuracy 0.8849 | Train F1 0.8860
Val loss 0.5595 | Val accuracy 0.7409 | Val F1 0.7554
Saved new best model.


# Transfer learning to ESConv dataset

In [18]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

#dataset = load_data("D:/julixus/MEISD/meisd_project/data/ESConv.json")
dataset = load_data("C:/Users/juwieczo/DataspellProjects/meisd_project/data/ESConv.json")

In [19]:
def extract_seeker_data(data, key):
    result = []

    for entry in data:
        dialog = entry['dialog']
        seeker_dialog = [item['content'].strip() for item in dialog if item['speaker'] == 'seeker']

        quarter_length = max(1, len(seeker_dialog) // 4)

        if key == 'initial_emotion_intensity':
            selected_dialog = seeker_dialog[:quarter_length]
        elif key == 'final_emotion_intensity':
            selected_dialog = seeker_dialog[-quarter_length:]
        else:
            continue

        result.append({
            key: entry['survey_score']['seeker'][key],
            'dialog': selected_dialog
        })

    return result

first_25_percent = extract_seeker_data(dataset, 'initial_emotion_intensity')
#last_25_percent = extract_seeker_data(dataset, 'final_emotion_intensity')

first_25_df = pd.DataFrame(first_25_percent)
#last_25_df = pd.DataFrame(last_25_percent)

first_25_df.head()

Unnamed: 0,initial_emotion_intensity,dialog
0,5,"[Hello, I am having a lot of anxiety about qui..."
1,5,"[hello im looking for someone to talk to, im f..."
2,4,"[Hello, I'm concerned about my job. I have bee..."
3,4,"[I am dong good. You?, I have been staying hom..."
4,5,"[Infinitely complicated., Too many decisions. ..."


In [20]:
first_25_df["dialog"] = first_25_df["dialog"].apply(
    lambda x: " ".join(x) if isinstance(x, list) else x)

In [21]:
label_frequencies = first_25_df['initial_emotion_intensity'].value_counts()
label_frequencies_percent = first_25_df['initial_emotion_intensity'].value_counts(normalize=True) * 100
print(label_frequencies_percent)
print(label_frequencies)

initial_emotion_intensity
4    43.615385
5    32.846154
3    20.538462
2     2.846154
1     0.153846
Name: proportion, dtype: float64
initial_emotion_intensity
4    567
5    427
3    267
2     37
1      2
Name: count, dtype: int64


In [10]:
first_25_df.head(5)

Unnamed: 0,initial_emotion_intensity,dialog
0,5,"[Hello, I am having a lot of anxiety about qui..."
1,5,"[hello im looking for someone to talk to, im f..."
2,4,"[Hello, I'm concerned about my job. I have bee..."
3,4,"[I am dong good. You?, I have been staying hom..."
4,5,"[Infinitely complicated., Too many decisions. ..."


In [12]:
label_counts = first_25_df['initial_emotion_intensity'].value_counts()
least_common_label = label_counts.idxmin()
first_25_df = first_25_df[first_25_df['initial_emotion_intensity'] != least_common_label]
first_25_df['initial_emotion_intensity'] = pd.to_numeric(first_25_df['initial_emotion_intensity'], errors='coerce')
first_25_df['initial_emotion_intensity'] = first_25_df['initial_emotion_intensity'] - 2

In [15]:
first_25_df.rename(columns={
    'dialog': 'Utterances',
    'initial_emotion_intensity': 'label'
}, inplace=True)

In [16]:
first_25_df['label'] = (first_25_df['label'] == 2).astype(int)
columns = ['Utterances', 'label']
df = first_25_df[columns].copy()

In [23]:
df_train_esconv, df_temp_esconv = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
df_valid_esconv, df_test_esconv = train_test_split(df_temp_esconv, random_state=88, test_size=0.50, shuffle=True)
print(f"ESConv: Train {df_train_esconv.shape}, Valid {df_valid_esconv.shape}, Test {df_test_esconv.shape}")

ESConv: Train (882, 2), Valid (189, 2), Test (190, 2)


In [43]:
class ESConvDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # token_type_ids=False wyłączone
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [49]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
train_dataset_esconv = ESConvDataset(texts=df_train_esconv['Utterances'], labels=df_train_esconv['label'], tokenizer=tokenizer, max_len=MAX_LEN)
valid_dataset_esconv = ESConvDataset(texts=df_valid_esconv['Utterances'], labels=df_train_esconv['label'], tokenizer=tokenizer, max_len=MAX_LEN)
train_data_loader_esconv = torch.utils.data.DataLoader(train_dataset_esconv, batch_size=BATCH, shuffle=True)
valid_data_loader_esconv = torch.utils.data.DataLoader(valid_dataset_esconv, batch_size=BATCH, shuffle=False)

In [50]:
print(tokenizer.vocab_size)

28996


In [51]:
print(tokenizer)

BertTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [52]:
model.load_state_dict(torch.load("best_binary_model_state.bin"))
model.to(device)

RuntimeError: Error(s) in loading state_dict for BERTLSTMClassifier:
	size mismatch for bert.embeddings.word_embeddings.weight: copying a param with shape torch.Size([30522, 768]) from checkpoint, the shape in current model is torch.Size([28996, 768]).

In [None]:
total_params, trainable_params = count_trainable_parameters(model)
print(f"Before freezing on ESConv: Total params: {total_params}, Trainable params: {trainable_params}")

In [None]:
freeze_bert_layers(model, freeze_until_layer=6)
total_params, trainable_params = count_trainable_parameters(model)
print(f"After freezing on ESConv: Total params: {total_params}, Trainable params: {trainable_params}")

In [None]:
optimizer_esconv = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE_FINE, weight_decay=WEIGHT_DECAY)
scheduler_esconv = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_esconv, T_max=EPOCHS)

In [None]:
history_esconv = defaultdict(list)
best_f1_esconv = 0

print("Fine-tuning on ESConv...")
for epoch in range(1, EPOCHS + 1):
    print(f'Fine-tuning Epoch {epoch}/{EPOCHS} (ESConv)')
    model, train_acc, train_loss, train_f1 = train_model(train_data_loader_esconv, model, optimizer_esconv)
    print(f'ESConv Train loss {train_loss:.4f} | Train accuracy {train_acc:.4f} | Train F1 {train_f1:.4f}')
    val_acc, val_loss, val_f1 = eval_model(valid_data_loader_esconv, model, epoch)
    print(f'ESConv Val loss {val_loss:.4f} | Val accuracy {val_acc:.4f} | Val F1 {val_f1:.4f}')
    history_esconv['train_acc'].append(train_acc)
    history_esconv['train_loss'].append(train_loss)
    history_esconv['train_f1'].append(train_f1)
    history_esconv['val_acc'].append(val_acc)
    history_esconv['val_loss'].append(val_loss)
    history_esconv['val_f1'].append(val_f1)
    if val_f1 > best_f1_esconv:
        torch.save(model.state_dict(), "best_fine_tuned_model_state.bin")
        best_f1_esconv = val_f1
        print("Saved new best ESConv model.")
    scheduler_esconv.step()
print("Fine-tuning on ESConv complete!")
# Zamknij TensorBoard writer
writer.close()

In [None]:
test_dataset_esconv = CustomDataset(df_test_esconv, tokenizer, MAX_LEN)
test_data_loader_esconv = torch.utils.data.DataLoader(test_dataset_esconv, batch_size=BATCH, shuffle=False)
predictions, true_labels = [], []
model.eval()
with torch.no_grad():
    for batch in test_data_loader_esconv:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)
        outputs = model(input_ids, attention_mask)
        outputs = outputs.squeeze(-1)
        preds = torch.sigmoid(outputs) >= 0.5
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print("ESConv Test Set Report:")
print(classification_report(true_labels, predictions))


In [ ]:
from sklearn.metrics import classification_report

report = classification_report(true_labels, predictions, output_dict=True)

import json
with open("esconv_classification_report.json", "w") as f:
    json.dump(report, f, indent=4)

print("Zapisano classification_report do pliku esconv_classification_report.json")


In [ ]:
import os
import time

print("Zamykam komputer za 1 minutę...")
time.sleep(60) 
os.system("shutdown /s /t 1")