In [1]:
import pandas as pd
import numpy as np
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

from tqdm import tqdm

from sklearn.metrics import f1_score
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
MAX_LEN = 100
BATCH = 8
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased" #'bert-base-cased'
EPOCHS = 8
LEARNING_RATE = 0.0001
THRESHOLD = 0.2
DROPOUT_RATE = 0.4
WEIGHT_DECAY = 0.001
MODE = 'min'
PATIENCE = 2
FACTOR = 0.5
VERBOSE = True

In [3]:
df = pd.read_csv('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/balanced_augmented_data_primary_intensity.csv')

In [4]:
#columns = ['Utterances', 'dialog_ids', 'uttr_ids', 'intensity', 'intensity2', 'intensity3']
columns = ['Utterances', 'label']
df = df[columns].copy()

In [5]:
from transformers import BertTokenizer, BertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

test_text = "We are testing BERT tokenizer."
encodings = tokenizer.encode_plus(
    test_text,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    max_length=50,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

print("Input IDs:", encodings["input_ids"])
print("Attention Mask:", encodings["attention_mask"])


Input IDs: tensor([[  101,  2057,  2024,  5604, 14324, 19204, 17629,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])


In [6]:
token_lens = []

for txt in df['Utterances']:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

In [7]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = list(df['Utterances'])
        self.targets = self.df['label'].astype(float).values  # Zmieniamy na float
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterances = str(self.utterances[index])

        inputs = self.tokenizer.encode_plus(
            utterances,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.targets[index], dtype=torch.float)  # Używamy float

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': target  # Używamy ciągłego celu
        }


In [8]:
from sklearn.model_selection import train_test_split

# split into train and test
df_train, df_test = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
# split test into test and validation datasets
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [9]:
print(f"Original train size: {df.shape}")
print(f"Validation size: {df_valid.shape}, Test size: {df_test.shape}")

Original train size: (1584, 2)
Validation size: (238, 2), Test size: (238, 2)


In [10]:
label_frequencies = df_train['label'].value_counts()
label_frequencies_percent = df_train['label'].value_counts(normalize=True) * 100
print(label_frequencies_percent)
print(label_frequencies)

label
0    26.263538
1    25.902527
3    25.090253
2    22.743682
Name: proportion, dtype: float64
label
0    291
1    287
3    278
2    252
Name: count, dtype: int64


In [11]:
target_list = list(df.columns)
target_list = target_list[1:]
target_list

['label']

In [12]:
class RegressionModel(nn.Module):
    def __init__(self, pretrained_model_name, dropout_rate):
        super(RegressionModel, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.out = nn.Linear(self.bert.config.hidden_size, 1)  # Jedna jednostka wyjściowa

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = output.last_hidden_state[:, 0, :]  # Użycie [CLS] tokena
        pooled_output = self.dropout(pooled_output)
        return self.out(pooled_output)

In [13]:
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH, shuffle=True, num_workers=0)
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH, shuffle=False, num_workers=0)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH, shuffle=False, num_workers=0)

In [14]:
# from torch.utils.data import DataLoader
# 
# def create_data_loader(df, tokenizer, max_len, batch_size):
#     ds = CustomDataset(
#         df=df,
#         tokenizer=tokenizer,
#         max_len=max_len
#     )
#     return DataLoader(ds, batch_size=batch_size, num_workers=4)
# 
# train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH)
# valid_data_loader = create_data_loader(df_valid, tokenizer, MAX_LEN, BATCH)
# test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH)


In [15]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_targets = []
    total_preds = []
    losses = []

    for batch in tqdm(data_loader, desc='Training', leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        targets = batch["targets"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids).squeeze(-1)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        total_targets.extend(targets.cpu().numpy())
        total_preds.extend(outputs.cpu().detach().numpy())

    mse, mae, r2, pearson_corr = compute_metrics(np.array(total_targets), np.array(total_preds))
    return np.mean(losses), mse, mae, r2, pearson_corr

In [16]:
def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_targets = []
    total_preds = []
    losses = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Validation', leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            targets = batch["targets"].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids).squeeze(-1)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            total_targets.extend(targets.cpu().numpy())
            total_preds.extend(outputs.cpu().detach().numpy())

    mse, mae, r2, pearson_corr = compute_metrics(np.array(total_targets), np.array(total_preds))
    return np.mean(losses), mse, mae, r2, pearson_corr


In [17]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, device, epochs=10, patience=3):
    early_stopping = EarlyStopping(patience=patience, mode='min')

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train_loss, train_mse, train_mae, train_r2, train_pearson = train_epoch(model, train_loader, loss_fn, optimizer, device)
        val_loss, val_mse, val_mae, val_r2, val_pearson = eval_model(model, val_loader, loss_fn, device)

        print(f"Train Loss: {train_loss:.4f}, MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, R2: {train_r2:.4f}, Pearson: {train_pearson:.4f}")
        print(f"Val Loss: {val_loss:.4f}, MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, R2: {val_r2:.4f}, Pearson: {val_pearson:.4f}")

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered.")
            break


In [18]:
class EarlyStopping:
    def __init__(self, patience=3, mode='min', delta=0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.best_score = None
        self.epochs_no_improve = 0
        self.early_stop = False

    def __call__(self, score, model):
        if self.best_score is None or \
                (self.mode == 'min' and score < self.best_score - self.delta) or \
                (self.mode == 'max' and score > self.best_score + self.delta):
            self.best_score = score
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1
            if self.epochs_no_improve >= self.patience:
                self.early_stop = True


In [19]:
# def train_epoch(model, data_loader, loss_fn, optimizer, device):
#     model.train()
#     total_targets = []
#     total_preds = []
#     losses = []
# 
#     for batch in tqdm(data_loader, desc='Training', leave=False):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         token_type_ids = batch["token_type_ids"].to(device)
#         targets = batch["targets"].to(device)
# 
#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask, token_type_ids).squeeze(-1)
#         loss = loss_fn(outputs, targets)
#         loss.backward()
#         optimizer.step()
# 
#         losses.append(loss.item())
#         total_targets.extend(targets.cpu().numpy())
#         total_preds.extend(outputs.cpu().detach().numpy())
# 
#     mse, mae, r2, pearson_corr = compute_metrics(np.array(total_targets), np.array(total_preds))
#     return np.mean(losses), mse, mae, r2, pearson_corr


In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

def compute_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred)
    return mse, mae, r2, pearson_corr


In [21]:
model = RegressionModel(pretrained_model_name=PRE_TRAINED_MODEL_NAME, dropout_rate=DROPOUT_RATE)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_fn = nn.MSELoss().to(device)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode=MODE, patience=PATIENCE, factor=FACTOR, verbose=VERBOSE
)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.inte

In [22]:
train_model(model, train_data_loader, val_data_loader, loss_fn, optimizer, device, epochs=10, patience=3)


Epoch 1/10


                                                           

Train Loss: 2.1028, MSE: 2.1055, MAE: 1.1571, R2: -0.6501, Pearson: 0.0220
Val Loss: 1.6171, MSE: 1.6089, MAE: 1.0616, R2: -0.2893, Pearson: 0.1315
Epoch 2/10


                                                           

Train Loss: 1.5413, MSE: 1.5445, MAE: 1.0563, R2: -0.2104, Pearson: -0.0011
Val Loss: 1.3084, MSE: 1.3020, MAE: 1.0125, R2: -0.0434, Pearson: 0.1717
Epoch 3/10


                                                           

Train Loss: 1.4115, MSE: 1.4142, MAE: 1.0235, R2: -0.1084, Pearson: 0.0103
Val Loss: 1.3890, MSE: 1.3854, MAE: 0.9720, R2: -0.1102, Pearson: 0.2484
Epoch 4/10


                                                           

Train Loss: 1.3793, MSE: 1.3811, MAE: 1.0178, R2: -0.0824, Pearson: 0.0299
Val Loss: 1.3596, MSE: 1.3527, MAE: 1.0187, R2: -0.0840, Pearson: 0.2794
Epoch 5/10


                                                           

Train Loss: 1.4056, MSE: 1.4028, MAE: 1.0268, R2: -0.0994, Pearson: -0.0043
Val Loss: 1.3159, MSE: 1.3118, MAE: 0.9799, R2: -0.0512, Pearson: 0.2991
Early stopping triggered.




In [23]:
# import torch.optim.lr_scheduler as lr_scheduler
# 
# PATIENCE = 3 
# best_val_loss = float("inf")
# epochs_no_improve = 0
# 
# scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=FACTOR, patience=1, verbose=True)
# 
# for epoch in range(EPOCHS):
#     print(f'Epoch {epoch + 1}/{EPOCHS}')
# 
#     train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
#     val_loss = eval_model(model, val_data_loader, loss_fn, device)
# 
#     print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
# 
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         epochs_no_improve = 0
#         torch.save(model.state_dict(), "best_model.pt")
#         print("Model saved!")
#     else:
#         epochs_no_improve += 1
#         print(f'No improvement for {epochs_no_improve} epoch(s).')
# 
#     scheduler.step(val_loss)
# 
#     # Early Stopping
#     if epochs_no_improve >= PATIENCE:
#         print("Early stopping triggered!")
#         break


In [24]:
# model.load_state_dict(torch.load('best_model_state.bin'))
# 
# test_loss = eval_model(model, test_data_loader, loss_fn, device)
# print(f"Test loss: {test_loss}")

In [25]:
# import matplotlib.pyplot as plt
# 
# # Przyklad wizualizacji
# plt.scatter(total_targets, total_preds, alpha=0.5)
# plt.xlabel("Prawdziwe wartości")
# plt.ylabel("Predykcje")
# plt.title("Porównanie predykcji z rzeczywistością")
# plt.show()
