In [22]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm.notebook as tq
from sklearn.metrics import f1_score
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
# --- Tokenisation / batching ---
MAX_LEN = 100  # token length each utterance is padded/truncated to in CustomDataset
BATCH = 8      # batch size of the train/validation/test DataLoaders

# --- Pretrained checkpoint ---
# RegressionModel loads this name with BertModel, passes token_type_ids, and the freezing
# cells index encoder layers up to 11, so this must be a 12-layer BERT checkpoint.
# "distilbert-base-uncased" was used here before, but BertModel cannot reuse DistilBERT
# weights (different architecture and parameter names): the encoder ends up freshly
# initialised instead of pretrained. "bert-base-uncased" uses the same uncased WordPiece
# vocabulary, so the tokenizer output is unaffected.
PRE_TRAINED_MODEL_NAME = "bert-base-uncased"  # alternative: 'bert-base-cased' (needs a cased tokenizer)

# --- Optimisation ---
EPOCHS = 8             # upper bound on training epochs (early stopping may end sooner)
LEARNING_RATE = 0.001  # AdamW learning rate
THRESHOLD = 0.2        # NOTE(review): not referenced by any cell reviewed here
DROPOUT_RATE = 0.6     # dropout applied to the [CLS] embedding before the regression head
WEIGHT_DECAY = 0.001   # AdamW weight decay

# --- ReduceLROnPlateau / early stopping ---
MODE = 'min'     # the monitored quantity (validation loss) should decrease
PATIENCE = 2     # epochs without improvement tolerated by the scheduler and by early stopping
FACTOR = 0.5     # multiplicative learning-rate reduction applied by the scheduler
VERBOSE = True   # forwarded to the scheduler's `verbose` argument

In [24]:
# Location of the dataset CSV. Set the MEISD_DATA_PATH environment variable to point at a
# local copy; the original author's absolute path is kept as the fallback so existing
# setups keep working.
DATA_PATH = os.environ.get(
    'MEISD_DATA_PATH',
    'C:/Users/juwieczo/DataspellProjects/meisd_project/datafirst_25_percent.csv',
)
df = pd.read_csv(DATA_PATH)

In [25]:
# Wider column selection kept for reference:
# columns = ['Utterances', 'dialog_ids', 'uttr_ids', 'intensity', 'intensity2', 'intensity3']
# Only the utterance text and its continuous label are needed for the regression task.
columns = ['Utterances', 'label']
df = df.loc[:, columns].copy()

In [26]:
# Preview the first rows of the two selected columns.
df.head()

Unnamed: 0,Utterances,label
0,look around you say hello to your competition ...,0.0
1,"i'm george o'malley uh, we met at the mixer. y...",1.344341
2,seattle is surrounded by water on three sides ...,1.175248
3,yes no other reason? just a favor for an old p...,1.178085
4,if he doesn't respond to these tests in the ne...,1.571909


In [27]:
from transformers import BertTokenizer, BertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sanity check: encode one sentence, padded/truncated to 50 tokens, and print the tensors.
test_text = "We are testing BERT tokenizer."
encodings = tokenizer.encode_plus(
    test_text,
    return_tensors="pt",
    return_attention_mask=True,
    padding="max_length",
    truncation=True,
    max_length=50,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
)

for caption, field in (("Input IDs:", "input_ids"), ("Attention Mask:", "attention_mask")):
    print(caption, encodings[field])


Input IDs: tensor([[  101,  2057,  2024,  5604, 14324, 19204, 17629,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])


In [28]:
# Token count of every utterance, capped at 512 by truncation.
# str() mirrors CustomDataset.__getitem__, so a non-string cell (e.g. a missing value read
# as NaN) is tokenised the same way the dataset would tokenise it instead of raising.
# NOTE(review): token_lens is not read by any later cell reviewed here; presumably it was
# meant to inform the choice of MAX_LEN.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in df['Utterances']
]

In [29]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that tokenises utterances on access and pairs them with a float target.

    Args:
        df: frame with an 'Utterances' column (text) and a 'label' column (cast to float).
        tokenizer: object exposing ``encode_plus``; called once per item.
        max_len: length every encoded utterance is padded or truncated to.
    """

    _ENCODED_FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.utterances = [*df['Utterances']]
        # Targets are continuous, so they are stored as floats.
        self.targets = self.df['label'].astype(float).to_numpy()

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.utterances)

    def __getitem__(self, index):
        """Return the encoded utterance at ``index`` together with its target.

        The result is a dict with 1-D 'input_ids', 'attention_mask' and 'token_type_ids'
        tensors plus a scalar float tensor under 'targets'.
        """
        text = str(self.utterances[index])

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten() drops it.
        item = {field: encoded[field].flatten() for field in self._ENCODED_FIELDS}
        item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item


In [30]:
from sklearn.model_selection import train_test_split

# Keep 70 % of the rows for training; the other 30 % are held out.
df_train, df_holdout = train_test_split(df, test_size=0.30, shuffle=True, random_state=77)
# Divide the held-out rows evenly into a test set and a validation set.
df_test, df_valid = train_test_split(df_holdout, test_size=0.50, shuffle=True, random_state=88)

In [31]:
# df is the full dataset, not the training split, so it is labelled as such and the real
# training split size is reported next to it.
print(f"Full dataset size: {df.shape}, Train size: {df_train.shape}")
print(f"Validation size: {df_valid.shape}, Test size: {df_test.shape}")
# The three splits must partition the dataset exactly.
assert len(df_train) + len(df_valid) + len(df_test) == len(df)

Original train size: (1124, 2)
Validation size: (169, 2), Test size: (169, 2)


In [32]:
# Every column after the first one is treated as a target column.
target_list = df.columns.tolist()[1:]
target_list

['label']

In [33]:
class RegressionModel(nn.Module):
    """BERT encoder followed by dropout and a one-unit linear regression head.

    Args:
        pretrained_model_name: checkpoint name handed to ``BertModel.from_pretrained``.
            NOTE(review): BertModel expects a BERT-architecture checkpoint; confirm the
            configured name is one.
        dropout_rate: dropout probability applied to the [CLS] embedding.
    """

    def __init__(self, pretrained_model_name, dropout_rate):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # A single output unit: the model predicts one continuous value per input.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head output computed from the first-token ([CLS]) hidden state."""
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Position 0 of the last hidden state is the [CLS] token.
        cls_embedding = encoder_output.last_hidden_state[:, 0, :]
        return self.out(self.dropout(cls_embedding))

In [None]:
def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a single-process DataLoader using the notebook batch size."""
    return torch.utils.data.DataLoader(dataset, batch_size=BATCH, shuffle=shuffle, num_workers=0)


# One dataset per split, all sharing the tokenizer and the maximum sequence length.
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)

# Only the training data is shuffled; validation and test keep their order.
train_data_loader = _make_loader(train_dataset, shuffle=True)
val_data_loader = _make_loader(valid_dataset, shuffle=False)
test_data_loader = _make_loader(test_dataset, shuffle=False)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is moved to ``device``, the model output is squeezed on its last axis,
    and one optimizer step is taken per batch.

    Returns:
        Tuple ``(mean batch loss, mse, mae, r2, pearson)``; the four metrics come from
        ``compute_metrics`` over all targets/predictions collected during the epoch.
    """
    model.train()
    batch_losses, seen_targets, seen_preds = [], [], []
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    for batch in tqdm(data_loader, desc='Training', leave=False):
        inputs = tuple(batch[key].to(device) for key in input_keys)
        targets = batch["targets"].to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops the trailing singleton axis so outputs line up with targets.
        outputs = model(*inputs).squeeze(-1)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        seen_targets.extend(targets.cpu().numpy())
        seen_preds.extend(outputs.detach().cpu().numpy())

    mse, mae, r2, pearson_corr = compute_metrics(np.array(seen_targets), np.array(seen_preds))
    return np.mean(batch_losses), mse, mae, r2, pearson_corr

In [None]:
def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Returns:
        Tuple ``(mean batch loss, mse, mae, r2, pearson)``; the four metrics come from
        ``compute_metrics`` over all targets/predictions seen.
    """
    model.eval()
    batch_losses, seen_targets, seen_preds = [], [], []
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Validation', leave=False):
            inputs = tuple(batch[key].to(device) for key in input_keys)
            targets = batch["targets"].to(device)

            # squeeze(-1) drops the trailing singleton axis so outputs line up with targets.
            outputs = model(*inputs).squeeze(-1)
            batch_losses.append(loss_fn(outputs, targets).item())

            seen_targets.extend(targets.cpu().numpy())
            seen_preds.extend(outputs.detach().cpu().numpy())

    mse, mae, r2, pearson_corr = compute_metrics(np.array(seen_targets), np.array(seen_preds))
    return np.mean(batch_losses), mse, mae, r2, pearson_corr


In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, device, epochs=10, patience=3,
                scheduler=None):
    """Train ``model`` with per-epoch validation and early stopping on validation loss.

    Args:
        model: module to train.
        train_loader: batches used by ``train_epoch``.
        val_loader: batches used by ``eval_model``.
        loss_fn: loss applied to (outputs, targets).
        optimizer: optimizer stepped once per training batch.
        device: device the batches are moved to.
        epochs: maximum number of epochs.
        patience: epochs without validation-loss improvement before stopping.
        scheduler: optional learning-rate scheduler whose ``step`` accepts the validation
            loss (e.g. ``ReduceLROnPlateau``); stepped once per epoch. ``None`` (default)
            keeps the learning rate untouched.

    Returns:
        dict with 'train_loss' and 'val_loss' lists, one entry per completed epoch.
    """
    early_stopping = EarlyStopping(patience=patience, mode='min')
    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train_loss, train_mse, train_mae, train_r2, train_pearson = train_epoch(model, train_loader, loss_fn, optimizer, device)
        val_loss, val_mse, val_mae, val_r2, val_pearson = eval_model(model, val_loader, loss_fn, device)

        print(f"Train Loss: {train_loss:.4f}, MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, R2: {train_r2:.4f}, Pearson: {train_pearson:.4f}")
        print(f"Val Loss: {val_loss:.4f}, MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, R2: {val_r2:.4f}, Pearson: {val_pearson:.4f}")

        # Keep the per-epoch losses so callers can plot learning curves afterwards.
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        # Let a plateau-style scheduler react to this epoch's validation loss.
        if scheduler is not None:
            scheduler.step(val_loss)

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered.")
            break

    return history


In [None]:
class EarlyStopping:
    """Track a monitored score and flag when it has stopped improving.

    Call the instance once per epoch with the latest score. ``early_stop`` becomes True
    after ``patience`` consecutive calls without an improvement of more than ``delta``.

    Args:
        patience: consecutive non-improving calls tolerated before stopping.
        mode: 'min' if lower scores are better, 'max' if higher scores are better.
        delta: margin by which a score must beat the best one to count as improvement.

    Raises:
        ValueError: if ``mode`` is neither 'min' nor 'max'. Without this check a
            misspelled mode would never register an improvement after the first call.
    """

    _VALID_MODES = ('min', 'max')

    def __init__(self, patience=3, mode='min', delta=0):
        if mode not in self._VALID_MODES:
            raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.best_score = None       # best score seen so far (None until the first call)
        self.epochs_no_improve = 0   # consecutive calls without improvement
        self.early_stop = False      # set once patience is exhausted

    def __call__(self, score, model=None):
        """Record ``score``; ``model`` is accepted for call compatibility and not used."""
        if self.best_score is None:
            improved = True
        elif self.mode == 'min':
            improved = score < self.best_score - self.delta
        else:
            improved = score > self.best_score + self.delta

        if improved:
            self.best_score = score
            self.epochs_no_improve = 0
            return

        self.epochs_no_improve += 1
        if self.epochs_no_improve >= self.patience:
            self.early_stop = True


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

def compute_metrics(y_true, y_pred):
    """Score predictions against targets.

    Returns:
        Tuple ``(mse, mae, r2, pearson_corr)`` where ``pearson_corr`` is the correlation
        coefficient returned by ``scipy.stats.pearsonr`` (its p-value is discarded).
    """
    pearson_corr = pearsonr(y_true, y_pred)[0]
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
        pearson_corr,
    )


In [None]:
# Build the regression model and place it on the selected device.
model = RegressionModel(PRE_TRAINED_MODEL_NAME, DROPOUT_RATE).to(device)

# Mean squared error is the training objective; AdamW updates the model parameters.
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LEARNING_RATE)

# Learning-rate scheduler driven by a monitored metric.
# NOTE(review): the train_model(...) calls in this notebook do not receive this scheduler,
# so nothing steps it — confirm whether that is intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode=MODE,
    factor=FACTOR,
    patience=PATIENCE,
    verbose=VERBOSE,
)

In [None]:
# Candidate sets of encoder layer indices to freeze (top | middle | bottom); one is picked below.
bottom = range(2, 12)
middle = [*range(0, 5), *range(7, 12)]
top = range(0, 10)

layersToFreeze = top
for i in layersToFreeze:
    print(i)
    # Turn off gradient tracking for every parameter of encoder layer i.
    model.bert.encoder.layer[i].requires_grad_(False)

# Keep the model on the selected device; as the cell's last expression this also displays it.
model.to(device)

In [None]:
# Train the model prepared above (with the layers frozen in the previous cell) for at most
# EPOCHS epochs, stopping early after PATIENCE epochs without validation-loss improvement.
train_model(model, train_data_loader, val_data_loader, loss_fn, optimizer, device, epochs=EPOCHS, patience=PATIENCE)

In [None]:
# Helper for trying out different layer-freezing configurations
def freeze_layers(model, layers_to_freeze):
    """
    Freeze the given encoder layers of the BERT model.

    For each index in ``layers_to_freeze`` the parameters of
    ``model.bert.encoder.layer[index]`` stop requiring gradients. The same model
    object is returned.
    """
    for layer_idx in layers_to_freeze:
        print(f"Freezing layer {layer_idx}")
        model.bert.encoder.layer[layer_idx].requires_grad_(False)
    return model

# Layer-freezing configurations. Each value lists the encoder layer indices passed to
# freeze_layers, i.e. the layers that get FROZEN; the remaining indices stay trainable.
layer_configurations = {
    'bottom': range(2, 12),  # freezes layers 2-11
    'middle': list(range(0,5)) + list(range(7,12)),  # freezes layers 0-4 and 7-11
    'top': range(0, 10)  # freezes layers 0-9
}

results = {}

# Same seed before every configuration, so that the randomly initialised regression head
# and the shuffling order of the training loader are identical across runs and the
# comparison reflects the freezing scheme rather than random variation.
experiment_seed = 77

for config_name, layers_to_freeze in layer_configurations.items():
    # Report the frozen indices explicitly: the configuration values are the layers that
    # get frozen, which is not obvious from the configuration name alone.
    print(f"\nTraining configuration '{config_name}' (frozen encoder layers: {list(layers_to_freeze)}):")

    torch.manual_seed(experiment_seed)

    # Start from a fresh model for every configuration
    model = RegressionModel(pretrained_model_name=PRE_TRAINED_MODEL_NAME, dropout_rate=DROPOUT_RATE)
    model = model.to(device)

    # Freeze the selected layers
    model = freeze_layers(model, layers_to_freeze)

    # Fresh optimizer and loss for this run
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    loss_fn = nn.MSELoss().to(device)

    # Train the model
    train_model(model, train_data_loader, val_data_loader, loss_fn, optimizer, device, epochs=EPOCHS, patience=PATIENCE)

    # Evaluate on the held-out test set
    test_loss, test_mse, test_mae, test_r2, test_pearson = eval_model(model, test_data_loader, loss_fn, device)

    results[config_name] = {
        'Test Loss': test_loss,
        'Test MSE': test_mse,
        'Test MAE': test_mae,
        'Test R2': test_r2,
        'Test Pearson': test_pearson
    }
    print(f"\nResults for {config_name}:")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Test R2: {test_r2:.4f}")
    print(f"Test Pearson Correlation: {test_pearson:.4f}")

# Final summary comparing the effect of the different layer-freezing configurations
print("\nFinal comparison of different layer freezing configurations:")
for config_name in results:
    print(f"\n{config_name}:")
    for metric in results[config_name]:
        print(f"{metric}: {results[config_name][metric]:.4f}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style settings
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Test loss of each configuration from the results dictionary.
# Each 'Test Loss' entry is a single number (the mean batch loss on the test set), not a
# per-epoch series, so calling len() on it fails and there is no curve to draw. The
# configurations are therefore compared with one bar each.
bottom_results = results['bottom']['Test Loss']
middle_results = results['middle']['Test Loss']
top_results = results['top']['Test Loss']

# One row per configuration
df_results = pd.DataFrame(
    {'Test Loss': [bottom_results, middle_results, top_results]},
    index=pd.Index(['Bottom', 'Middle', 'Top'], name='Configuration'),
)

# Draw the chart
fig, ax = plt.subplots()
ax.bar(df_results.index, df_results['Test Loss'], color=['b', 'g', 'r'])

# Chart labels
ax.set_title("Freeze Different Layers")
ax.set_xlabel("Configuration")
ax.set_ylabel("Test Loss")

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Training results used by the plots below.
# These values are hard-coded, and this assignment replaces any `results` dictionary
# computed earlier in the notebook.
# NOTE(review): the provenance of these numbers is not shown here — confirm they were
# copied from an actual run before relying on the figures.
results = {
    'no_frozen': {
        'Test MSE': 0.1873, 'Test MAE': 0.3314, 'Test R2': 0.4321, 'Test Pearson': 0.7119,
        'epochs': [1, 2, 3, 4, 5],
        'train_loss': [0.5632, 0.3987, 0.3521, 0.3256, 0.3124],
        'val_loss': [0.4213, 0.3894, 0.3741, 0.3629, 0.3557]
    },
    'bottom_frozen': {
        'Test MSE': 0.2015, 'Test MAE': 0.3452, 'Test R2': 0.3984, 'Test Pearson': 0.6897,
        'epochs': [1, 2, 3, 4, 5],
        'train_loss': [0.5921, 0.4238, 0.3765, 0.3498, 0.3365],
        'val_loss': [0.4392, 0.4047, 0.3874, 0.3752, 0.3689]
    },
    'middle_frozen': {
        'Test MSE': 0.2157, 'Test MAE': 0.3583, 'Test R2': 0.3628, 'Test Pearson': 0.6724,
        'epochs': [1, 2, 3],
        'train_loss': [0.6215, 0.4473, 0.3984],
        'val_loss': [0.4537, 0.4198, 0.4021]
    },
    'top_frozen': {
        'Test MSE': 0.2301, 'Test MAE': 0.3726, 'Test R2': 0.3294, 'Test Pearson': 0.6543,
        'epochs': [1, 2, 3, 4, 5],
        'train_loss': [0.6452, 0.4721, 0.4195, 0.3956, 0.3812],
        'val_loss': [0.4682, 0.4328, 0.4156, 0.4039, 0.3971]
    }
}

# Plot 1: training progress (dashed = training loss, solid = validation loss)
plt.figure(figsize=(10, 5))
for key in results:
    run = results[key]
    plt.plot(run['epochs'], run['train_loss'], linestyle='dashed', label=f'Train Loss ({key})')
    plt.plot(run['epochs'], run['val_loss'], label=f'Val Loss ({key})')
plt.title("Training and Validation Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot 2: comparison of the test metrics, one group of bars per metric
metrics = ['Test MSE', 'Test MAE', 'Test R2', 'Test Pearson']
strategies = list(results.keys())

plt.figure(figsize=(12, 6))
x = np.arange(len(metrics))
width = 0.2

for i, strategy in enumerate(strategies):
    plt.bar(x + i * width, [results[strategy][metric] for metric in metrics], width, label=strategy)
# Bars of a group sit at offsets 0 .. (n - 1) * width, so the group centre is half of that
# span; a fixed offset of one bar width leaves the label off-centre for four strategies.
plt.xticks(x + width * (len(strategies) - 1) / 2, metrics, rotation=45)
plt.ylabel("Score")
plt.title("Test Performance Metrics Comparison")
plt.legend()
plt.show()

# Plot 3: test R2 against test Pearson correlation, one point per strategy
r2_values = [results[name]['Test R2'] for name in strategies]
pearson_values = [results[name]['Test Pearson'] for name in strategies]

plt.figure(figsize=(6, 6))
sns.scatterplot(x=r2_values, y=pearson_values, hue=strategies, s=100)
# Dashed reference lines through zero on both axes.
plt.axhline(0, color='grey', linestyle='--')
plt.axvline(0, color='grey', linestyle='--')
plt.title("Test R2 vs Pearson Correlation")
plt.xlabel("Test R2")
plt.ylabel("Test Pearson Correlation")
plt.legend()
plt.show()


In [None]:
# import torch.optim.lr_scheduler as lr_scheduler
# 
# PATIENCE = 3 
# best_val_loss = float("inf")
# epochs_no_improve = 0
# 
# scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=FACTOR, patience=1, verbose=True)
# 
# for epoch in range(EPOCHS):
#     print(f'Epoch {epoch + 1}/{EPOCHS}')
# 
#     train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
#     val_loss = eval_model(model, val_data_loader, loss_fn, device)
# 
#     print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
# 
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         epochs_no_improve = 0
#         torch.save(model.state_dict(), "best_model.pt")
#         print("Model saved!")
#     else:
#         epochs_no_improve += 1
#         print(f'No improvement for {epochs_no_improve} epoch(s).')
# 
#     scheduler.step(val_loss)
# 
#     # Early Stopping
#     if epochs_no_improve >= PATIENCE:
#         print("Early stopping triggered!")
#         break


In [None]:
# model.load_state_dict(torch.load('best_model_state.bin'))
# 
# test_loss = eval_model(model, test_data_loader, loss_fn, device)
# print(f"Test loss: {test_loss}")

In [None]:
# import matplotlib.pyplot as plt
# 
# # Przyklad wizualizacji
# plt.scatter(total_targets, total_preds, alpha=0.5)
# plt.xlabel("Prawdziwe wartości")
# plt.ylabel("Predykcje")
# plt.title("Porównanie predykcji z rzeczywistością")
# plt.show()
