In [None]:
import pandas as pd
import numpy as np
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

from sklearn.metrics import f1_score
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# --- Data / model ---
NUM_CLASSES = 4    # width of the classifier's output layer
MAX_LEN = 100      # token length every utterance is padded / truncated to
BATCH = 8          # DataLoader batch size
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'  # previously tried: 'bert-base-cased'
DROPOUT_RATE = 0.5

# --- Optimisation ---
EPOCHS = 50
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.2
THRESHOLD = 0.2    # not referenced by any other cell visible in this notebook

# --- ReduceLROnPlateau arguments ---
MODE = 'min'
PATIENCE = 2
FACTOR = 0.5
VERBOSE = True

In [None]:
# Load the dataset used for training.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact file exists; consider a configurable data directory.
df = pd.read_csv('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/balanced_augmented_data_primary_intensity.csv')
# Convert the values to integers
# df['intensity'] = pd.to_numeric(df['intensity'], errors='coerce').fillna(0)
# df['intensity2'] = pd.to_numeric(df['intensity2'], errors='coerce').fillna(0)
# df['intensity3'] = pd.to_numeric(df['intensity3'], errors='coerce').fillna(0)

# Replace values that contain only whitespace, or one of '`', 'neu', 'po', with NaN
# df['intensity'] = df['intensity'].replace(r'^\s*$', np.nan, regex=True)
# df['intensity'] = df['intensity'].replace(['`', 'neu', 'po'], np.nan)
# df['intensity2'] = df['intensity2'].replace(r'^\s*$', np.nan, regex=True)
# df['intensity2'] = df['intensity2'].replace(['`', 'neu', 'po'], np.nan)
# df['intensity3'] = df['intensity3'].replace(r'^\s*$', np.nan, regex=True)
# df['intensity3'] = df['intensity3'].replace(['`', 'neu', 'po'], np.nan)

# Use forward fill to replace missing values with the preceding value
# df['intensity'] = df['intensity'].ffill()
# df['intensity2'] = df['intensity2'].ffill()
# df['intensity3'] = df['intensity3'].ffill()

# Strip non-digit characters (e.g. '`') using regular expressions
# df['intensity'] = df['intensity'].replace(r'\D', '', regex=True).astype(int)  # Usuwa wszystko, co nie jest cyfrą
# df['intensity2'] = df['intensity2'].replace(r'\D', '', regex=True).astype(int)
# df['intensity3'] = df['intensity3'].replace(r'\D', '', regex=True).astype(int)

# Sanity-check the target column: count missing entries and list the distinct
# values. The messages now name the column that is actually inspected
# ('label'); they previously said 'intensity'.
missing_count = df['label'].isna().sum()
print(f"Liczba braków w kolumnie 'label': {missing_count}")
unique_values = df['label'].unique()
print(f"Unikalne wartości w kolumnie 'label': {unique_values}")

In [None]:
# Earlier, wider column selection kept for reference:
#columns = ['Utterances', 'dialog_ids', 'uttr_ids', 'intensity', 'intensity2', 'intensity3']
# Keep only the text and its target; copy so later edits do not touch the loaded frame.
columns = ['Utterances', 'label']
df = df.loc[:, columns].copy()

In [None]:
# first_25_data = []
# last_25_data = []
# 
# def process_group(group):
#     num_rows = len(group)
#     quarter_size = max(1, num_rows // 4)
# 
#     # First 25%
#     first_25 = group.iloc[:quarter_size]
#     primary_intensity = max(
#         group['intensity'].iloc[0],
#         group['intensity2'].iloc[0],
#         group['intensity3'].iloc[0]
#     )
#     first_25 = first_25.assign(primary_intensity=primary_intensity)
# 
#     # Last 25%
#     last_25 = group.iloc[-quarter_size:]
#     final_intensity = max(
#         group['intensity'].iloc[-1],
#         group['intensity2'].iloc[-1],
#         group['intensity3'].iloc[-1]
#     )
#     last_25 = last_25.assign(final_intensity=final_intensity)
# 
#     first_25_data.append(first_25)
#     last_25_data.append(last_25)
# 
# df.groupby('dialog_ids').apply(process_group)
# 
# first_25_df = pd.concat(first_25_data).reset_index(drop=True)
# last_25_df = pd.concat(last_25_data).reset_index(drop=True)
# 
# grouped_first_25 = first_25_df.groupby('dialog_ids').agg({
#     'Utterances': ' '.join,
#     'primary_intensity': 'first'
# }).reset_index()
# 
# grouped_last_25 = last_25_df.groupby('dialog_ids').agg({
#     'Utterances': ' '.join,
#     'final_intensity': 'first'
# }).reset_index()
# 
# df = grouped_first_25.drop(df.columns[0], axis=1)
# 
# # grouped_first_25.to_csv('first_25_percent.csv', index=False)
# # grouped_last_25.to_csv('last_25_percent.csv', index=False)

In [None]:
#grouped_first_25.head()

In [None]:
# from transformers import BertTokenizer, BertModel
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# 
# # Test the tokenizer
# test_text = "We are testing BERT tokenizer."
# # generate encodings
# encodings = tokenizer.encode_plus(test_text,
#                                   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                                   max_length = 50,
#                                   truncation = True,
#                                   padding = "max_length",
#                                   return_attention_mask = True,
#                                   return_tensors = "pt")

In [None]:
from transformers import BertTokenizer, BertModel, DistilBertTokenizer

# A DistilBERT checkpoint needs the DistilBERT tokenizer (not BertTokenizer).
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Smoke-test the tokenizer on one sentence. The tokenizer is called directly:
# `encode_plus` is deprecated in favour of `__call__`, which takes the same
# keyword arguments for a single text input.
test_text = "We are testing BERT tokenizer."
encodings = tokenizer(
    test_text,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    max_length=50,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

print("Input IDs:", encodings["input_ids"])
print("Attention Mask:", encodings["attention_mask"])


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Token count of every utterance (capped at 512 by truncation); plotted in a
# later cell.
token_lens = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in df['Utterances']
]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of utterance token counts.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# documented replacement for a density histogram with a KDE curve.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlim([0, 100])
plt.xlabel('Token count')

In [None]:
#df = grouped_first_25

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one utterance per item.

    Args:
        df: DataFrame with an ``'Utterances'`` column (text) and a ``'label'``
            column whose values can be cast to ``int``.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            keyword arguments used in ``__getitem__`` and must return a mapping
            containing ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors.
        max_len: Length every encoded utterance is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = list(df['Utterances'])
        self.targets = self.df['label'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        """Return the number of utterances."""
        return len(self.utterances)

    def __getitem__(self, index):
        """Return the encoded utterance at position ``index``.

        Returns:
            dict with flattened ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors, the label as a ``torch.long`` scalar
            under ``'targets'``, and the raw text under ``'utterances'``.
        """
        # str() guards against non-string cells (e.g. NaN read from the CSV).
        utterances = str(self.utterances[index])

        # Call the tokenizer directly instead of the deprecated `encode_plus`;
        # the stray `None` text_pair argument is no longer passed.
        inputs = self.tokenizer(
            utterances,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten()
        # drops it so the DataLoader can stack items itself.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'utterances': utterances
        }


In [None]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the remaining 30% is a hold-out ...
df_train, df_test = train_test_split(
    df,
    test_size=0.30,
    shuffle=True,
    random_state=77,
)
# ... which is halved into the final test and validation sets.
df_test, df_valid = train_test_split(
    df_test,
    test_size=0.50,
    shuffle=True,
    random_state=88,
)

In [None]:
# Report the size of the full frame and of each split. The first line used to
# label df.shape (the whole dataset) as the train size, and the train split
# itself was never shown.
print(f"Full dataset size: {df.shape}")
print(f"Train size: {df_train.shape}, Validation size: {df_valid.shape}, Test size: {df_test.shape}")

In [None]:
# Class balance of the training split: absolute counts and percentage shares.
label_frequencies = df_train['label'].value_counts()
label_frequencies_percent = df_train['label'].value_counts(normalize=True).mul(100)
print(label_frequencies_percent)
print(label_frequencies)

In [None]:
# Every column after the first one (the text column) is treated as a target.
target_list = list(df.columns)[1:]
target_list

In [None]:
# class BERT_IntensityClass(torch.nn.Module):
#     def __init__(self, bert_model):
#         super(BERT_IntensityClass, self).__init__()
#         self.bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)
#         self.dropout = torch.nn.Dropout(p=DROPOUT_RATE) #0.5
#         self.linear = torch.nn.Linear(bert_model.config.hidden_size, NUM_CLASSES)
#         #self.softmax = nn.Softmax(dim=1) #remove for sentiment analysis
#         #CrossEntropyLoss automatycznie aplikuje funkcję softmax, więc nie ma potrzeby używać Softmax w modelu.
# 
# 
#     def forward(self, input_ids, attn_mask, token_type_ids=None):
#         output = self.bert_model(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
#         #pooler_output = self.pooler_output
#         dropout_output = self.dropout(output.pooler_output)
#         linear_output = self.linear(dropout_output)
#         #output = self.dropout(linear_output)
#         # output = self.softmax(linear_output)
#         return linear_output

In [None]:
class DistilBERT_IntensityClass(torch.nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        distilbert_model: Encoder module exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
        dropout_rate: Dropout probability applied to the first-token embedding.
        num_classes: Number of output logits.
    """

    def __init__(self, distilbert_model, dropout_rate=DROPOUT_RATE, num_classes=NUM_CLASSES):
        super().__init__()
        self.distilbert_model = distilbert_model
        # Honour the constructor argument; the global DROPOUT_RATE was
        # previously used here, so `dropout_rate` was silently ignored.
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.linear = torch.nn.Linear(self.distilbert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attn_mask):
        """Return the class logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attn_mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Output of the linear head (raw logits, no softmax applied).
        """
        output = self.distilbert_model(input_ids, attention_mask=attn_mask)

        # Position 0 of the sequence axis holds the [CLS] token embedding.
        cls_output = output.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]
        dropout_output = self.dropout(cls_output)
        linear_output = self.linear(dropout_output)
        return linear_output


In [None]:
# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# model = BERT_IntensityClass(bert_model)
# model.to(device)
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
from transformers import DistilBertModel
# Load the pretrained encoder and its matching tokenizer, then wrap the encoder
# in the classification head and move the whole model to the selected device.
distilbert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = DistilBERT_IntensityClass(distilbert_model).to(device)

In [None]:
# One tokenizing dataset per split, all sharing the tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (df_train, df_valid, df_test)
)

In [None]:
# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH,
    shuffle=True,
    num_workers=0,
)
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=BATCH,
    shuffle=False,
    num_workers=0,
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH,
    shuffle=False,
    num_workers=0,
)

In [None]:
# data = next(iter(train_data_loader))
# outputs = model(data["input_ids"], attn_mask=data["attention_mask"])
# print(outputs)

In [None]:
# Pull a single batch from the (shuffled) training loader.
data = next(iter(train_data_loader))

In [None]:
# test_text = "We are testing BERT tokenizer."
# encodings = tokenizer.encode_plus(test_text,
#                                   add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
#                                   max_length=50,
#                                   truncation=True,
#                                   padding="max_length",
#                                   return_attention_mask=True,
#                                   return_tensors="pt")


In [None]:
from transformers import BertTokenizer, BertModel, DistilBertTokenizer

# Re-run the tokenizer smoke test with the DistilBERT tokenizer loaded next to
# the model. The tokenizer is called directly: `encode_plus` is deprecated in
# favour of `__call__`, which takes the same keyword arguments.

test_text = "We are testing BERT tokenizer."
encodings = tokenizer(
    test_text,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    max_length=50,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

print("Input IDs:", encodings["input_ids"])
print("Attention Mask:", encodings["attention_mask"])


In [None]:
# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# last_hidden_state, pooled_output = bert_model(
#     input_ids=encodings['input_ids'],
#     attention_mask=encodings['attention_mask']
# )

In [None]:
# Pass the inputs through the DistilBERT model
# Pass the sample encoding through the DistilBERT encoder.
# `distilbert_model` is the same module object that `model.to(device)` moved,
# so the inputs must live on `device` too; leaving them on the CPU raises a
# device-mismatch error on CUDA machines. no_grad() avoids building an
# autograd graph for this inspection-only forward pass.
with torch.no_grad():
    output = distilbert_model(
        input_ids=encodings['input_ids'].to(device),
        attention_mask=encodings['attention_mask'].to(device)
    )

# Extract the last hidden state
last_hidden_state = output.last_hidden_state

# Extract the representation of the [CLS] token (first token in the sequence)
cls_output = last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]

# Now you can use cls_output for downstream tasks (e.g., classification)


In [None]:
# Inverse-frequency class weights for CrossEntropyLoss.
# CrossEntropyLoss reads weight[i] as the weight of class id i, but
# value_counts() orders its result by frequency. Re-index by class id
# 0..NUM_CLASSES-1 so each weight lands on the class it was computed for.
class_distribution = (
    df_train['label']
    .astype(int)
    .value_counts(normalize=True)
    .reindex(range(NUM_CLASSES), fill_value=0.0)
)

# A class with no training rows would get an infinite weight; fail loudly instead.
absent_classes = [int(c) for c in class_distribution.index[class_distribution == 0]]
if absent_classes:
    raise ValueError(f"Classes missing from the training split: {absent_classes}")

# The frequencies are normalised, so this total is ~1.0 (not a row count).
total_samples = class_distribution.sum()
class_weights = torch.tensor(
    (total_samples / class_distribution).to_numpy(), dtype=torch.float
).to(device)
class_weights

In [None]:
def loss_fn(outputs, targets):
    """Class-weighted cross-entropy between logits and integer class targets.

    Uses the module-level ``class_weights`` tensor as the per-class weight.
    """
    return torch.nn.functional.cross_entropy(outputs, targets, weight=class_weights)

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer; event files go to the 'logs' directory.
writer = SummaryWriter(log_dir='logs')

In [None]:
from transformers import get_linear_schedule_with_warmup

#EPOCHS = 10
# transformers' own AdamW is deprecated in favour of torch.optim.AdamW, and
# importing it fails on releases that no longer ship it.
# NOTE(review): eps=1e-6 is meant to mirror the default of the transformers
# implementation (torch's default is 1e-8) — confirm if exact parity matters.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, eps=1e-6)
# Shrink the learning rate by FACTOR after PATIENCE epochs without improvement
# in the monitored value (the validation loss passed to scheduler.step()).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=MODE, patience=PATIENCE, factor=FACTOR, verbose=VERBOSE)

In [None]:
def train_model(training_loader, model, optimizer):
    """Train ``model`` for one pass over ``training_loader``.

    Args:
        training_loader: Iterable of batches (dicts) with ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` tensors; must support ``len()``.
        model: Module called as ``model(ids, mask)`` and returning class logits.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        Tuple ``(model, accuracy, mean_loss, weighted_f1)`` for this pass.

    Raises:
        ValueError: If the loader yields no samples.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    all_preds = []
    all_labels = []

    model.train()
    loop = tq.tqdm(training_loader, total=len(training_loader), leave=True, colour='steelblue')

    for data in loop:
        # The model's forward() takes only ids and mask, so token_type_ids are
        # no longer copied to the device for nothing.
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        # Forward pass
        outputs = model(ids, mask)

        # Calculate loss; read the scalar once instead of twice per batch.
        loss = loss_fn(outputs, targets)
        batch_loss = loss.item()
        losses.append(batch_loss)

        # Calculate predictions and accuracy
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == targets).item()
        num_samples += targets.size(0)

        # Collect predictions and labels for F1-score
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(targets.cpu().tolist())

        # Backward pass and optimization (gradient norm clipped to 1.0)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        # Update progress bar
        loop.set_postfix(batch_loss=batch_loss)

    if num_samples == 0:
        # Avoids an opaque ZeroDivisionError below.
        raise ValueError("training_loader yielded no samples")

    # Calculate F1-score for training data
    train_f1 = f1_score(all_labels, all_preds, average='weighted')

    return model, correct_predictions / num_samples, np.mean(losses), train_f1

In [None]:
def eval_model(validation_loader, model, epoch):
    """Evaluate ``model`` on ``validation_loader`` and log the results.

    Args:
        validation_loader: Iterable of batches (dicts) with ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` tensors.
        model: Module called as ``model(ids, mask)`` and returning class logits.
        epoch: Step index under which the scalars are written to the
            module-level TensorBoard ``writer``.

    Returns:
        Tuple ``(accuracy, mean_loss, weighted_f1)``.

    Raises:
        ValueError: If the loader yields no samples.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for data in validation_loader:
            # The model's forward() takes only ids and mask, so token_type_ids
            # are no longer copied to the device for nothing.
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(ids, mask)

            # Calculate loss
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # Calculate predictions and accuracy
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == targets).item()
            num_samples += targets.size(0)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(targets.cpu().tolist())

    if num_samples == 0:
        # Avoids an opaque ZeroDivisionError below.
        raise ValueError("validation_loader yielded no samples")

    avg_loss = np.mean(losses)
    val_f1 = f1_score(all_labels, all_preds, average='weighted')

    # Log to TensorBoard
    writer.add_scalar('Loss/validation', avg_loss, epoch)
    writer.add_scalar('F1-Score/validation', val_f1, epoch)

    return correct_predictions / num_samples, avg_loss, val_f1

In [None]:
from collections import defaultdict
import torch
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import io

# Metric name -> list of values; re-created by the training-loop cell.
history = defaultdict(list)
# NOTE(review): not read anywhere in the visible notebook (the training loop tracks best_f1).
best_accuracy = 0
# NOTE(review): second SummaryWriter on 'logs' — an earlier cell already created one
# under the same name, and that first instance is never closed.
writer = SummaryWriter(log_dir='logs')

def plot_confusion_matrix(cm, class_names, epoch):
    """Draw a confusion-matrix heatmap and log it to TensorBoard.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with 'd').
        class_names: Tick labels used for both axes.
        epoch: Step index under which the figure is logged; also shown in the title.

    The figure is written through the module-level ``writer``. The previous
    version passed the raw PNG file bytes to ``add_image`` as a 1-D float
    tensor, which is not an image tensor; ``add_figure`` renders the
    matplotlib figure itself.
    """
    figure, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_title(f'Confusion Matrix at Epoch {epoch}')

    # close=True releases the figure after logging, as plt.close(figure) did before.
    writer.add_figure('Confusion Matrix', figure, global_step=epoch, close=True)

In [None]:
# Main training loop
# %%time
history = defaultdict(list)
best_f1 = 0

for epoch in range(1, EPOCHS + 1):
    print(f'Epoch {epoch}/{EPOCHS}')

    model, train_acc, train_loss, train_f1 = train_model(train_data_loader, model, optimizer)
    print(f'Train loss {train_loss:.4f} | Train accuracy {train_acc:.4f} | Train F1 {train_f1:.4f}')

    # eval_model itself logs the validation loss and F1 to TensorBoard.
    val_acc, val_loss, val_f1 = eval_model(val_data_loader, model, epoch)
    print(f'Val loss {val_loss:.4f} | Val accuracy {val_acc:.4f} | Val F1 {val_f1:.4f}')

    # Log metrics to TensorBoard. Validation accuracy is logged here because
    # eval_model does not write it, and it was previously missing next to
    # 'Accuracy/train'.
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('F1-Score/train', train_f1, epoch)
    writer.add_scalar('Accuracy/validation', val_acc, epoch)

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['train_f1'].append(train_f1)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

    # Keep the checkpoint with the best validation F1 seen so far.
    if val_f1 > best_f1:
        torch.save(model.state_dict(), "best_model_state.bin")
        best_f1 = val_f1
        print("Saved new best model.")

    scheduler.step(val_loss)  # ReduceLROnPlateau monitors the validation loss

writer.close()