In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict
from datasets import Dataset

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613

In [2]:
# Hyperparameters
MAX_LEN = 256
#MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
#TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-05

In [1]:
data = pd.read_csv('MEISD/MEISD_text.csv')

NameError: name 'pd' is not defined

In [4]:
#data = data.iloc[:int(0.1 * len(data))]

In [ ]:
data

In [5]:
pd.Series(list(data['emotion'])).unique()

array(['neutral', 'acceptance', 'disgust', 'surprise', 'joy', 'sadness',
       'anger', 'like', 'fear', 'acceptance ', 'faer', 'Fear ', 'fear ',
       'Fear', 'Anger', 'Disgust', 'Neutral', 'Surprise', 'Joy',
       'Sadness', 'Fera', 'ANGER', ' disgust', 'Neutral ', 'neutral '],
      dtype=object)

In [6]:
emotion_map = {
    'neutral': 0,
    'acceptance': 1,
    'disgust': 2,
    'surprise': 3,
    'joy': 4,
    'sadness': 5,
    'anger': 6,
    'like': 7,
    'fear': 8
}

data_emotion = pd.DataFrame()
data_emotion['Utterances'] = data['Utterances']
data_emotion['target1'] = data['emotion'].map(emotion_map).fillna(9).astype(int)
data_emotion['target2'] = data['emotion2'].map(emotion_map).fillna(9).astype(int)
data_emotion['target3'] = data['emotion3'].map(emotion_map).fillna(9).astype(int)

In [7]:
def to_binary_vector(row, num_classes=9):
    vector = np.zeros(num_classes)
    for i in range(1, 4):  # iteracja po target1, target2, target3
        if row[f'target{i}'] < num_classes:
            vector[row[f'target{i}']] = 1
    return vector


In [8]:
data_emotion['target_vector'] = data_emotion.apply(to_binary_vector, axis=1)
data_emotion[['Utterances', 'target_vector']]

Unnamed: 0,Utterances,target_vector
0,look around you,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,say hello to your competition,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,eight of you will switch to an easier specialty,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,five of you will crack under the pressure,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,two of you will be asked to leave,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...
20012,"oh, that's right, you're a woman and you need ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
20013,i'll try again,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
20014,"please, pam, reconsider and have a bagel","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
20015,i have an early lunch,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"


In [9]:
dataset = Dataset.from_pandas(data_emotion[['Utterances', 'target_vector']])

In [10]:
dataset

Dataset({
    features: ['Utterances', 'target_vector'],
    num_rows: 20017
})

In [11]:
 #split = dataset['train'].train_test_split(test_size=0.3, seed=42)
split = dataset.train_test_split(test_size=0.3, seed=42)

In [12]:
split

DatasetDict({
    train: Dataset({
        features: ['Utterances', 'target_vector'],
        num_rows: 14011
    })
    test: Dataset({
        features: ['Utterances', 'target_vector'],
        num_rows: 6006
    })
})

In [13]:
from transformers import AutoTokenizer

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)




In [14]:
train_data = split['train']
val_data = split['test']

In [15]:
from torch.utils.data import TensorDataset

encoded_data_train = tokenizer.batch_encode_plus(
    train_data['Utterances'],
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=MAX_LEN,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    val_data['Utterances'],
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=MAX_LEN,
    return_tensors='pt'
)



input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_data['target_vector'])

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_data['target_vector'])

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=9,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)


In [18]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)

epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)




In [19]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    # Flatten both preds and labels
    preds_flat = np.round(preds).astype(int).flatten()
    labels_flat = labels.astype(int).flatten()

    return f1_score(labels_flat, preds_flat, average='weighted', zero_division=0)

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in emotion_map.items()}

    preds_flat = np.round(preds).astype(int)
    labels_flat = labels.astype(int)

    # Iterate over each label/class
    for i in range(labels_flat.shape[1]):
        y_preds = preds_flat[:, i]
        y_true = labels_flat[:, i]
        class_name = label_dict_inverse[i]
        accuracy = np.mean(y_preds == y_true)  # Calculate accuracy
        print(f'Class: {class_name}')
        print(f'Accuracy: {accuracy}\n')



In [20]:
  import random
from tqdm import tqdm

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids) 

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

for epoch in tqdm(range(1, EPOCHS + 1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        optimizer.zero_grad() 

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Trainin loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

torch.save(model.state_dict(), 'finetuned_BERT_final.model')


  0%|          | 0/3 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/438 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/438 [00:17<?, ?it/s, training_loss=0.235][A
Epoch 1:   0%|          | 1/438 [00:17<2:09:48, 17.82s/it, training_loss=0.235][A
Epoch 1:   0%|          | 1/438 [00:36<2:09:48, 17.82s/it, training_loss=0.232][A
Epoch 1:   0%|          | 2/438 [00:36<2:12:43, 18.26s/it, training_loss=0.232][A
Epoch 1:   0%|          | 2/438 [00:52<2:12:43, 18.26s/it, training_loss=0.228][A
Epoch 1:   1%|          | 3/438 [00:52<2:05:38, 17.33s/it, training_loss=0.228][A
Epoch 1:   1%|          | 3/438 [01:08<2:05:38, 17.33s/it, training_loss=0.221][A
Epoch 1:   1%|          | 4/438 [01:08<2:00:11, 16.62s/it, training_loss=0.221][A
Epoch 1:   1%|          | 4/438 [01:23<2:00:11, 16.62s/it, training_loss=0.221][A
Epoch 1:   1%|          | 5/438 [01:23<1:56:06, 16.09s/it, training_loss=0.221][A
Epoch 1:   1%|          | 5/438 [01:38<1:56:06, 16.09s/it, training_loss=0.218][A
Epoch 1


Epoch 1
Trainin loss: 0.4149661928959633


 33%|███▎      | 1/3 [2:10:38<4:21:16, 7838.49s/it]

Validation loss: 0.3976703238614062
F1 Score (Weighted): 0.0



Epoch 2:   0%|          | 0/438 [00:00<?, ?it/s][A
Epoch 2:   0%|          | 0/438 [00:15<?, ?it/s, training_loss=0.149][A
Epoch 2:   0%|          | 1/438 [00:15<1:50:04, 15.11s/it, training_loss=0.149][A
Epoch 2:   0%|          | 1/438 [00:30<1:50:04, 15.11s/it, training_loss=0.127][A
Epoch 2:   0%|          | 2/438 [00:30<1:49:44, 15.10s/it, training_loss=0.127][A
Epoch 2:   0%|          | 2/438 [00:45<1:49:44, 15.10s/it, training_loss=0.129][A
Epoch 2:   1%|          | 3/438 [00:45<1:49:24, 15.09s/it, training_loss=0.129][A
Epoch 2:   1%|          | 3/438 [01:00<1:49:24, 15.09s/it, training_loss=0.125][A
Epoch 2:   1%|          | 4/438 [01:00<1:48:55, 15.06s/it, training_loss=0.125][A
Epoch 2:   1%|          | 4/438 [01:15<1:48:55, 15.06s/it, training_loss=0.131][A
Epoch 2:   1%|          | 5/438 [01:15<1:48:41, 15.06s/it, training_loss=0.131][A
Epoch 2:   1%|          | 5/438 [01:30<1:48:41, 15.06s/it, training_loss=0.135][A
Epoch 2:   1%|▏         | 6/438 [01:30<1:48:


Epoch 2
Trainin loss: 0.39360376151457227


 67%|██████▋   | 2/3 [4:20:32<2:10:12, 7812.09s/it]

Validation loss: 0.39125105104547864
F1 Score (Weighted): 0.006830920934844583



Epoch 3:   0%|          | 0/438 [00:00<?, ?it/s][A
Epoch 3:   0%|          | 0/438 [00:14<?, ?it/s, training_loss=0.126][A
Epoch 3:   0%|          | 1/438 [00:14<1:48:22, 14.88s/it, training_loss=0.126][A
Epoch 3:   0%|          | 1/438 [00:29<1:48:22, 14.88s/it, training_loss=0.129][A
Epoch 3:   0%|          | 2/438 [00:29<1:48:11, 14.89s/it, training_loss=0.129][A
Epoch 3:   0%|          | 2/438 [00:44<1:48:11, 14.89s/it, training_loss=0.136][A
Epoch 3:   1%|          | 3/438 [00:44<1:47:53, 14.88s/it, training_loss=0.136][A
Epoch 3:   1%|          | 3/438 [00:59<1:47:53, 14.88s/it, training_loss=0.138][A
Epoch 3:   1%|          | 4/438 [00:59<1:47:30, 14.86s/it, training_loss=0.138][A
Epoch 3:   1%|          | 4/438 [01:14<1:47:30, 14.86s/it, training_loss=0.128][A
Epoch 3:   1%|          | 5/438 [01:14<1:47:26, 14.89s/it, training_loss=0.128][A
Epoch 3:   1%|          | 5/438 [01:29<1:47:26, 14.89s/it, training_loss=0.126][A
Epoch 3:   1%|▏         | 6/438 [01:29<1:47:


Epoch 3
Trainin loss: 0.388516952730205


100%|██████████| 3/3 [6:28:38<00:00, 7772.77s/it]  


Validation loss: 0.38937313854694366
F1 Score (Weighted): 0.0228100907840582


In [21]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=9,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
  

        
model.to(device)

model.load_state_dict(torch.load('finetuned_BERT_final.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: neutral
Accuracy: 0.0

Class: acceptance
Accuracy: 0.0

Class: disgust
Accuracy: 0.08191808191808192

Class: surprise
Accuracy: 0.01898101898101898

Class: joy
Accuracy: 0.004329004329004329

Class: sadness
Accuracy: 0.0

Class: anger
Accuracy: 0.0

Class: like
Accuracy: 0.0

Class: fear
Accuracy: 0.0


In [None]:
def tokenize_fn(batch):
    return tokenizer(batch['Utterances'], padding=True, truncation=True, max_length=MAX_LEN, return_tensors='pt')
#    
# def tokenize_fn(batch):
#     return tokenizer(batch['Utterances'], padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')

tokenized_dataset = split.map(tokenize_fn, batched=True)

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['train'][0]

In [None]:
#trainer.train()


In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 9) #number of classs

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model.bert_model.parameters():
#     param.requires_grad = False

model.to(device)


In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-5)

In [None]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):

    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to training mode (activate droput, batch norm)
    model.train()
    # initialize the progress bar
    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader),
                   leave=True, colour='steelblue')
    for batch_idx, data in loop:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # forward
        outputs = model(ids, mask, token_type_ids) # (batch,predict)=(32,8)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        # training accuracy
        _, preds = torch.max(outputs, dim=1) # batch dim 
        _, targ = torch.max(targets, dim=1)  # batch dim
        num_samples += len(targ)  # technically adding batch size
        correct_predictions += torch.sum(preds == targ)

        # backward
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

        # Update progress bar
        #loop.set_description(f"")
        #loop.set_postfix(batch_loss=loss)

    # returning: trained model, model accuracy, mean loss
    return model, float(correct_predictions)/num_samples, np.mean(losses)

In [None]:
def eval_model(validation_loader, model, optimizer):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # validation accuracy
            _, preds = torch.max(outputs, dim=1) # batch dim 
            _, targ = torch.max(targets, dim=1)  # batch dim
            num_samples += len(targ)  # technically adding batch size
            correct_predictions += torch.sum(preds == targ)

    return float(correct_predictions)/num_samples, np.mean(losses)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

train_data_loader = torch.utils.data.DataLoader(tokenized_dataset['train'],
                                                batch_size=TRAIN_BATCH_SIZE,
                                                shuffle=True,
                                                num_workers=0,
                                                collate_fn=data_collator
                                                )

val_data_loader = torch.utils.data.DataLoader(tokenized_dataset['test'],
                                              batch_size=VALID_BATCH_SIZE,
                                              shuffle=False,
                                              num_workers=0,
                                              collate_fn=data_collator
                                              )

In [None]:
history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), os.path.join(data_dir,"output","best_model_state.bin"))
        best_accuracy = val_acc


In [ ]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,7)
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);
plt.grid()

In [None]:
target_list = list(tokenized_dataset.columns)
target_list

In [ ]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=TRAIN_BATCH_SIZE,
                                                shuffle=True,
                                                num_workers=0
                                                )

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=VALID_BATCH_SIZE,
                                              shuffle=False,
                                              num_workers=0
                                              )

test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                               batch_size=TEST_BATCH_SIZE,
                                               shuffle=False,
                                               num_workers=0
                                               )



In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
