In [None]:
%pip install datasets transformers --quiet 2> /dev/null

In [None]:
import sklearn
import pandas as pd
import numpy as np
import random
import torch
import json

# One shared seed for every random number generator seeded in this notebook:
# NumPy, Python's `random`, and PyTorch (default generator plus torch.cuda.manual_seed_all).
seed = 42

for _apply_seed in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _apply_seed(seed)

Load the SemEval-2018 Task 1 dataset (configuration `subtask5.english`).

In [None]:
from datasets import load_dataset

# Load the 'subtask5.english' configuration of SemEval-2018 Task 1; later cells read its
# 'validation' and 'test' splits.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to run on this
# machine — only keep it for sources you trust.
sem_eval_2018_task_1 = load_dataset(
    path='sem_eval_2018_task_1',
    name='subtask5.english',
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.29k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.98M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3259 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/886 [00:00<?, ? examples/s]

# BERT

BERT + 1/2 gold standard data + 1/2 synthetic data

NO ROLE

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Experiment: fine-tune bert-base-uncased as a multi-label emotion classifier on the CSV
# loaded below, track the loss on the SemEval validation split after every epoch, then
# report metrics on the SemEval test split.
# Uses `pd`, `torch`, `seed` and `sem_eval_2018_task_1` defined in earlier cells.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed at the start of the experiment: the classifier head is newly initialised and
# dropout is random, so without this the result depends on which cells ran before this one.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to select the loss.
    problem_type="multi_label_classification",
)
# Replace the model's `dropout` layer (the one in front of the classification head) with
# p=dropout; the dropout inside the encoder keeps its configured value.
model.dropout = torch.nn.Dropout(dropout)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# A dedicated, seeded generator keeps the shuffling order identical between runs.
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set.
# Inference runs in mini-batches so memory use does not grow with the size of the test
# split; labels stay on the CPU because only scikit-learn consumes them.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logits_batches = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs_test = model(input_ids=input_ids, attention_mask=attention_mask)
        logits_batches.append(outputs_test.logits.cpu())

logits_test = torch.cat(logits_batches)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy()
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5175, Val Loss: 0.4056
Epoch 2, Train Loss: 0.3767, Val Loss: 0.3616
Epoch 3, Train Loss: 0.3240, Val Loss: 0.3456
Epoch 4, Train Loss: 0.2891, Val Loss: 0.3347
Epoch 5, Train Loss: 0.2638, Val Loss: 0.3302
Test Accuracy: 26.08
Test F1-macro: 49.12
Test F1-micro: 67.06
Confusion Matrix:
Label: anticipation
[[2769   65]
 [ 398   27]]

Label: optimism
[[1752  364]
 [ 271  872]]

Label: trust
[[3019   87]
 [ 143   10]]

Label: joy
[[1572  245]
 [ 269 1173]]

Label: love
[[2606  137]
 [ 285  231]]

Label: anger
[[1950  208]
 [ 276  825]]

Label: disgust
[[1899  261]
 [ 281  818]]

Label: pessimism
[[2757  127]
 [ 266  109]]

Label: sadness
[[2174  125]
 [ 460  500]]

Label: fear
[[2723   51]
 [ 238  247]]

Label: surprise
[[3088    1]
 [ 170    0]]



NO ROLE + NEWS ARTICLES

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Experiment: fine-tune bert-base-uncased as a multi-label emotion classifier on the CSV
# loaded below, track the loss on the SemEval validation split after every epoch, then
# report metrics on the SemEval test split.
# Uses `pd`, `torch`, `seed` and `sem_eval_2018_task_1` defined in earlier cells.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed at the start of the experiment: the classifier head is newly initialised and
# dropout is random, so without this the result depends on which cells ran before this one.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_no_role_news.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to select the loss.
    problem_type="multi_label_classification",
)
# Replace the model's `dropout` layer (the one in front of the classification head) with
# p=dropout; the dropout inside the encoder keeps its configured value.
model.dropout = torch.nn.Dropout(dropout)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# A dedicated, seeded generator keeps the shuffling order identical between runs.
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set.
# Inference runs in mini-batches so memory use does not grow with the size of the test
# split; labels stay on the CPU because only scikit-learn consumes them.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logits_batches = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs_test = model(input_ids=input_ids, attention_mask=attention_mask)
        logits_batches.append(outputs_test.logits.cpu())

logits_test = torch.cat(logits_batches)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy()
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5420, Val Loss: 0.4218
Epoch 2, Train Loss: 0.4236, Val Loss: 0.3728
Epoch 3, Train Loss: 0.3648, Val Loss: 0.3560
Epoch 4, Train Loss: 0.3241, Val Loss: 0.3351
Epoch 5, Train Loss: 0.2938, Val Loss: 0.3263
Test Accuracy: 25.71
Test F1-macro: 48.50
Test F1-micro: 67.39
Confusion Matrix:
Label: anticipation
[[2785   49]
 [ 402   23]]

Label: optimism
[[1764  352]
 [ 281  862]]

Label: trust
[[3079   27]
 [ 148    5]]

Label: joy
[[1599  218]
 [ 258 1184]]

Label: love
[[2655   88]
 [ 324  192]]

Label: anger
[[1932  226]
 [ 269  832]]

Label: disgust
[[1893  267]
 [ 284  815]]

Label: pessimism
[[2767  117]
 [ 289   86]]

Label: sadness
[[2117  182]
 [ 417  543]]

Label: fear
[[2714   60]
 [ 224  261]]

Label: surprise
[[3087    2]
 [ 167    3]]



ROLE 1

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Experiment: fine-tune bert-base-uncased as a multi-label emotion classifier on the CSV
# loaded below, track the loss on the SemEval validation split after every epoch, then
# report metrics on the SemEval test split.
# Uses `pd`, `torch`, `seed` and `sem_eval_2018_task_1` defined in earlier cells.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed at the start of the experiment: the classifier head is newly initialised and
# dropout is random, so without this the result depends on which cells ran before this one.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_role_1.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to select the loss.
    problem_type="multi_label_classification",
)
# Replace the model's `dropout` layer (the one in front of the classification head) with
# p=dropout; the dropout inside the encoder keeps its configured value.
model.dropout = torch.nn.Dropout(dropout)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# A dedicated, seeded generator keeps the shuffling order identical between runs.
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set.
# Inference runs in mini-batches so memory use does not grow with the size of the test
# split; labels stay on the CPU because only scikit-learn consumes them.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logits_batches = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs_test = model(input_ids=input_ids, attention_mask=attention_mask)
        logits_batches.append(outputs_test.logits.cpu())

logits_test = torch.cat(logits_batches)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy()
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5240, Val Loss: 0.4115
Epoch 2, Train Loss: 0.3870, Val Loss: 0.3671
Epoch 3, Train Loss: 0.3314, Val Loss: 0.3440
Epoch 4, Train Loss: 0.2970, Val Loss: 0.3279
Epoch 5, Train Loss: 0.2680, Val Loss: 0.3223
Test Accuracy: 25.31
Test F1-macro: 46.32
Test F1-micro: 65.67
Confusion Matrix:
Label: anticipation
[[2782   52]
 [ 406   19]]

Label: optimism
[[1825  291]
 [ 338  805]]

Label: trust
[[3077   29]
 [ 151    2]]

Label: joy
[[1617  200]
 [ 308 1134]]

Label: love
[[2698   45]
 [ 396  120]]

Label: anger
[[1960  198]
 [ 308  793]]

Label: disgust
[[1904  256]
 [ 314  785]]

Label: pessimism
[[2773  111]
 [ 278   97]]

Label: sadness
[[2159  140]
 [ 461  499]]

Label: fear
[[2702   72]
 [ 211  274]]

Label: surprise
[[3089    0]
 [ 170    0]]



ROLE 2

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Experiment: fine-tune bert-base-uncased as a multi-label emotion classifier on the CSV
# loaded below, track the loss on the SemEval validation split after every epoch, then
# report metrics on the SemEval test split.
# Uses `pd`, `torch`, `seed` and `sem_eval_2018_task_1` defined in earlier cells.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed at the start of the experiment: the classifier head is newly initialised and
# dropout is random, so without this the result depends on which cells ran before this one.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_role_2.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to select the loss.
    problem_type="multi_label_classification",
)
# Replace the model's `dropout` layer (the one in front of the classification head) with
# p=dropout; the dropout inside the encoder keeps its configured value.
model.dropout = torch.nn.Dropout(dropout)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# A dedicated, seeded generator keeps the shuffling order identical between runs.
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set.
# Inference runs in mini-batches so memory use does not grow with the size of the test
# split; labels stay on the CPU because only scikit-learn consumes them.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logits_batches = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs_test = model(input_ids=input_ids, attention_mask=attention_mask)
        logits_batches.append(outputs_test.logits.cpu())

logits_test = torch.cat(logits_batches)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy()
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5280, Val Loss: 0.4252
Epoch 2, Train Loss: 0.4012, Val Loss: 0.3716
Epoch 3, Train Loss: 0.3443, Val Loss: 0.3496
Epoch 4, Train Loss: 0.3058, Val Loss: 0.3351
Epoch 5, Train Loss: 0.2769, Val Loss: 0.3246
Test Accuracy: 26.27
Test F1-macro: 46.64
Test F1-micro: 66.44
Confusion Matrix:
Label: anticipation
[[2816   18]
 [ 418    7]]

Label: optimism
[[1814  302]
 [ 358  785]]

Label: trust
[[3059   47]
 [ 151    2]]

Label: joy
[[1615  202]
 [ 314 1128]]

Label: love
[[2673   70]
 [ 369  147]]

Label: anger
[[1901  257]
 [ 226  875]]

Label: disgust
[[1893  267]
 [ 288  811]]

Label: pessimism
[[2791   93]
 [ 284   91]]

Label: sadness
[[2137  162]
 [ 424  536]]

Label: fear
[[2730   44]
 [ 225  260]]

Label: surprise
[[3089    0]
 [ 170    0]]



ROLE 3

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Experiment: fine-tune bert-base-uncased as a multi-label emotion classifier on the CSV
# loaded below, track the loss on the SemEval validation split after every epoch, then
# report metrics on the SemEval test split.
# Uses `pd`, `torch`, `seed` and `sem_eval_2018_task_1` defined in earlier cells.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed at the start of the experiment: the classifier head is newly initialised and
# dropout is random, so without this the result depends on which cells ran before this one.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_role_3.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to select the loss.
    problem_type="multi_label_classification",
)
# Replace the model's `dropout` layer (the one in front of the classification head) with
# p=dropout; the dropout inside the encoder keeps its configured value.
model.dropout = torch.nn.Dropout(dropout)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# A dedicated, seeded generator keeps the shuffling order identical between runs.
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set.
# Inference runs in mini-batches so memory use does not grow with the size of the test
# split; labels stay on the CPU because only scikit-learn consumes them.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logits_batches = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs_test = model(input_ids=input_ids, attention_mask=attention_mask)
        logits_batches.append(outputs_test.logits.cpu())

logits_test = torch.cat(logits_batches)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy()
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5247, Val Loss: 0.4101
Epoch 2, Train Loss: 0.3792, Val Loss: 0.3605
Epoch 3, Train Loss: 0.3254, Val Loss: 0.3397
Epoch 4, Train Loss: 0.2897, Val Loss: 0.3356
Epoch 5, Train Loss: 0.2630, Val Loss: 0.3269
Test Accuracy: 25.68
Test F1-macro: 47.64
Test F1-micro: 66.53
Confusion Matrix:
Label: anticipation
[[2763   71]
 [ 404   21]]

Label: optimism
[[1838  278]
 [ 355  788]]

Label: trust
[[3046   60]
 [ 151    2]]

Label: joy
[[1631  186]
 [ 324 1118]]

Label: love
[[2645   98]
 [ 341  175]]

Label: anger
[[1892  266]
 [ 226  875]]

Label: disgust
[[1803  357]
 [ 232  867]]

Label: pessimism
[[2767  117]
 [ 281   94]]

Label: sadness
[[2099  200]
 [ 416  544]]

Label: fear
[[2717   57]
 [ 204  281]]

Label: surprise
[[3088    1]
 [ 170    0]]



ROLE 4

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_role_4.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5405, Val Loss: 0.4529
Epoch 2, Train Loss: 0.4173, Val Loss: 0.3658
Epoch 3, Train Loss: 0.3520, Val Loss: 0.3400
Epoch 4, Train Loss: 0.3127, Val Loss: 0.3295
Epoch 5, Train Loss: 0.2810, Val Loss: 0.3238
Test Accuracy: 26.51
Test F1-macro: 48.18
Test F1-micro: 67.47
Confusion Matrix:
Label: anticipation
[[2807   27]
 [ 402   23]]

Label: optimism
[[1776  340]
 [ 309  834]]

Label: trust
[[3080   26]
 [ 144    9]]

Label: joy
[[1607  210]
 [ 265 1177]]

Label: love
[[2686   57]
 [ 340  176]]

Label: anger
[[1905  253]
 [ 244  857]]

Label: disgust
[[1859  301]
 [ 261  838]]

Label: pessimism
[[2772  112]
 [ 296   79]]

Label: sadness
[[2047  252]
 [ 357  603]]

Label: fear
[[2740   34]
 [ 253  232]]

Label: surprise
[[3087    2]
 [ 170    0]]



ROLE 1 + NEWS ARTICLES

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("half_gsd_half_gpt_data_role_5.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5498, Val Loss: 0.4200
Epoch 2, Train Loss: 0.4120, Val Loss: 0.3549
Epoch 3, Train Loss: 0.3482, Val Loss: 0.3327
Epoch 4, Train Loss: 0.3107, Val Loss: 0.3297
Epoch 5, Train Loss: 0.2800, Val Loss: 0.3236
Test Accuracy: 25.77
Test F1-macro: 49.61
Test F1-micro: 67.13
Confusion Matrix:
Label: anticipation
[[2724  110]
 [ 381   44]]

Label: optimism
[[1834  282]
 [ 360  783]]

Label: trust
[[3090   16]
 [ 149    4]]

Label: joy
[[1655  162]
 [ 322 1120]]

Label: love
[[2642  101]
 [ 317  199]]

Label: anger
[[1923  235]
 [ 252  849]]

Label: disgust
[[1863  297]
 [ 276  823]]

Label: pessimism
[[2754  130]
 [ 296   79]]

Label: sadness
[[2077  222]
 [ 350  610]]

Label: fear
[[2705   69]
 [ 203  282]]

Label: surprise
[[3077   12]
 [ 161    9]]



## Single Label BERT

No Role

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("gsd_gpt_single_label_no_role.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.4928, Val Loss: 0.4284
Epoch 2, Train Loss: 0.3760, Val Loss: 0.3675
Epoch 3, Train Loss: 0.3219, Val Loss: 0.3427
Epoch 4, Train Loss: 0.2812, Val Loss: 0.3275
Epoch 5, Train Loss: 0.2520, Val Loss: 0.3201
Test Accuracy: 26.08
Test F1-macro: 44.53
Test F1-micro: 66.65
Confusion Matrix:
Label: anticipation
[[2826    8]
 [ 413   12]]

Label: optimism
[[1794  322]
 [ 348  795]]

Label: trust
[[3106    0]
 [ 153    0]]

Label: joy
[[1641  176]
 [ 316 1126]]

Label: love
[[2685   58]
 [ 371  145]]

Label: anger
[[1927  231]
 [ 252  849]]

Label: disgust
[[1863  297]
 [ 255  844]]

Label: pessimism
[[2870   14]
 [ 360   15]]

Label: sadness
[[2161  138]
 [ 429  531]]

Label: fear
[[2731   43]
 [ 226  259]]

Label: surprise
[[3089    0]
 [ 170    0]]



Role 1

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("gsd_gpt_single_label_role_1.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.4911, Val Loss: 0.4347
Epoch 2, Train Loss: 0.3864, Val Loss: 0.3759
Epoch 3, Train Loss: 0.3308, Val Loss: 0.3480
Epoch 4, Train Loss: 0.2884, Val Loss: 0.3310
Epoch 5, Train Loss: 0.2572, Val Loss: 0.3221
Test Accuracy: 26.76
Test F1-macro: 45.68
Test F1-micro: 66.28
Confusion Matrix:
Label: anticipation
[[2834    0]
 [ 425    0]]

Label: optimism
[[1853  263]
 [ 416  727]]

Label: trust
[[3106    0]
 [ 153    0]]

Label: joy
[[1644  173]
 [ 311 1131]]

Label: love
[[2712   31]
 [ 382  134]]

Label: anger
[[1969  189]
 [ 296  805]]

Label: disgust
[[1907  253]
 [ 299  800]]

Label: pessimism
[[2823   61]
 [ 305   70]]

Label: sadness
[[2165  134]
 [ 437  523]]

Label: fear
[[2718   56]
 [ 200  285]]

Label: surprise
[[3089    0]
 [ 170    0]]



Role 1 + NEWS

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("gsd_gpt_single_label_role_1_news.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5008, Val Loss: 0.4702
Epoch 2, Train Loss: 0.3997, Val Loss: 0.3741
Epoch 3, Train Loss: 0.3355, Val Loss: 0.3418
Epoch 4, Train Loss: 0.2946, Val Loss: 0.3242
Epoch 5, Train Loss: 0.2646, Val Loss: 0.3197
Test Accuracy: 26.48
Test F1-macro: 44.55
Test F1-micro: 65.60
Confusion Matrix:
Label: anticipation
[[2834    0]
 [ 424    1]]

Label: optimism
[[1835  281]
 [ 376  767]]

Label: trust
[[3106    0]
 [ 153    0]]

Label: joy
[[1641  176]
 [ 312 1130]]

Label: love
[[2701   42]
 [ 381  135]]

Label: anger
[[1989  169]
 [ 333  768]]

Label: disgust
[[1961  199]
 [ 378  721]]

Label: pessimism
[[2840   44]
 [ 335   40]]

Label: sadness
[[2167  132]
 [ 428  532]]

Label: fear
[[2714   60]
 [ 200  285]]

Label: surprise
[[3089    0]
 [ 170    0]]



Increased multi-label

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tune bert-base-uncased as a multi-label emotion classifier on the
# training CSV named below, tracking loss on the SemEval-2018 Task 1
# validation split and reporting metrics on its test split.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-seed every RNG so this cell produces the same result regardless of which
# cells ran before it (the classification head below is randomly initialised).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load dataset
mixed_df = pd.read_csv("gsd_gpt_increased_multi.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    # State the task explicitly instead of relying on the label dtype to
    # select the multi-label (BCE-with-logits) loss.
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the model's `dropout` module with p=dropout
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one pass over the training data, then one pass over the
# validation data, per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set. Inference runs in mini-batches: pushing the whole
# test split through BERT in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
logit_chunks = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs_test = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        logit_chunks.append(outputs_test.logits.cpu())

logits_test = torch.cat(logit_chunks)
# A label is predicted when its sigmoid probability exceeds 0.5.
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.4966, Val Loss: 0.3981
Epoch 2, Train Loss: 0.3413, Val Loss: 0.3607
Epoch 3, Train Loss: 0.2895, Val Loss: 0.3423
Epoch 4, Train Loss: 0.2553, Val Loss: 0.3392
Epoch 5, Train Loss: 0.2298, Val Loss: 0.3254
Test Accuracy: 25.71
Test F1-macro: 49.70
Test F1-micro: 67.47
Confusion Matrix:
Label: anticipation
[[2773   61]
 [ 394   31]]

Label: optimism
[[1777  339]
 [ 321  822]]

Label: trust
[[3037   69]
 [ 148    5]]

Label: joy
[[1596  221]
 [ 281 1161]]

Label: love
[[2600  143]
 [ 251  265]]

Label: anger
[[1921  237]
 [ 258  843]]

Label: disgust
[[1898  262]
 [ 298  801]]

Label: pessimism
[[2771  113]
 [ 274  101]]

Label: sadness
[[2071  228]
 [ 369  591]]

Label: fear
[[2697   77]
 [ 208  277]]

Label: surprise
[[3088    1]
 [ 169    1]]



Increased single-label

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# Fine-tunes bert-base-uncased for multi-label emotion classification on the CSV
# loaded below, prints train/validation loss per epoch, then prints test metrics.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

num_epochs = 5
batch_size = 32
learning_rate = 1e-5
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
mixed_df = pd.read_csv("gsd_gpt_increased_single.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# problem_type is stated explicitly so the multi-label (BCE-with-logits) loss is
# used regardless of the label dtype, instead of being inferred from float labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
)
model.dropout = torch.nn.Dropout(dropout)  # Replace the dropout applied before the classifier head
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(mixed_df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(mixed_df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Evaluation on test set, in mini-batches: pushing the whole test split through the
# model in a single forward pass can exhaust GPU memory.
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)
test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'])
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        # Move logits off the GPU straight away so only one batch lives there at a time
        all_logits.append(outputs.logits.cpu())

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy().astype(int)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100

confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.4579, Val Loss: 0.4300
Epoch 2, Train Loss: 0.3426, Val Loss: 0.3672
Epoch 3, Train Loss: 0.2774, Val Loss: 0.3353
Epoch 4, Train Loss: 0.2332, Val Loss: 0.3234
Epoch 5, Train Loss: 0.2023, Val Loss: 0.3188
Test Accuracy: 25.35
Test F1-macro: 45.81
Test F1-micro: 66.13
Confusion Matrix:
Label: anticipation
[[2822   12]
 [ 411   14]]

Label: optimism
[[1873  243]
 [ 463  680]]

Label: trust
[[3103    3]
 [ 153    0]]

Label: joy
[[1627  190]
 [ 274 1168]]

Label: love
[[2664   79]
 [ 342  174]]

Label: anger
[[1961  197]
 [ 291  810]]

Label: disgust
[[1918  242]
 [ 322  777]]

Label: pessimism
[[2866   18]
 [ 354   21]]

Label: sadness
[[2100  199]
 [ 380  580]]

Label: fear
[[2733   41]
 [ 225  260]]

Label: surprise
[[3087    2]
 [ 161    9]]



# RoBERTa

RoBERTa + half GSD + half synthetic data

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tunes roberta-base for multi-label emotion classification on the CSV loaded
# below, prints train/validation loss per epoch, then prints test metrics.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per this many mini-batches

# Load dataset (`sem_eval_2018_task_1` is loaded in the setup cell)
df = pd.read_csv("half_gsd_half_gpt_data.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# problem_type is stated explicitly so the multi-label (BCE-with-logits) loss is
# used regardless of the label dtype, instead of being inferred from float labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_train_batches = len(train_dataloader)
# Mini-batches left over after the last full accumulation window (0 if it divides evenly)
tail_batches = num_train_batches % gradient_accumulation_steps

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # log the unscaled per-batch loss

        # Average (not sum) the gradients over the accumulation window; the final
        # window of an epoch may be shorter than gradient_accumulation_steps.
        in_tail_window = step >= num_train_batches - tail_batches
        window_size = tail_batches if in_tail_window else gradient_accumulation_steps
        (loss / window_size).backward()

        # Step at the end of every window AND on the last batch, so gradients from a
        # trailing partial window are applied instead of being zeroed at the next epoch.
        is_last_batch = step + 1 == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Release cached blocks once after training; calling this per batch only adds overhead.
torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for input_ids, attention_mask, _ in test_dataloader:
        # Labels are not needed for inference, so they stay on the CPU
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_logits.append(outputs.logits.cpu())

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy().astype(int)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6093, Val Loss: 0.4846
Epoch 2, Train Loss: 0.4915, Val Loss: 0.4096
Epoch 3, Train Loss: 0.4102, Val Loss: 0.3771
Epoch 4, Train Loss: 0.3706, Val Loss: 0.3681
Epoch 5, Train Loss: 0.3474, Val Loss: 0.3607
Epoch 6, Train Loss: 0.3309, Val Loss: 0.3617
Epoch 7, Train Loss: 0.3170, Val Loss: 0.3499
Epoch 8, Train Loss: 0.3033, Val Loss: 0.3419
Epoch 9, Train Loss: 0.2929, Val Loss: 0.3419
Epoch 10, Train Loss: 0.2840, Val Loss: 0.3358
Test Accuracy: 22.31
Test F1-macro: 51.49
Test F1-micro: 66.42
Confusion Matrix:
Label: anticipation
[[2598  236]
 [ 343   82]]

Label: optimism
[[1665  451]
 [ 193  950]]

Label: trust
[[2921  185]
 [ 130   23]]

Label: joy
[[1561  256]
 [ 212 1230]]

Label: love
[[2564  179]
 [ 235  281]]

Label: anger
[[2012  146]
 [ 356  745]]

Label: disgust
[[1975  185]
 [ 364  735]]

Label: pessimism
[[2680  204]
 [ 246  129]]

Label: sadness
[[2097  202]
 [ 407  553]]

Label: fear
[[2642  132]
 [ 218  267]]

Label: surprise
[[3076   13]
 [ 16

No role + NEWS

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
# RandomSampler is used below; import it here so the cell does not depend on an
# earlier cell having imported it.
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tunes roberta-base for multi-label emotion classification on the CSV loaded
# below, prints train/validation loss per epoch, then prints test metrics.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per this many mini-batches

# Load dataset (`sem_eval_2018_task_1` is loaded in the setup cell)
df = pd.read_csv("half_gsd_half_gpt_data_no_role_news.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# problem_type is stated explicitly so the multi-label (BCE-with-logits) loss is
# used regardless of the label dtype, instead of being inferred from float labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_train_batches = len(train_dataloader)
# Mini-batches left over after the last full accumulation window (0 if it divides evenly)
tail_batches = num_train_batches % gradient_accumulation_steps

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # log the unscaled per-batch loss

        # Average (not sum) the gradients over the accumulation window; the final
        # window of an epoch may be shorter than gradient_accumulation_steps.
        in_tail_window = step >= num_train_batches - tail_batches
        window_size = tail_batches if in_tail_window else gradient_accumulation_steps
        (loss / window_size).backward()

        # Step at the end of every window AND on the last batch, so gradients from a
        # trailing partial window are applied instead of being zeroed at the next epoch.
        is_last_batch = step + 1 == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Release cached blocks once after training; calling this per batch only adds overhead.
torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for input_ids, attention_mask, _ in test_dataloader:
        # Labels are not needed for inference, so they stay on the CPU
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_logits.append(outputs.logits.cpu())

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy().astype(int)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6228, Val Loss: 0.4947
Epoch 2, Train Loss: 0.5401, Val Loss: 0.4417
Epoch 3, Train Loss: 0.4567, Val Loss: 0.3930
Epoch 4, Train Loss: 0.4088, Val Loss: 0.3689
Epoch 5, Train Loss: 0.3852, Val Loss: 0.3556
Epoch 6, Train Loss: 0.3622, Val Loss: 0.3452
Epoch 7, Train Loss: 0.3476, Val Loss: 0.3442
Epoch 8, Train Loss: 0.3337, Val Loss: 0.3306
Epoch 9, Train Loss: 0.3205, Val Loss: 0.3244
Epoch 10, Train Loss: 0.3126, Val Loss: 0.3223
Test Accuracy: 25.81
Test F1-macro: 52.02
Test F1-micro: 67.90
Confusion Matrix:
Label: anticipation
[[2689  145]
 [ 362   63]]

Label: optimism
[[1682  434]
 [ 212  931]]

Label: trust
[[3016   90]
 [ 136   17]]

Label: joy
[[1625  192]
 [ 294 1148]]

Label: love
[[2578  165]
 [ 231  285]]

Label: anger
[[1920  238]
 [ 241  860]]

Label: disgust
[[1881  279]
 [ 274  825]]

Label: pessimism
[[2743  141]
 [ 285   90]]

Label: sadness
[[2136  163]
 [ 418  542]]

Label: fear
[[2666  108]
 [ 197  288]]

Label: surprise
[[3071   18]
 [ 16

ROLE 1

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tunes roberta-base for multi-label emotion classification on the CSV loaded
# below, prints train/validation loss per epoch, then prints test metrics.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per this many mini-batches

# Load dataset (`sem_eval_2018_task_1` is loaded in the setup cell)
df = pd.read_csv("half_gsd_half_gpt_data_role_1.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# problem_type is stated explicitly so the multi-label (BCE-with-logits) loss is
# used regardless of the label dtype, instead of being inferred from float labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_train_batches = len(train_dataloader)
# Mini-batches left over after the last full accumulation window (0 if it divides evenly)
tail_batches = num_train_batches % gradient_accumulation_steps

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # log the unscaled per-batch loss

        # Average (not sum) the gradients over the accumulation window; the final
        # window of an epoch may be shorter than gradient_accumulation_steps.
        in_tail_window = step >= num_train_batches - tail_batches
        window_size = tail_batches if in_tail_window else gradient_accumulation_steps
        (loss / window_size).backward()

        # Step at the end of every window AND on the last batch, so gradients from a
        # trailing partial window are applied instead of being zeroed at the next epoch.
        is_last_batch = step + 1 == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Release cached blocks once after training; calling this per batch only adds overhead.
torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for input_ids, attention_mask, _ in test_dataloader:
        # Labels are not needed for inference, so they stay on the CPU
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_logits.append(outputs.logits.cpu())

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy().astype(int)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6172, Val Loss: 0.4897
Epoch 2, Train Loss: 0.5062, Val Loss: 0.4134
Epoch 3, Train Loss: 0.4146, Val Loss: 0.3944
Epoch 4, Train Loss: 0.3765, Val Loss: 0.3679
Epoch 5, Train Loss: 0.3542, Val Loss: 0.3603
Epoch 6, Train Loss: 0.3351, Val Loss: 0.3498
Epoch 7, Train Loss: 0.3206, Val Loss: 0.3410
Epoch 8, Train Loss: 0.3051, Val Loss: 0.3301
Epoch 9, Train Loss: 0.2938, Val Loss: 0.3338
Epoch 10, Train Loss: 0.2849, Val Loss: 0.3275
Test Accuracy: 24.42
Test F1-macro: 51.05
Test F1-micro: 66.93
Confusion Matrix:
Label: anticipation
[[2727  107]
 [ 395   30]]

Label: optimism
[[1757  359]
 [ 275  868]]

Label: trust
[[2991  115]
 [ 137   16]]

Label: joy
[[1629  188]
 [ 286 1156]]

Label: love
[[2565  178]
 [ 222  294]]

Label: anger
[[1970  188]
 [ 313  788]]

Label: disgust
[[1921  239]
 [ 323  776]]

Label: pessimism
[[2598  286]
 [ 208  167]]

Label: sadness
[[2051  248]
 [ 361  599]]

Label: fear
[[2689   85]
 [ 220  265]]

Label: surprise
[[3084    5]
 [ 16

ROLE 2

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
# RandomSampler is used below; import it here so the cell does not depend on an
# earlier cell having imported it.
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tunes roberta-base for multi-label emotion classification on the CSV loaded
# below, prints train/validation loss per epoch, then prints test metrics.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5 #5e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per this many mini-batches

# Load dataset (`sem_eval_2018_task_1` is loaded in the setup cell)
df = pd.read_csv("half_gsd_half_gpt_data_role_2.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# problem_type is stated explicitly so the multi-label (BCE-with-logits) loss is
# used regardless of the label dtype, instead of being inferred from float labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_train_batches = len(train_dataloader)
# Mini-batches left over after the last full accumulation window (0 if it divides evenly)
tail_batches = num_train_batches % gradient_accumulation_steps

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # log the unscaled per-batch loss

        # Average (not sum) the gradients over the accumulation window; the final
        # window of an epoch may be shorter than gradient_accumulation_steps.
        in_tail_window = step >= num_train_batches - tail_batches
        window_size = tail_batches if in_tail_window else gradient_accumulation_steps
        (loss / window_size).backward()

        # Step at the end of every window AND on the last batch, so gradients from a
        # trailing partial window are applied instead of being zeroed at the next epoch.
        is_last_batch = step + 1 == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Release cached blocks once after training; calling this per batch only adds overhead.
torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for input_ids, attention_mask, _ in test_dataloader:
        # Labels are not needed for inference, so they stay on the CPU
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        all_logits.append(outputs.logits.cpu())

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics
true_labels_test = labels_test.numpy().astype(int)
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6173, Val Loss: 0.4894
Epoch 2, Train Loss: 0.5152, Val Loss: 0.4217
Epoch 3, Train Loss: 0.4248, Val Loss: 0.3889
Epoch 4, Train Loss: 0.3858, Val Loss: 0.3735
Epoch 5, Train Loss: 0.3567, Val Loss: 0.3610
Epoch 6, Train Loss: 0.3393, Val Loss: 0.3504
Epoch 7, Train Loss: 0.3226, Val Loss: 0.3356
Epoch 8, Train Loss: 0.3057, Val Loss: 0.3368
Epoch 9, Train Loss: 0.2947, Val Loss: 0.3336
Epoch 10, Train Loss: 0.2823, Val Loss: 0.3254
Test Accuracy: 24.39
Test F1-macro: 51.56
Test F1-micro: 67.25
Confusion Matrix:
Label: anticipation
[[2696  138]
 [ 382   43]]

Label: optimism
[[1712  404]
 [ 225  918]]

Label: trust
[[2878  228]
 [ 122   31]]

Label: joy
[[1595  222]
 [ 233 1209]]

Label: love
[[2503  240]
 [ 209  307]]

Label: anger
[[1963  195]
 [ 284  817]]

Label: disgust
[[1929  231]
 [ 304  795]]

Label: pessimism
[[2746  138]
 [ 269  106]]

Label: sadness
[[2160  139]
 [ 456  504]]

Label: fear
[[2708   66]
 [ 215  270]]

Label: surprise
[[3078   11]
 [ 16

ROLE 3

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tune roberta-base on the ROLE 3 training file, track the loss on the
# SemEval-2018 validation split, and report multi-label metrics on the test split.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per 4 mini-batches (effective batch of 128)

# Load the training CSV and the SemEval validation/test splits
df = pd.read_csv("half_gsd_half_gpt_data_role_3.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout.
# `problem_type` pins the multi-label (BCE-with-logits) objective explicitly instead of
# leaving it to be inferred from the float dtype of the labels.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# Dedicated seeded generator so the shuffle order does not depend on the global RNG state
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the window, not the sum;
        # the unscaled loss is what gets logged.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the last batch of the epoch;
        # otherwise the gradients of a trailing partial window would be wiped by the
        # zero_grad() at the start of the next epoch without ever being applied.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the inputs go to the device; the gold labels stay on the CPU for the metrics below.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics (accuracy here is exact-match over all labels of a sample)
accuracy_test = accuracy_score(labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6263, Val Loss: 0.4950
Epoch 2, Train Loss: 0.5108, Val Loss: 0.4252
Epoch 3, Train Loss: 0.4224, Val Loss: 0.3940
Epoch 4, Train Loss: 0.3872, Val Loss: 0.3760
Epoch 5, Train Loss: 0.3607, Val Loss: 0.3689
Epoch 6, Train Loss: 0.3417, Val Loss: 0.3614
Epoch 7, Train Loss: 0.3272, Val Loss: 0.3555
Epoch 8, Train Loss: 0.3141, Val Loss: 0.3481
Epoch 9, Train Loss: 0.3018, Val Loss: 0.3403
Epoch 10, Train Loss: 0.2901, Val Loss: 0.3365
Test Accuracy: 22.74
Test F1-macro: 50.31
Test F1-micro: 65.88
Confusion Matrix:
Label: anticipation
[[2617  217]
 [ 383   42]]

Label: optimism
[[1699  417]
 [ 220  923]]

Label: trust
[[2894  212]
 [ 125   28]]

Label: joy
[[1632  185]
 [ 281 1161]]

Label: love
[[2471  272]
 [ 206  310]]

Label: anger
[[1988  170]
 [ 319  782]]

Label: disgust
[[1952  208]
 [ 335  764]]

Label: pessimism
[[2763  121]
 [ 277   98]]

Label: sadness
[[2176  123]
 [ 495  465]]

Label: fear
[[2642  132]
 [ 177  308]]

Label: surprise
[[3071   18]
 [ 16

ROLE 4

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
# RandomSampler is used below; import it here so the cell does not depend on an earlier cell's import.
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tune roberta-base on the ROLE 4 training file, track the loss on the
# SemEval-2018 validation split, and report multi-label metrics on the test split.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per 4 mini-batches (effective batch of 128)

# Load the training CSV and the SemEval validation/test splits
df = pd.read_csv("half_gsd_half_gpt_data_role_4.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout.
# `problem_type` pins the multi-label (BCE-with-logits) objective explicitly instead of
# leaving it to be inferred from the float dtype of the labels.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# Dedicated seeded generator so the shuffle order does not depend on the global RNG state
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the window, not the sum;
        # the unscaled loss is what gets logged.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the last batch of the epoch;
        # otherwise the gradients of a trailing partial window would be wiped by the
        # zero_grad() at the start of the next epoch without ever being applied.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the inputs go to the device; the gold labels stay on the CPU for the metrics below.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics (accuracy here is exact-match over all labels of a sample)
accuracy_test = accuracy_score(labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6133, Val Loss: 0.4938
Epoch 2, Train Loss: 0.5108, Val Loss: 0.4374
Epoch 3, Train Loss: 0.4414, Val Loss: 0.3882
Epoch 4, Train Loss: 0.3944, Val Loss: 0.3659
Epoch 5, Train Loss: 0.3713, Val Loss: 0.3497
Epoch 6, Train Loss: 0.3522, Val Loss: 0.3415
Epoch 7, Train Loss: 0.3346, Val Loss: 0.3391
Epoch 8, Train Loss: 0.3197, Val Loss: 0.3336
Epoch 9, Train Loss: 0.3096, Val Loss: 0.3231
Epoch 10, Train Loss: 0.2990, Val Loss: 0.3226
Test Accuracy: 26.08
Test F1-macro: 51.73
Test F1-micro: 68.19
Confusion Matrix:
Label: anticipation
[[2772   62]
 [ 392   33]]

Label: optimism
[[1707  409]
 [ 228  915]]

Label: trust
[[3065   41]
 [ 138   15]]

Label: joy
[[1598  219]
 [ 242 1200]]

Label: love
[[2593  150]
 [ 227  289]]

Label: anger
[[1996  162]
 [ 330  771]]

Label: disgust
[[1953  207]
 [ 340  759]]

Label: pessimism
[[2703  181]
 [ 253  122]]

Label: sadness
[[2108  191]
 [ 381  579]]

Label: fear
[[2670  104]
 [ 202  283]]

Label: surprise
[[3073   16]
 [ 16

ROLE 1 + NEWS

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
# RandomSampler is used below; import it here so the cell does not depend on an earlier cell's import.
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tune roberta-base on the training file for this section, track the loss on the
# SemEval-2018 validation split, and report multi-label metrics on the test split.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per 4 mini-batches (effective batch of 128)

# Load the training CSV and the SemEval validation/test splits
# NOTE(review): this section is titled "ROLE 1 + NEWS" but reads the "role_5" file —
# presumably role_5 is the ROLE 1 + NEWS variant; confirm the file name.
df = pd.read_csv("half_gsd_half_gpt_data_role_5.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout.
# `problem_type` pins the multi-label (BCE-with-logits) objective explicitly instead of
# leaving it to be inferred from the float dtype of the labels.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# Dedicated seeded generator so the shuffle order does not depend on the global RNG state
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the window, not the sum;
        # the unscaled loss is what gets logged.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the last batch of the epoch;
        # otherwise the gradients of a trailing partial window would be wiped by the
        # zero_grad() at the start of the next epoch without ever being applied.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the inputs go to the device; the gold labels stay on the CPU for the metrics below.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics (accuracy here is exact-match over all labels of a sample)
accuracy_test = accuracy_score(labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6105, Val Loss: 0.4939
Epoch 2, Train Loss: 0.5193, Val Loss: 0.4209
Epoch 3, Train Loss: 0.4381, Val Loss: 0.3759
Epoch 4, Train Loss: 0.4024, Val Loss: 0.3529
Epoch 5, Train Loss: 0.3783, Val Loss: 0.3527
Epoch 6, Train Loss: 0.3589, Val Loss: 0.3366
Epoch 7, Train Loss: 0.3439, Val Loss: 0.3364
Epoch 8, Train Loss: 0.3325, Val Loss: 0.3264
Epoch 9, Train Loss: 0.3185, Val Loss: 0.3283
Epoch 10, Train Loss: 0.3098, Val Loss: 0.3178
Test Accuracy: 26.11
Test F1-macro: 51.46
Test F1-micro: 67.91
Confusion Matrix:
Label: anticipation
[[2729  105]
 [ 370   55]]

Label: optimism
[[1713  403]
 [ 217  926]]

Label: trust
[[3072   34]
 [ 145    8]]

Label: joy
[[1554  263]
 [ 217 1225]]

Label: love
[[2556  187]
 [ 236  280]]

Label: anger
[[1977  181]
 [ 314  787]]

Label: disgust
[[1939  221]
 [ 328  771]]

Label: pessimism
[[2743  141]
 [ 290   85]]

Label: sadness
[[2094  205]
 [ 394  566]]

Label: fear
[[2698   76]
 [ 210  275]]

Label: surprise
[[3063   26]
 [ 15

## RoBERTa Single Label

NO ROLE

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tune roberta-base on the single-label NO ROLE training file, track the loss on the
# SemEval-2018 validation split, and report multi-label metrics on the test split.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per 4 mini-batches (effective batch of 128)

# Load the training CSV and the SemEval validation/test splits
df = pd.read_csv("gsd_gpt_single_label_no_role.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout.
# `problem_type` pins the multi-label (BCE-with-logits) objective explicitly instead of
# leaving it to be inferred from the float dtype of the labels; this keeps the same
# objective the float label tensors already selected implicitly.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# Dedicated seeded generator so the shuffle order does not depend on the global RNG state
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the window, not the sum;
        # the unscaled loss is what gets logged.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the last batch of the epoch;
        # otherwise the gradients of a trailing partial window would be wiped by the
        # zero_grad() at the start of the next epoch without ever being applied.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the inputs go to the device; the gold labels stay on the CPU for the metrics below.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics (accuracy here is exact-match over all labels of a sample)
accuracy_test = accuracy_score(labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5723, Val Loss: 0.4858
Epoch 2, Train Loss: 0.4513, Val Loss: 0.4760
Epoch 3, Train Loss: 0.4177, Val Loss: 0.3980
Epoch 4, Train Loss: 0.3753, Val Loss: 0.3697
Epoch 5, Train Loss: 0.3456, Val Loss: 0.3536
Epoch 6, Train Loss: 0.3231, Val Loss: 0.3393
Epoch 7, Train Loss: 0.3031, Val Loss: 0.3318
Epoch 8, Train Loss: 0.2874, Val Loss: 0.3260
Epoch 9, Train Loss: 0.2760, Val Loss: 0.3184
Epoch 10, Train Loss: 0.2649, Val Loss: 0.3162
Test Accuracy: 26.02
Test F1-macro: 45.97
Test F1-micro: 66.87
Confusion Matrix:
Label: anticipation
[[2791   43]
 [ 387   38]]

Label: optimism
[[1705  411]
 [ 267  876]]

Label: trust
[[3106    0]
 [ 153    0]]

Label: joy
[[1586  231]
 [ 260 1182]]

Label: love
[[2635  108]
 [ 293  223]]

Label: anger
[[1976  182]
 [ 301  800]]

Label: disgust
[[1948  212]
 [ 327  772]]

Label: pessimism
[[2880    4]
 [ 366    9]]

Label: sadness
[[2190  109]
 [ 487  473]]

Label: fear
[[2690   84]
 [ 211  274]]

Label: surprise
[[3089    0]
 [ 16

ROLE 1

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Fine-tune roberta-base on the single-label ROLE 1 training file, track the loss on the
# SemEval-2018 validation split, and report multi-label metrics on the test split.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from the setup cells
# at the top of the notebook.

# Define hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4  # one optimizer step per 4 mini-batches (effective batch of 128)

# Load the training CSV and the SemEval validation/test splits
df = pd.read_csv("gsd_gpt_single_label_role_1.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model with dropout.
# `problem_type` pins the multi-label (BCE-with-logits) objective explicitly instead of
# leaving it to be inferred from the float dtype of the labels; this keeps the same
# objective the float label tensors already selected implicitly.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)
model.to(device)

# Tokenize training data
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
# Dedicated seeded generator so the shuffle order does not depend on the global RNG state
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the window, not the sum;
        # the unscaled loss is what gets logged.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the last batch of the epoch;
        # otherwise the gradients of a trailing partial window would be wiped by the
        # zero_grad() at the start of the next epoch without ever being applied.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the inputs go to the device; the gold labels stay on the CPU for the metrics below.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)

# Calculate test metrics (accuracy here is exact-match over all labels of a sample)
accuracy_test = accuracy_score(labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6008, Val Loss: 0.4906
Epoch 2, Train Loss: 0.4570, Val Loss: 0.4863
Epoch 3, Train Loss: 0.4431, Val Loss: 0.4427
Epoch 4, Train Loss: 0.4004, Val Loss: 0.3946
Epoch 5, Train Loss: 0.3661, Val Loss: 0.3636
Epoch 6, Train Loss: 0.3405, Val Loss: 0.3537
Epoch 7, Train Loss: 0.3187, Val Loss: 0.3466
Epoch 8, Train Loss: 0.2998, Val Loss: 0.3327
Epoch 9, Train Loss: 0.2864, Val Loss: 0.3262
Epoch 10, Train Loss: 0.2724, Val Loss: 0.3228
Test Accuracy: 26.39
Test F1-macro: 48.25
Test F1-micro: 67.40
Confusion Matrix:
Label: anticipation
[[2793   41]
 [ 392   33]]

Label: optimism
[[1760  356]
 [ 301  842]]

Label: trust
[[3103    3]
 [ 149    4]]

Label: joy
[[1657  160]
 [ 319 1123]]

Label: love
[[2543  200]
 [ 213  303]]

Label: anger
[[1929  229]
 [ 262  839]]

Label: disgust
[[1895  265]
 [ 296  803]]

Label: pessimism
[[2860   24]
 [ 337   38]]

Label: sadness
[[2184  115]
 [ 482  478]]

Label: fear
[[2692   82]
 [ 199  286]]

Label: surprise
[[3089    0]
 [ 16

ROLE 1 + NEWS

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Experiment: fine-tune roberta-base for multi-label emotion classification on
# "gsd_gpt_single_label_role_1_news.csv", validate and test on SemEval-2018 Task 1.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from earlier cells.

# Define hyperparameters
num_epochs = 10
batch_size = 32  # micro-batch size; one optimizer update covers up to batch_size * gradient_accumulation_steps samples
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4

# Load dataset: training data comes from the CSV, validation/test from the SemEval splits
df = pd.read_csv("gsd_gpt_single_label_role_1_news.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model; the dropout kwargs override the checkpoint's config values
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_columns), hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout)
model.to(device)

# Tokenize training data (labels are float32 because the loop below passes them to the model as multi-hot targets)
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale before backward so the accumulated gradient is a mean over the
        # window instead of a sum; the logged loss below stays unscaled.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the final batch of
        # the epoch. Without the second condition, the gradients of a trailing
        # partial window would be discarded by the next zero_grad().
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_train_batches
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the token ids and mask are needed for inference; labels stay on the CPU.
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5, gives a multi-hot prediction matrix
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
# Hand scikit-learn integer NumPy arrays on both sides rather than a float torch tensor
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics
# NOTE: for multi-label input, accuracy_score is exact-match (subset) accuracy:
# a sample counts as correct only if all 11 labels are predicted correctly.
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
# One 2x2 matrix per label, laid out as [[TN, FP], [FN, TP]]
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5742, Val Loss: 0.4861
Epoch 2, Train Loss: 0.4548, Val Loss: 0.4808
Epoch 3, Train Loss: 0.4342, Val Loss: 0.4116
Epoch 4, Train Loss: 0.3920, Val Loss: 0.3754
Epoch 5, Train Loss: 0.3595, Val Loss: 0.3522
Epoch 6, Train Loss: 0.3385, Val Loss: 0.3382
Epoch 7, Train Loss: 0.3234, Val Loss: 0.3384
Epoch 8, Train Loss: 0.3053, Val Loss: 0.3276
Epoch 9, Train Loss: 0.2969, Val Loss: 0.3189
Epoch 10, Train Loss: 0.2851, Val Loss: 0.3182
Test Accuracy: 26.88
Test F1-macro: 46.78
Test F1-micro: 67.21
Confusion Matrix:
Label: anticipation
[[2831    3]
 [ 424    1]]

Label: optimism
[[1647  469]
 [ 208  935]]

Label: trust
[[3106    0]
 [ 153    0]]

Label: joy
[[1551  266]
 [ 196 1246]]

Label: love
[[2579  164]
 [ 221  295]]

Label: anger
[[2013  145]
 [ 349  752]]

Label: disgust
[[1987  173]
 [ 377  722]]

Label: pessimism
[[2845   39]
 [ 319   56]]

Label: sadness
[[2199  100]
 [ 511  449]]

Label: fear
[[2741   33]
 [ 256  229]]

Label: surprise
[[3088    1]
 [ 16

Increased multi-label

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Experiment: fine-tune roberta-base for multi-label emotion classification on
# "gsd_gpt_increased_multi.csv", validate and test on SemEval-2018 Task 1.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from earlier cells.

# Define hyperparameters
num_epochs = 10
batch_size = 32  # micro-batch size; one optimizer update covers up to batch_size * gradient_accumulation_steps samples
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4

# Load dataset: training data comes from the CSV, validation/test from the SemEval splits
df = pd.read_csv("gsd_gpt_increased_multi.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model; the dropout kwargs override the checkpoint's config values
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_columns), hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout)
model.to(device)

# Tokenize training data (labels are float32 because the loop below passes them to the model as multi-hot targets)
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale before backward so the accumulated gradient is a mean over the
        # window instead of a sum; the logged loss below stays unscaled.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the final batch of
        # the epoch. Without the second condition, the gradients of a trailing
        # partial window would be discarded by the next zero_grad().
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_train_batches
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the token ids and mask are needed for inference; labels stay on the CPU.
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5, gives a multi-hot prediction matrix
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
# Hand scikit-learn integer NumPy arrays on both sides rather than a float torch tensor
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics
# NOTE: for multi-label input, accuracy_score is exact-match (subset) accuracy:
# a sample counts as correct only if all 11 labels are predicted correctly.
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
# One 2x2 matrix per label, laid out as [[TN, FP], [FN, TP]]
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.6135, Val Loss: 0.4731
Epoch 2, Train Loss: 0.4339, Val Loss: 0.3949
Epoch 3, Train Loss: 0.3595, Val Loss: 0.3745
Epoch 4, Train Loss: 0.3273, Val Loss: 0.3593
Epoch 5, Train Loss: 0.3043, Val Loss: 0.3479
Epoch 6, Train Loss: 0.2888, Val Loss: 0.3444
Epoch 7, Train Loss: 0.2716, Val Loss: 0.3332
Epoch 8, Train Loss: 0.2606, Val Loss: 0.3364
Epoch 9, Train Loss: 0.2525, Val Loss: 0.3330
Epoch 10, Train Loss: 0.2407, Val Loss: 0.3204
Test Accuracy: 24.42
Test F1-macro: 52.58
Test F1-micro: 67.58
Confusion Matrix:
Label: anticipation
[[2698  136]
 [ 376   49]]

Label: optimism
[[1770  346]
 [ 279  864]]

Label: trust
[[2949  157]
 [ 135   18]]

Label: joy
[[1617  200]
 [ 283 1159]]

Label: love
[[2558  185]
 [ 213  303]]

Label: anger
[[1941  217]
 [ 268  833]]

Label: disgust
[[1907  253]
 [ 285  814]]

Label: pessimism
[[2694  190]
 [ 253  122]]

Label: sadness
[[2049  250]
 [ 353  607]]

Label: fear
[[2659  115]
 [ 186  299]]

Label: surprise
[[3048   41]
 [ 15

Increased single-label

In [None]:
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Experiment: fine-tune roberta-base for multi-label emotion classification on
# "gsd_gpt_increased_single.csv", validate and test on SemEval-2018 Task 1.
# Relies on `pd`, `torch`, `seed` and `sem_eval_2018_task_1` from earlier cells.

# Define hyperparameters
num_epochs = 10
batch_size = 32  # micro-batch size; one optimizer update covers up to batch_size * gradient_accumulation_steps samples
learning_rate = 1e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
gradient_accumulation_steps = 4

# Load dataset: training data comes from the CSV, validation/test from the SemEval splits
df = pd.read_csv("gsd_gpt_increased_single.csv")
df_val = pd.DataFrame(sem_eval_2018_task_1['validation'])
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']

# Initialize RoBERTa tokenizer and model; the dropout kwargs override the checkpoint's config values
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_columns), hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout)
model.to(device)

# Tokenize training data (labels are float32 because the loop below passes them to the model as multi-hot targets)
inputs_train = tokenizer(list(df['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_train = torch.tensor(df[label_columns].values, dtype=torch.float32)
train_data = TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)
train_sampler = RandomSampler(train_data, replacement=False, num_samples=None, generator=torch.Generator().manual_seed(seed))  # Ensuring the shuffle is consistent
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Tokenize validation data
inputs_val = tokenizer(list(df_val['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_val = torch.tensor(df_val[label_columns].values, dtype=torch.float32)
val_data = TensorDataset(inputs_val['input_ids'], inputs_val['attention_mask'], labels_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_train_batches = len(train_dataloader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Scale before backward so the accumulated gradient is a mean over the
        # window instead of a sum; the logged loss below stays unscaled.
        (loss / gradient_accumulation_steps).backward()
        total_loss += loss.item()

        # Step at the end of every accumulation window AND on the final batch of
        # the epoch. Without the second condition, the gradients of a trailing
        # partial window would be discarded by the next zero_grad().
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_train_batches
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / num_train_batches

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Free up memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

torch.cuda.empty_cache()

# Evaluation on test set in batches
inputs_test = tokenizer(list(df_test['Tweet']), padding=True, truncation=True, return_tensors="pt")
labels_test = torch.tensor(df_test[label_columns].values, dtype=torch.float32)

test_data = TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()
all_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only the token ids and mask are needed for inference; labels stay on the CPU.
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        all_logits.append(logits.cpu())

        # Free up memory
        del input_ids, attention_mask, outputs, logits
        torch.cuda.empty_cache()

logits_test = torch.cat(all_logits, dim=0)
# Independent sigmoid per label, thresholded at 0.5, gives a multi-hot prediction matrix
predicted_labels_test = (torch.sigmoid(logits_test) > 0.5).numpy().astype(int)
# Hand scikit-learn integer NumPy arrays on both sides rather than a float torch tensor
true_labels_test = labels_test.numpy().astype(int)

# Calculate test metrics
# NOTE: for multi-label input, accuracy_score is exact-match (subset) accuracy:
# a sample counts as correct only if all 11 labels are predicted correctly.
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test) * 100
f1_macro_test = f1_score(true_labels_test, predicted_labels_test, average='macro') * 100
f1_micro_test = f1_score(true_labels_test, predicted_labels_test, average='micro') * 100
confusion_matrix_test = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)

print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
# One 2x2 matrix per label, laid out as [[TN, FP], [FN, TP]]
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.5494, Val Loss: 0.4948
Epoch 2, Train Loss: 0.4178, Val Loss: 0.4276
Epoch 3, Train Loss: 0.3669, Val Loss: 0.3920
Epoch 4, Train Loss: 0.3266, Val Loss: 0.3632
Epoch 5, Train Loss: 0.2968, Val Loss: 0.3502
Epoch 6, Train Loss: 0.2746, Val Loss: 0.3341
Epoch 7, Train Loss: 0.2524, Val Loss: 0.3270
Epoch 8, Train Loss: 0.2405, Val Loss: 0.3238
Epoch 9, Train Loss: 0.2267, Val Loss: 0.3175
Epoch 10, Train Loss: 0.2176, Val Loss: 0.3152
Test Accuracy: 25.50
Test F1-macro: 49.97
Test F1-micro: 67.15
Confusion Matrix:
Label: anticipation
[[2787   47]
 [ 382   43]]

Label: optimism
[[1850  266]
 [ 408  735]]

Label: trust
[[3090   16]
 [ 145    8]]

Label: joy
[[1668  149]
 [ 340 1102]]

Label: love
[[2604  139]
 [ 257  259]]

Label: anger
[[1937  221]
 [ 273  828]]

Label: disgust
[[1885  275]
 [ 273  826]]

Label: pessimism
[[2833   51]
 [ 340   35]]

Label: sadness
[[2118  181]
 [ 393  567]]

Label: fear
[[2660  114]
 [ 183  302]]

Label: surprise
[[3076   13]
 [ 15