# Sentiment Analysis

## Imports

In [1]:
import torch
from transformers import (
    AutoConfig,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

import os
import evaluate
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Dataset

In [2]:
DATASET_ID = "imdb"

In [3]:
dataset = load_dataset(DATASET_ID)

val_test_split = dataset['test'].train_test_split(test_size=0.2, seed=42)

raw_train = dataset['train']
raw_val = val_test_split['test']
raw_test = val_test_split['train']

df_results = pd.DataFrame({
    'text': raw_test['text'],
    'label': raw_test['label']
})

print(f"Train samples: {len(raw_train)}")
print(f"Validation samples: {len(raw_val)}")
print(f"Test samples: {len(raw_test)}")

Train samples: 25000
Validation samples: 5000
Test samples: 20000


In [4]:
class_names = raw_train.features['label'].names
num_labels = len(class_names)
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

print(f"Number of labels: {num_labels}")
print(f"The labels: {class_names}") # -> ['neg', 'pos']

Number of labels: 2
The labels: ['neg', 'pos']


## Metrics

In [5]:
metric_f1 = evaluate.load("f1")
metric_accuracy = evaluate.load("accuracy")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    f1_weighted = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    f1_macro = metric_f1.compute(predictions=predictions, references=labels, average="macro")
    accuracy = metric_accuracy.compute(predictions=predictions, references=labels)
    precision = metric_precision.compute(predictions=predictions, references=labels, average="weighted")
    recall = metric_recall.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1_weighted": f1_weighted["f1"],
        "f1_macro": f1_macro["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
    }

# Case 1

## Config

In [6]:
MODEL_ID_CASE_1 = 'roberta-base'
REPOSITORY_ID_CASE_1 = os.path.join('models', f"{MODEL_ID_CASE_1}-finetuned-{DATASET_ID}")

## Pre-processing

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_ID_CASE_1)

def tokenize(batch):
    # 'imdb' uses the column 'text'
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=512,
        padding=False
    )

In [8]:
tokenized_train = raw_train.map(tokenize, batched=True)
tokenized_val = raw_val.map(tokenize, batched=True)
tokenized_test = raw_test.map(tokenize, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [9]:
columns_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_train.set_format("torch", columns=columns_to_keep)
tokenized_val.set_format("torch", columns=columns_to_keep)
tokenized_test.set_format("torch", columns=columns_to_keep)

## Model

In [10]:
config_1 = AutoConfig.from_pretrained(MODEL_ID_CASE_1)
config_1.update({"id2label": id2label, "label2id": label2id})
model_1 = RobertaForSequenceClassification.from_pretrained(MODEL_ID_CASE_1, config=config_1)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train: 25k / 32 ~ 780 steps per epoch

In [11]:
training_args_1 = TrainingArguments(
    output_dir=REPOSITORY_ID_CASE_1,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    eval_strategy="steps",
    eval_steps=150,
    save_strategy="steps",
    save_steps=150,
    logging_strategy="steps",
    logging_steps=150,
    logging_dir=os.path.join(REPOSITORY_ID_CASE_1, 'logs'),

    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    metric_for_best_model="f1_weighted",
)

In [12]:
trainer_1 = Trainer(
    model=model_1,
    args=training_args_1,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

## Train

In [13]:
trainer_1.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro,Precision,Recall
150,0.4399,0.311236,0.912,0.911769,0.911758,0.916142,0.912
300,0.2468,0.205298,0.9312,0.931154,0.931158,0.932489,0.9312
450,0.2445,0.318356,0.8976,0.896837,0.896815,0.909342,0.8976
600,0.234,0.161162,0.9388,0.938793,0.938794,0.939056,0.9388
750,0.1964,0.313417,0.9282,0.92806,0.928052,0.931377,0.9282
900,0.1688,0.205842,0.9386,0.938533,0.938528,0.940372,0.9386
1050,0.1564,0.172806,0.9434,0.943382,0.94338,0.943885,0.9434
1200,0.1329,0.160895,0.947,0.946999,0.946998,0.94703,0.947
1350,0.1512,0.237048,0.9422,0.942198,0.942198,0.94231,0.9422
1500,0.1471,0.158706,0.9464,0.946399,0.946398,0.946422,0.9464


TrainOutput(global_step=2700, training_loss=0.1568860771037914, metrics={'train_runtime': 1693.1032, 'train_samples_per_second': 147.658, 'train_steps_per_second': 4.619, 'total_flos': 2.269744374615024e+16, 'train_loss': 0.1568860771037914, 'epoch': 3.452685421994885})

## Evaluate

In [14]:
test_predictions_1 = trainer_1.predict(tokenized_test)

y_pred_1 = np.argmax(test_predictions_1.predictions, axis=-1)

print("Test Set Metrics - Case 1 (Baseline):")
print(test_predictions_1.metrics)

Test Set Metrics - Case 1 (Baseline):
{'test_loss': 0.18828286230564117, 'test_accuracy': 0.9469, 'test_f1_weighted': 0.946899558203476, 'test_f1_macro': 0.946899456250432, 'test_precision': 0.9469119189049521, 'test_recall': 0.9469, 'test_runtime': 97.1362, 'test_samples_per_second': 205.896, 'test_steps_per_second': 6.434}


In [15]:
df_results['case_1_pred'] = y_pred_1

In [19]:
df_results.to_csv('experiment_results.csv', index=False)

                                                text  label  case_1_pred  \
0  I found it real shocking at first to see Willi...      1            1   
1  it's a great movie for the whole family. i don...      1            1   
2  This movie is not a remake of She's all That (...      0            0   
3  Believe me I wanted this series to work, but t...      1            0   
4  It's not a movie, but an experience!<br /><br ...      1            1   

  label_name case_1_pred_name  
0        pos              pos  
1        pos              pos  
2        neg              neg  
3        pos              neg  
4        pos              pos  


# Case 2

## Config

In [20]:
MODEL_ID_CASE_2 = "./models/roberta-base-dm-4class"
REPOSITORY_ID_CASE_2 = f"./models/roberta-dm-finetuned-{DATASET_ID}"

## Pre-processing

In [21]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_ID_CASE_2)

def tokenize(batch):
    # 'imdb' uses the column 'text'
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=512,
        padding=False
    )

In [22]:
tokenized_train = raw_train.map(tokenize, batched=True)
tokenized_val = raw_val.map(tokenize, batched=True)
tokenized_test = raw_test.map(tokenize, batched=True)

columns_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_train.set_format("torch", columns=columns_to_keep)
tokenized_val.set_format("torch", columns=columns_to_keep)
tokenized_test.set_format("torch", columns=columns_to_keep)

## Model

In [23]:
config_2 = AutoConfig.from_pretrained(MODEL_ID_CASE_2)
config_2.update({"id2label": id2label, "label2id": label2id, "num_labels": num_labels})
model_2 = RobertaForSequenceClassification.from_pretrained(
    MODEL_ID_CASE_2,
    config=config_2,
    ignore_mismatched_sizes=True
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./models/roberta-base-dm-4class and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train: 25k / 32 ~ 780 steps per epoch

In [24]:
training_args_2 = TrainingArguments(
    output_dir=REPOSITORY_ID_CASE_2,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    eval_strategy="steps",
    eval_steps=150,
    save_strategy="steps",
    save_steps=150,
    logging_strategy="steps",
    logging_steps=150,
    logging_dir=f"{REPOSITORY_ID_CASE_2}/logs",

    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    metric_for_best_model="f1_weighted",
)

In [25]:
trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

## Train

In [26]:
trainer_2.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro,Precision,Recall
150,0.3954,0.216524,0.9236,0.923582,0.923579,0.923927,0.9236
300,0.2375,0.197645,0.9246,0.924422,0.924413,0.928435,0.9246
450,0.2339,0.243134,0.9168,0.916471,0.916458,0.923203,0.9168
600,0.2226,0.165756,0.9418,0.941795,0.941794,0.941905,0.9418
750,0.1932,0.188752,0.9358,0.935741,0.935737,0.937262,0.9358
900,0.1492,0.205042,0.9392,0.939183,0.939185,0.939775,0.9392
1050,0.137,0.159611,0.9376,0.937576,0.937579,0.938365,0.9376
1200,0.1324,0.171073,0.943,0.942976,0.942973,0.943663,0.943
1350,0.1471,0.169473,0.9462,0.946194,0.946195,0.946452,0.9462
1500,0.1309,0.185406,0.9372,0.937145,0.93714,0.938607,0.9372


TrainOutput(global_step=2400, training_loss=0.1568963054815928, metrics={'train_runtime': 1510.5983, 'train_samples_per_second': 165.497, 'train_steps_per_second': 5.177, 'total_flos': 2.01717585035448e+16, 'train_loss': 0.1568963054815928, 'epoch': 3.0690537084398977})

## Evaluate

In [27]:
test_predictions_2 = trainer_2.predict(tokenized_test)

y_pred_2 = np.argmax(test_predictions_2.predictions, axis=-1)

print("Test Set Metrics - Case 2:")
print(test_predictions_2.metrics)

Test Set Metrics - Case 2:
{'test_loss': 0.22920134663581848, 'test_accuracy': 0.9449, 'test_f1_weighted': 0.9448955415215378, 'test_f1_macro': 0.9448958291653096, 'test_precision': 0.9450545826047639, 'test_recall': 0.9449, 'test_runtime': 97.0582, 'test_samples_per_second': 206.062, 'test_steps_per_second': 6.439}


In [28]:
df_results['case_2_pred'] = y_pred_2

In [33]:
df_results.to_csv('experiment_results.csv', index=False)

# Save Models

In [35]:
trainer_1.save_model(REPOSITORY_ID_CASE_1)
tokenizer.save_pretrained(REPOSITORY_ID_CASE_1)

('models/roberta-base-finetuned-imdb/tokenizer_config.json',
 'models/roberta-base-finetuned-imdb/special_tokens_map.json',
 'models/roberta-base-finetuned-imdb/vocab.json',
 'models/roberta-base-finetuned-imdb/merges.txt',
 'models/roberta-base-finetuned-imdb/added_tokens.json',
 'models/roberta-base-finetuned-imdb/tokenizer.json')

In [36]:
trainer_2.save_model(REPOSITORY_ID_CASE_2)
tokenizer.save_pretrained(REPOSITORY_ID_CASE_2)

('./models/roberta-dm-finetuned-imdb/tokenizer_config.json',
 './models/roberta-dm-finetuned-imdb/special_tokens_map.json',
 './models/roberta-dm-finetuned-imdb/vocab.json',
 './models/roberta-dm-finetuned-imdb/merges.txt',
 './models/roberta-dm-finetuned-imdb/added_tokens.json',
 './models/roberta-dm-finetuned-imdb/tokenizer.json')

# Case 3

## Config

In [6]:
MODEL_ID_CASE_3 = "roberta-base"
REPOSITORY_ID_CASE_3 = f"./models/roberta-base-mtl-dm-sa"

## Dataset

### Discourse Markers Loading

In [7]:
df_dm = pd.read_csv(os.path.join('data', 'en.csv'))

In [8]:
dm_to_class_map = {
    # == Contrastive Discourse Markers (CDMs) ==
    # Show opposition, contrast, concession, or correction
    'although': 'CDM',
    'but': 'CDM',
    'by comparison': 'CDM',
    'by contrast': 'CDM',
    'conversely': 'CDM',
    'however': 'CDM',
    'in contrast': 'CDM',
    'instead': 'CDM',
    'nevertheless': 'CDM',
    'nonetheless': 'CDM',
    'on the contrary': 'CDM',
    'on the other hand': 'CDM',
    'otherwise': 'CDM',
    'rather': 'CDM',
    'regardless': 'CDM',
    'still': 'CDM',
    'though': 'CDM',
    'yet': 'CDM',

    # == Elaborative Discourse Markers (EDMs) ==
    # Add info, specify, rephrase, give examples, or add speaker stance
    'absolutely': 'EDM',
    'actually': 'EDM',
    'additionally': 'EDM',
    'admittedly': 'EDM',
    'again': 'EDM',
    'also': 'EDM',
    'alternately': 'EDM',
    'alternatively': 'EDM',
    'altogether': 'EDM',
    'amazingly': 'EDM',
    'and': 'EDM',
    'anyway': 'EDM',
    'apparently': 'EDM',
    'arguably': 'EDM',
    'basically': 'EDM',
    'besides': 'EDM',
    'certainly': 'EDM',
    'clearly': 'EDM',
    'coincidentally': 'EDM',
    'collectively': 'EDM',
    'curiously': 'EDM',
    'elsewhere': 'EDM',
    'especially': 'EDM',
    'essentially': 'EDM',
    'evidently': 'EDM',
    'for example': 'EDM',
    'for instance': 'EDM',
    'fortunately': 'EDM',
    'frankly': 'EDM',
    'further': 'EDM',
    'furthermore': 'EDM',
    'generally': 'EDM',
    'happily': 'EDM',
    'here': 'EDM',
    'honestly': 'EDM',
    'hopefully': 'EDM',
    'ideally': 'EDM',
    'importantly': 'EDM',
    'in fact': 'EDM',
    'in other words': 'EDM',
    'in particular': 'EDM',
    'in short': 'EDM',
    'in sum': 'EDM',
    'incidentally': 'EDM',
    'indeed': 'EDM',
    'interestingly': 'EDM',
    'ironically': 'EDM',
    'likewise': 'EDM',
    'locally': 'EDM',
    'luckily': 'EDM',
    'maybe': 'EDM',
    'meaning': 'EDM',
    'moreover': 'EDM',
    'mostly': 'EDM',
    'namely': 'EDM',
    'nationally': 'EDM',
    'naturally': 'EDM',
    'notably': 'EDM',
    'obviously': 'EDM',
    'oddly': 'EDM',
    'only': 'EDM',
    'optionally': 'EDM',
    'or': 'EDM',
    'overall': 'EDM',
    'particularly': 'EDM',
    'perhaps': 'EDM',
    'personally': 'EDM',
    'plus': 'EDM',
    'preferably': 'EDM',
    'presumably': 'EDM',
    'probably': 'EDM',
    'realistically': 'EDM',
    'really': 'EDM',
    'remarkably': 'EDM',
    'sadly': 'EDM',
    'separately': 'EDM',
    'seriously': 'EDM',
    'significantly': 'EDM',
    'similarly': 'EDM',
    'specifically': 'EDM',
    'strangely': 'EDM',
    'supposedly': 'EDM',
    'surely': 'EDM',
    'surprisingly': 'EDM',
    'technically': 'EDM',
    'thankfully': 'EDM',
    'theoretically': 'EDM',
    'together': 'EDM',
    'truly': 'EDM',
    'truthfully': 'EDM',
    'undoubtedly': 'EDM',
    'unfortunately': 'EDM',
    'unsurprisingly': 'EDM',
    'well': 'EDM',

    # == Implicative Discourse Markers (IDMs) ==
    # Show result, consequence, or inference
    'accordingly': 'IDM',
    'as a result': 'IDM',
    'because of that': 'IDM',
    'because of this': 'IDM',
    'by doing this': 'IDM',
    'consequently': 'IDM',
    'hence': 'IDM',
    'in turn': 'IDM',
    'inevitably': 'IDM',
    'so': 'IDM',
    'thereby': 'IDM',
    'therefore': 'IDM',
    'thus': 'IDM',

    # == Temporal Discourse Markers (TDMs) ==
    # Show time or sequence
    'afterward': 'TDM',
    'already': 'TDM',
    'by then': 'TDM',
    'currently': 'TDM',
    'eventually': 'TDM',
    'finally': 'TDM',
    'first': 'TDM',
    'firstly': 'TDM',
    'frequently': 'TDM',
    'gradually': 'TDM',
    'historically': 'TDM',
    'immediately': 'TDM',
    'in the end': 'TDM',
    'in the meantime': 'TDM',
    'increasingly': 'TDM',
    'initially': 'TDM',
    'lastly': 'TDM',
    'lately': 'TDM',
    'later': 'TDM',
    'meantime': 'TDM',
    'meanwhile': 'TDM',
    'next': 'TDM',
    'normally': 'TDM',
    'now': 'TDM',
    'occasionally': 'TDM',
    'often': 'TDM',
    'once': 'TDM',
    'originally': 'TDM',
    'presently': 'TDM',
    'previously': 'TDM',
    'recently': 'TDM',
    'second': 'TDM',
    'secondly': 'TDM',
    'simultaneously': 'TDM',
    'slowly': 'TDM',
    'sometimes': 'TDM',
    'soon': 'TDM',
    'subsequently': 'TDM',
    'suddenly': 'TDM',
    'then': 'TDM',
    'thereafter': 'TDM',
    'third': 'TDM',
    'thirdly': 'TDM',
    'traditionally': 'TDM',
    'typically': 'TDM',
    'ultimately': 'TDM',
    'usually': 'TDM',
}

In [9]:
df_dm['label'] = [ dm_to_class_map.get(str(dm).lower().strip()) for dm in df_dm.dm ]

print(f'Original size: {len(df_dm)}')
df_dm = df_dm.loc[df_dm['label'].notnull()].copy()
print(f'Size after filtering: {len(df_dm)}')

Original size: 438913
Size after filtering: 437346


### Discourse Markers Pre-processing

In [10]:
from datasets import Dataset

dataset_dm = Dataset.from_pandas(df_dm)
dataset_dm = dataset_dm.class_encode_column("label")
dataset_dm = dataset_dm.rename_column("label", "dm_label")

Casting to class labels:   0%|          | 0/437346 [00:00<?, ? examples/s]

In [11]:
print(f"Original DM dataset size: {len(dataset_dm)}")

quarter_split = dataset_dm.train_test_split(
    test_size=0.75,
    seed=42,
    stratify_by_column="dm_label"
)
dataset_dm_quarter = quarter_split['train']

print(f"Reduced DM dataset size (25%): {len(dataset_dm_quarter)}")

Original DM dataset size: 437346
Reduced DM dataset size (25%): 109336


In [12]:
tokenizer_3 = RobertaTokenizerFast.from_pretrained("roberta-base")
def tokenize_dm(batch):
    return tokenizer_3(
        batch['s1'],
        batch['s2'],
        truncation=True,
        max_length=512,
        padding=False
    )

tokenized_dm = dataset_dm_quarter.map(
    tokenize_dm,
    batched=True,
    remove_columns=['s1', 's2', 'dm', 'article_id']
)
print(f"DM dataset size: {len(tokenized_dm)}")

Map:   0%|          | 0/109336 [00:00<?, ? examples/s]

DM dataset size: 109336


### Sentiment Analysis

In [13]:
raw_train_sa = raw_train.rename_column("label", "sa_label")
raw_val_sa = raw_val.rename_column("label", "sa_label")

In [14]:
def tokenize_sa(batch):
    return tokenizer_3(
        batch['text'],
        truncation=True,
        max_length=512,
        padding=False
    )
tokenized_train_sa = raw_train_sa.map(tokenize_sa, batched=True, remove_columns=['text'])
tokenized_val_sa = raw_val_sa.map(tokenize_sa, batched=True, remove_columns=['text'])

print(f"SA train size: {len(tokenized_train_sa)}")
print(f"SA val size: {len(tokenized_val_sa)}")

SA train size: 25000
SA val size: 5000


### Combining Datasets

In [15]:
from datasets import concatenate_datasets, Value

In [16]:
tokenized_dm = tokenized_dm.cast_column('dm_label', Value('int64'))
tokenized_train_sa = tokenized_train_sa.cast_column('sa_label', Value('int64'))
tokenized_val_sa = tokenized_val_sa.cast_column('sa_label', Value('int64'))

Casting the dataset:   0%|          | 0/109336 [00:00<?, ? examples/s]

In [17]:
IGNORE_INDEX = -100

tokenized_dm = tokenized_dm.add_column("sa_label", [IGNORE_INDEX] * len(tokenized_dm))

tokenized_train_sa = tokenized_train_sa.add_column("dm_label", [IGNORE_INDEX] * len(tokenized_train_sa))
tokenized_val_sa = tokenized_val_sa.add_column("dm_label", [IGNORE_INDEX] * len(tokenized_val_sa))

In [18]:
mtl_train_dataset = concatenate_datasets([tokenized_dm, tokenized_train_sa])
mtl_val_dataset = concatenate_datasets([tokenized_val_sa])

In [19]:
print("\nDatasets concatenated successfully for MTL!")
print(f"MTL train size: {len(mtl_train_dataset)}")
print(f"MTL val size: {len(mtl_val_dataset)}")


Datasets concatenated successfully for MTL!
MTL train size: 134336
MTL val size: 5000


## Metrics

In [20]:
def compute_metrics_mtl(eval_pred):
    # eval_pred.predictions é agora uma tupla: (logits_dm, logits_sa)
    logits_tuple, labels_tuple = eval_pred

    # Estamos validando apenas no dataset de SA
    logits_sa = logits_tuple[1]
    labels_sa = labels_tuple[1]

    # Filtrar onde o label não é -100
    # (Embora nosso val_set só tenha labels de SA, isso é uma boa prática)
    valid_indices = labels_sa != IGNORE_INDEX
    labels = labels_sa[valid_indices]
    predictions = np.argmax(logits_sa[valid_indices], axis=-1)

    # Reutilizar as métricas carregadas anteriormente (metric_f1, etc.)
    f1_weighted = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    f1_macro = metric_f1.compute(predictions=predictions, references=labels, average="macro")
    accuracy = metric_accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy["accuracy"],
        "f1_weighted": f1_weighted["f1"],
        "f1_macro": f1_macro["f1"],
    }

## Model

In [21]:
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput

class RobertaForMultitaskClassification(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels_dm = 4
        self.num_labels_sa = 2

        self.roberta = RobertaModel(config)

        # --- SEPARATE CLASSIFICATION HEADS ---
        # Head 1: Discourse Markers (DM)
        self.classifier_dm = nn.Linear(config.hidden_size, self.num_labels_dm)
        # Head 2: Sentiment Analysis (SA)
        self.classifier_sa = nn.Linear(config.hidden_size, self.num_labels_sa)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        dm_label=None,
        sa_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Passing the entrance through RoBERTa's body
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Use the [CLS] token output for sorting
        pooled_output = outputs[1]

        # Passar o output pelas DUAS cabeças
        logits_dm = self.classifier_dm(pooled_output)
        logits_sa = self.classifier_sa(pooled_output)

        total_loss = 0
        loss_fct = CrossEntropyLoss(ignore_index=IGNORE_INDEX)

        # Calcular perda (loss) para a tarefa de DM (se os labels existirem)
        if dm_label is not None:
            loss_dm = loss_fct(logits_dm.view(-1, self.num_labels_dm), dm_label.view(-1))
            total_loss += loss_dm

        # Calcular perda (loss) para a tarefa de SA (se os labels existirem)
        if sa_label is not None:
            loss_sa = loss_fct(logits_sa.view(-1, self.num_labels_sa), sa_label.view(-1))
            total_loss += loss_sa

        return SequenceClassifierOutput(
            loss=total_loss,
            logits=(logits_dm, logits_sa),
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [22]:
config_3 = AutoConfig.from_pretrained("roberta-base")
model_3 = RobertaForMultitaskClassification.from_pretrained("roberta-base", config=config_3)

Some weights of RobertaForMultitaskClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier_dm.bias', 'classifier_dm.weight', 'classifier_sa.bias', 'classifier_sa.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
training_args_3 = TrainingArguments(
    output_dir=REPOSITORY_ID_CASE_3,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_strategy="steps",
    logging_steps=1000,
    logging_dir=f"{REPOSITORY_ID_CASE_3}/logs",

    learning_rate=2e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,
    gradient_accumulation_steps=2,

    warmup_steps=500,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    metric_for_best_model="f1_weighted",

    label_names=["dm_label", "sa_label"]
)

In [31]:
trainer_3 = Trainer(
    model=model_3,
    args=training_args_3,
    train_dataset=mtl_train_dataset,
    eval_dataset=mtl_val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_3),
    compute_metrics=compute_metrics_mtl,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

## Train

In [32]:
trainer_3.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1000,0.7104,,0.9412,0.941166,0.941163


NaN or Inf found in input tensor.


KeyboardInterrupt: 

# END