In [None]:
from transformers import BertTokenizer, BertModel, BertConfig, PreTrainedModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import gc
import os
import random
import numpy as np
import torch.nn as nn


In [None]:
# Mount Google Drive under /content/drive.
# NOTE(review): `google.colab` is provided by the Colab runtime; this cell is expected
# to fail on a local Jupyter kernel - confirm the target environment.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy, and PyTorch
            (the CPU generator and all CUDA generators). Its string form is
            also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these callables takes the seed as its single positional argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Data locations (relative paths): DATA_DIR holds train.csv / valid.csv,
# MODELS_DIR receives one sub-folder per trained model plus the final report.
DATA_DIR = "../Dataset"
MODELS_DIR = "../Bert_multiTask_modelli"

# Model checkpoint and training hyperparameters.
# BATCH_SIZE is used for both training and evaluation batches.
MODEL = 'bert-large-uncased'
EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Reproducibility: seed all RNGs once, before any model or data object is created.
SEED = 42
set_seed(SEED)

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Utilizzo device: {device}")

Utilizzo device: cuda


In [None]:
# Load the tokenizer that matches the MODEL checkpoint; the same instance is passed to
# every training dataset and saved next to each trained model.
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [None]:
# Load the CSV splits from DATA_DIR.
# NOTE(review): the frame named `df_test` is read from valid.csv, i.e. the evaluation
# below runs on the validation split - confirm this naming is intentional.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/valid.csv')


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that exposes the raw text under the ``"text"`` key.

    Returns:
        Whatever the global ``tokenizer`` returns for that text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [None]:
class BertMultiTaskHF(PreTrainedModel):
    """BERT encoder shared by two classification heads: sentiment and sarcasm.

    Training mode (``self.task is None``): both heads are evaluated, the
    cross-entropy losses of the annotated examples of each task are summed, and
    the logits of both heads are returned stacked along dimension 1.

    Single-task mode (``self.task`` set through ``set_test_task``): only the
    selected head is evaluated and no loss is computed.

    A label value of ``-1`` marks an example that is not annotated for a task;
    such examples are excluded from that task's loss.
    """
    config_class = BertConfig

    # Task name accepted by ``set_test_task`` -> attribute name of the matching head.
    TASK_HEADS = {
        "Sentiment": "sentiment_classifier",
        "Sarcasm": "sarcasm_classifier",
    }

    def __init__(self, config):
        """Build the shared encoder and the two linear heads from ``config``."""
        super().__init__(config)

        self.num_labels = config.num_labels
        self.bert = BertModel(config)

        hidden_size = config.hidden_size

        # Separate classifiers for the two tasks
        self.sentiment_classifier = nn.Linear(hidden_size, config.num_labels)
        self.sarcasm_classifier = nn.Linear(hidden_size, config.num_labels)

        # Take the dropout rate from the config (same rule as the stock BERT
        # classification heads) instead of hard-coding it; 0.1 stays the fallback.
        classifier_dropout = getattr(config, "classifier_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = getattr(config, "hidden_dropout_prob", 0.1)
        self.dropout = nn.Dropout(classifier_dropout)
        self.loss_fct = nn.CrossEntropyLoss()

        self.task = None
        self.post_init()

    def set_test_task(self, task_name):
        """Select the head used for single-task inference.

        Args:
            task_name: ``"Sentiment"``, ``"Sarcasm"``, or ``None`` to go back to
                multi-task mode.

        Raises:
            ValueError: If ``task_name`` is not one of the accepted values. Failing
                here surfaces a bad task name before any forward pass is run.
        """
        if task_name is not None and task_name not in self.TASK_HEADS:
            raise ValueError(f"Invalid task: {task_name}")
        self.task = task_name

    def _masked_loss(self, logits, labels):
        """Cross-entropy over the examples whose label is not ``-1``.

        Returns ``None`` when no labels are given or when every label is ``-1``,
        so the caller never averages over an empty selection.
        """
        if labels is None:
            return None
        annotated = labels != -1
        if not annotated.any():
            return None
        return self.loss_fct(logits[annotated], labels[annotated])

    def forward(self, input_ids, attention_mask, sentiment_label=None, sarcasm_label=None):
        """Run the shared encoder and the task head(s).

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            sentiment_label: Optional sentiment targets; ``-1`` means "not annotated".
            sarcasm_label: Optional sarcasm targets; ``-1`` means "not annotated".

        Returns:
            SequenceClassifierOutput. In single-task mode ``loss`` is ``None`` and
            ``logits`` are those of the selected head. Otherwise ``loss`` is the sum
            of the available per-task losses (``None`` if there is none) and
            ``logits`` stacks sentiment and sarcasm logits along dimension 1.

        Raises:
            ValueError: If ``self.task`` holds an unknown task name.
        """
        # Shared encoder
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs.pooler_output)

        # Test / Eval single task: only the requested head is evaluated.
        if self.task is not None:
            head_name = self.TASK_HEADS.get(self.task)
            if head_name is None:
                # Guards against `self.task` being assigned directly, bypassing set_test_task.
                raise ValueError(f"Invalid task: {self.task}")
            logits = getattr(self, head_name)(pooled)
            return SequenceClassifierOutput(loss=None, logits=logits)

        # Multi-task training: logits for the two tasks
        sentiment_logits = self.sentiment_classifier(pooled)
        sarcasm_logits = self.sarcasm_classifier(pooled)

        loss = None
        per_task = (
            (sentiment_logits, sentiment_label),
            (sarcasm_logits, sarcasm_label),
        )
        for task_logits, task_labels in per_task:
            task_loss = self._masked_loss(task_logits, task_labels)
            if task_loss is not None:
                loss = task_loss if loss is None else loss + task_loss

        # Return combined logits for multi-tasks
        combined_logits = torch.stack([sentiment_logits, sarcasm_logits], dim=1)

        return SequenceClassifierOutput(loss=loss, logits=combined_logits)


In [None]:
class MultiTaskDataset(torch.utils.data.Dataset):
    """Torch dataset yielding tokenized text together with both task labels.

    Args:
        df: DataFrame with the columns ``text``, ``sentiment_label`` and
            ``sarcasm_label``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=..., padding=...,
            max_length=..., return_tensors="pt")`` whose result exposes
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) and return the model inputs and labels."""
        row = self.df.iloc[idx]

        encoding = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze() would also
            # collapse the sequence dimension whenever max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # int() makes a missing (NaN) label fail loudly instead of being cast silently.
            "sentiment_label": torch.tensor(int(row["sentiment_label"]), dtype=torch.long),
            "sarcasm_label": torch.tensor(int(row["sarcasm_label"]), dtype=torch.long),
        }


Most reviews/comments have been classified for both Sentiment and Sarcasm.

I've modified the dataset so that each review/comment has two labels: sentiment_label and sarcasm_label.

Each label is 0 or 1, or -1 when the example is not annotated for that task. This way there are no duplicated texts, and the model can learn the correlation between the two tasks.

In [None]:
# Drop incomplete rows first. The original cell stored this result in a misspelled
# variable (`df_tain`), so the filtering was never applied to the frames built below.
df_train_clean = df_train.dropna()

# Split by task and give each task's label its own column name.
df_train_sentiment = df_train_clean[df_train_clean['task'] == 'Sentiment'].copy()
df_train_sarcasm = df_train_clean[df_train_clean['task'] == 'Sarcasm'].copy()
df_train_sentiment = df_train_sentiment.rename(columns={'label': 'sentiment_label'})
df_train_sarcasm = df_train_sarcasm.rename(columns={'label': 'sarcasm_label'})

# Outer merge: one row per (text, variety, source), keeping texts annotated for only one task.
df_train_merge = pd.merge(df_train_sentiment, df_train_sarcasm, on=['text', 'variety', 'source'], how="outer")
df_train_merge = df_train_merge.drop(columns=['task_x', 'task_y'])

# A label missing after the merge means "not annotated for this task" and is encoded as -1.
df_train_merge["sentiment_label"] = df_train_merge["sentiment_label"].fillna(-1).astype("int64")
df_train_merge["sarcasm_label"] = df_train_merge["sarcasm_label"].fillna(-1).astype("int64")

df_train_merge.head()

Unnamed: 0,text,sentiment_label,variety,source,sarcasm_label
0,\nThank you for capturing Indian culture and...,-1,en-IN,Reddit,0
1,10 year old establishment that used to serve ...,-1,en-IN,Google,0
2,One of the 2 popular Khara Mandakki places in...,1,en-IN,Google,0
3,""" BIG 5 "", "" Exotica Smoothie "" and "" Chicken ...",1,en-IN,Google,0
4,""" Baker Pride offers a delightful array of bak...",1,en-IN,Google,0


In [None]:
import sys

# Step 1: write a placeholder source file so the library has something it can
# "read" without crashing.
with open("dummy_model_def.py", "w") as f:
    f.write("pass")

# Step 2: make the __main__ module report that placeholder as its __file__.
setattr(sys.modules['__main__'], '__file__', "dummy_model_def.py")

In [None]:
# TRAINING
# One model is fine-tuned per (variety, source) group and saved under MODELS_DIR/<run_id>.
grouped_train = df_train_merge.groupby(['variety', 'source'])

print(f"Start training of {len(grouped_train)} combinations...")

for (variety, source), df_group in grouped_train:
    run_id = f"{variety}_{source}".replace(" ", "_")
    save_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTraining combination: {run_id} (Samples: {len(df_group)})")

    # Re-seed before every run so the freshly initialised classification heads do not
    # depend on how many groups were trained before this one.
    set_seed(SEED)

    # Setup Dataset
    train_ds = MultiTaskDataset(df_group, tokenizer)

    # Both heads are binary classifiers.
    num_labels = 2
    config = BertConfig.from_pretrained(MODEL, num_labels=num_labels)
    model = BertMultiTaskHF.from_pretrained(MODEL, config=config)

    # Setup Trainer (no checkpoints, no evaluation: only the final model is kept)
    training_args = TrainingArguments(
        output_dir=f"./checkpoints_temp/{run_id}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        save_strategy="no",
        eval_strategy="no",
        report_to="none",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds
    )

    trainer.train()

    # Save first, report afterwards: the message must not appear if saving fails.
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Saved in: {save_path}")

    # Memory cleaning: collect unreachable objects first so that their CUDA blocks
    # can actually be released by empty_cache().
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()


print("\nAll modells have been saved")

Start training of 6 combinations...

Training combination: en-AU_Google (Samples: 1235)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.438246
1000,0.127435
1500,0.027576


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-AU_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-AU_Reddit (Samples: 2289)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.913543
1000,0.375158
1500,0.134864
2000,0.03572
2500,0.006986


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-AU_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-IN_Google (Samples: 2135)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.426765
1000,0.16861
1500,0.050832
2000,0.010073
2500,0.005218


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-IN_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-IN_Reddit (Samples: 2204)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.7287
1000,0.296375
1500,0.087673
2000,0.020539
2500,0.005567


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-IN_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-UK_Google (Samples: 2367)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.247023
1000,0.051625
1500,0.022029
2000,0.003835
2500,0.002137


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-UK_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-UK_Reddit (Samples: 1324)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
sentiment_classifier.weight                | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.weight                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because miss

Step,Training Loss
500,0.605025
1000,0.1255
1500,0.0388


Saved in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_modelli/en-UK_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


All modells have been saved


### Test the models

In [None]:
# Collects one result dict per evaluated (variety, source, task) combination;
# filled by the evaluation loop and exported to CSV afterwards.
report_data = []

In [None]:
class Test_Dataset(torch.utils.data.Dataset):
    """Label-free torch dataset used for inference.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=..., padding=...,
            max_length=..., return_tensors="pt")`` whose result exposes
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) and return only the model inputs."""
        row = self.df.iloc[idx]
        encoding = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze() would also
            # collapse the sequence dimension whenever max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


In [None]:
# EVALUATION
# Each (variety, source, task) group is scored with the model trained on the same
# (variety, source) pair, using the head that matches the task.
grouped_val = df_test.groupby(['variety', 'source', 'task'])

# Start from an empty report so that re-running this cell does not duplicate rows.
report_data = []

print(f"Start evaluation of {len(grouped_val)} combinations...")

for (variety, source, task), df_group in grouped_val:
    run_id = f"{variety}_{source}".replace(" ", "_")
    model_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTesting combination: {run_id}_{task}")

    # Loading the right model
    if not os.path.exists(model_path):
        print(f"Model not found in {model_path}")
        report_data.append({
            "variety": variety, "source": source, "task": task,
            "status": "Model Missing"
        })
        continue

    tokenizer_for_eval = BertTokenizer.from_pretrained(model_path)
    eval_dataset = Test_Dataset(df_group, tokenizer_for_eval)

    model = BertMultiTaskHF.from_pretrained(model_path)
    # Setting the task to use the corresponding head
    model.set_test_task(task)

    args = TrainingArguments(
        output_dir="tmp",
        report_to="none",
        logging_strategy="no",
        per_device_eval_batch_size=BATCH_SIZE
    )

    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=eval_dataset,
    )

    results = trainer.predict(eval_dataset)

    # In single-task mode the model returns the logits of one head only.
    predictions = np.argmax(results.predictions, axis=-1)

    labels = df_group["label"]

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the values scikit-learn already reports for undefined
    # metrics (0.0) while silencing the warnings they used to trigger.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", pos_label=1, zero_division=0
    )

    print(f"   --> Accuracy: {accuracy:.4%} | F1: {f1:.4f}")

    # Saving results report
    report_data.append({
        "variety": variety,
        "source": source,
        "task": task,
        "status": "Success",
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "num_samples": len(df_group)
    })

    # Cleaning: collect unreachable objects first so their CUDA blocks can be released.
    del model, trainer, eval_dataset
    gc.collect()
    torch.cuda.empty_cache()

# Export final report
df_report = pd.DataFrame(report_data)
df_report.to_csv("report_performance_finale.csv", index=False)
print("\nTesting completed")

Start evaluation of 12 combinations...

Testing combination: en-AU_Google_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 89.2308% | F1: 0.0000

Testing combination: en-AU_Google_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 93.8462% | F1: 0.9588

Testing combination: en-AU_Reddit_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 74.2739% | F1: 0.6702

Testing combination: en-AU_Reddit_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 89.2116% | F1: 0.8375

Testing combination: en-IN_Google_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   --> Accuracy: 99.1111% | F1: 0.0000

Testing combination: en-IN_Google_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 84.4444% | F1: 0.9003

Testing combination: en-IN_Reddit_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 90.0000% | F1: 0.4651

Testing combination: en-IN_Reddit_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 85.6522% | F1: 0.7080

Testing combination: en-UK_Google_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   --> Accuracy: 100.0000% | F1: 0.0000

Testing combination: en-UK_Google_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 93.5484% | F1: 0.9574

Testing combination: en-UK_Reddit_Sarcasm


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 77.3050% | F1: 0.4286

Testing combination: en-UK_Reddit_Sentiment


Loading weights:   0%|          | 0/395 [00:00<?, ?it/s]

   --> Accuracy: 97.8261% | F1: 0.9032

Testing completed


In [None]:
# Rebuild the report table from the collected results and store a copy next to the models.
df_report = pd.DataFrame(report_data)
report_path = os.path.join(MODELS_DIR, "report_performance_MultiTask.csv")
df_report.to_csv(report_path, index=False)