In [1]:
from transformers import BertTokenizer, BertModel, BertConfig, PreTrainedModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import gc
import os
import numpy as np
import torch.nn as nn
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Data
DATA_DIR = "../Dataset"
MODELS_DIR = "../Bert_multiTask_dynamic_weighting_modelli"

# Model
MODEL = 'bert-large-uncased'
EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Reproducibility
SEED = 42
set_seed(SEED)

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Utilizzo device: {device}")

Utilizzo device: cuda


In [6]:
tokenizer = BertTokenizer.from_pretrained(MODEL)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
#load csv dataset
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/valid.csv")


In [8]:
report_data = []

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

In [10]:

class BertMultiTaskHF(PreTrainedModel):
    config_class = BertConfig

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.bert = BertModel(config)

        hidden_size = config.hidden_size

        # Classifiers for the two task
        self.sentiment_classifier = nn.Linear(hidden_size, config.num_labels)
        self.sarcasm_classifier = nn.Linear(hidden_size, config.num_labels)

        self.dropout = nn.Dropout(0.1)
        self.loss_fct = nn.CrossEntropyLoss()

        self.task = None
        self.post_init()

        # For dynamic weighting
        self.log_sigma_sent = nn.Parameter(torch.zeros(1))
        self.log_sigma_sarc = nn.Parameter(torch.zeros(1))


    def set_test_task(self, task_name):
        """Set the task for the single test"""
        self.task = task_name

    def forward(self, input_ids, attention_mask, sentiment_label=None, sarcasm_label=None):
        # shared encoder
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs.pooler_output)

        # Logits for the two task
        sentiment_logits = self.sentiment_classifier(pooled)
        sarcasm_logits = self.sarcasm_classifier(pooled)

        # Test / Eval single task
        if self.task is not None:
            if self.task == "Sentiment":
                logits = sentiment_logits
            elif self.task == "Sarcasm":
                logits = sarcasm_logits
            else:
                raise ValueError(f"Invalid task: {self.task}")

            return SequenceClassifierOutput(loss=None, logits=logits)


        # Multi-task training
        loss = 0.0
        loss_terms = 0

        # Loss sentiment
        if sentiment_label is not None:
            mask = sentiment_label != -1
            if mask.sum() > 0:
                sent_loss = self.loss_fct(
                    sentiment_logits[mask],
                    sentiment_label[mask]
                )
                loss += torch.exp(-self.log_sigma_sent) * sent_loss + self.log_sigma_sent
                loss_terms += 1

       # Loss sarcasm
        if sarcasm_label is not None:
            mask = sarcasm_label != -1
            if mask.sum() > 0:
                sarc_loss = self.loss_fct(
                    sarcasm_logits[mask],
                    sarcasm_label[mask]
                )
                loss += torch.exp(-self.log_sigma_sarc) * sarc_loss + self.log_sigma_sarc
                loss_terms += 1

        # If no label is present
        if loss_terms == 0:
            loss = None

        # return combined logits for multi-task
        combined_logits = torch.stack([sentiment_logits, sarcasm_logits], dim=1)

        return SequenceClassifierOutput(loss=loss, logits=combined_logits)


In [11]:
class MultiTaskDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        encoding = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "sentiment_label": torch.tensor(row["sentiment_label"], dtype=torch.long),
            "sarcasm_label": torch.tensor(row["sarcasm_label"], dtype=torch.long)
        }


In [12]:
from transformers import TrainerCallback

In [13]:
class SigmaPrinterCallback(TrainerCallback):
    def on_log(self, args, state, control, model=None, **kwargs):
        # This runs whenever the Trainer logs to console (usually every 50 or 500 steps)
        if model is not None:
            # We access the parameters directly from the model
            sent_sigma = model.log_sigma_sent.item()
            sarc_sigma = model.log_sigma_sarc.item()
            print(f"\n[Step {state.global_step}] Sigma Sent: {sent_sigma:.4f} | Sigma Sarc: {sarc_sigma:.4f}")

Most of the reviews/comments have been annotated for both Sentiment and Sarcasm.

I modify the dataset so that each review/comment has two labels: sentiment_label and sarcasm_label.

The labels can be 0, 1, or -1 if it was not annotated.
In this way, I avoid duplicates and the model can learn the correlation between the two tasks.

In [19]:
df_tain = df_train.dropna()
df_train_sentiment = df_train[df_train['task'] == 'Sentiment'].copy()
df_train_sarcasm = df_train[df_train['task'] == 'Sarcasm'].copy()
df_train_sentiment = df_train_sentiment.rename(columns={'label': 'sentiment_label'})
df_train_sarcasm = df_train_sarcasm.rename(columns={'label': 'sarcasm_label'})
df_train_merge = pd.merge(df_train_sentiment, df_train_sarcasm, on=['text', 'variety', 'source'], how="outer")
df_train_merge = df_train_merge.drop(columns=['task_x', 'task_y'])
df_train_merge["sentiment_label"] = df_train_merge["sentiment_label"].fillna(-1).astype("int64")
df_train_merge["sarcasm_label"] = df_train_merge["sarcasm_label"].fillna(-1).astype("int64")

df_train_merge.head()

Unnamed: 0,text,sentiment_label,variety,source,sarcasm_label
0,\nThank you for capturing Indian culture and...,-1,en-IN,Reddit,0
1,10 year old establishment that used to serve ...,-1,en-IN,Google,0
2,One of the 2 popular Khara Mandakki places in...,1,en-IN,Google,0
3,""" BIG 5 "", "" Exotica Smoothie "" and "" Chicken ...",1,en-IN,Google,0
4,""" Baker Pride offers a delightful array of bak...",1,en-IN,Google,0


In [20]:
import sys

# 1. Create a dummy file so the library can "read" it without crashing
with open("dummy_model_def.py", "w") as f:
    f.write("pass")

# 2. Mock the __file__ attribute for the __main__ module
sys.modules['__main__'].__file__ = "dummy_model_def.py"

In [21]:
# Select only Reddit
#df_train_merge = df_train_merge[df_train_merge['source'] == 'Reddit']


# TRAINING
grouped_train = df_train_merge.groupby(['variety', 'source'])

print(f"Start training of {len(grouped_train)} combinations...")

for (variety, source), df_group in grouped_train:
    run_id = f"{variety}_{source}".replace(" ", "_")
    save_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTraining combination: {run_id} (Samples: {len(df_group)})")

    # Setup Dataset
    train_ds = MultiTaskDataset(df_group, tokenizer)

    num_labels = 2
    config = BertConfig.from_pretrained(MODEL, num_labels=2)
    model = BertMultiTaskHF.from_pretrained(MODEL, config=config)

    with torch.no_grad():
            # Reset the uncertainty weights to 0 (equal weight)
            model.log_sigma_sent.fill_(0.0)
            model.log_sigma_sarc.fill_(0.0)


    # Setup Trainer
    training_args = TrainingArguments(
        output_dir=f"./checkpoints_temp/{run_id}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        save_strategy="no",
        eval_strategy="no",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        callbacks=[SigmaPrinterCallback]
    )

    trainer.train()


    print(f"Salvataggio in: {save_path}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


    # Clean memory
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()


print("\nAll modells have been saved")

Start training of 6 combinations...

Training combination: en-AU_Google (Samples: 1235)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.339772
1000,0.028475
1500,-0.029007



[Step 500] Sigma Sent: -0.0084 | Sigma Sarc: -0.0082

[Step 1000] Sigma Sent: -0.0150 | Sigma Sarc: -0.0148

[Step 1500] Sigma Sent: -0.0173 | Sigma Sarc: -0.0171

[Step 1550] Sigma Sent: -0.0173 | Sigma Sarc: -0.0171
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-AU_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-AU_Reddit (Samples: 2289)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.951741
1000,0.459715
1500,0.132793
2000,-0.014075
2500,-0.054967



[Step 500] Sigma Sent: -0.0092 | Sigma Sarc: -0.0090

[Step 1000] Sigma Sent: -0.0166 | Sigma Sarc: -0.0163

[Step 1500] Sigma Sent: -0.0251 | Sigma Sarc: -0.0249

[Step 2000] Sigma Sent: -0.0303 | Sigma Sarc: -0.0301

[Step 2500] Sigma Sent: -0.0329 | Sigma Sarc: -0.0327

[Step 2870] Sigma Sent: -0.0334 | Sigma Sarc: -0.0332
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-AU_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-IN_Google (Samples: 2135)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.4175
1000,0.256722
1500,0.05405
2000,-0.009789
2500,-0.048488



[Step 500] Sigma Sent: -0.0082 | Sigma Sarc: -0.0086

[Step 1000] Sigma Sent: -0.0153 | Sigma Sarc: -0.0157

[Step 1500] Sigma Sent: -0.0216 | Sigma Sarc: -0.0220

[Step 2000] Sigma Sent: -0.0255 | Sigma Sarc: -0.0259

[Step 2500] Sigma Sent: -0.0272 | Sigma Sarc: -0.0276

[Step 2670] Sigma Sent: -0.0273 | Sigma Sarc: -0.0277
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-IN_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-IN_Reddit (Samples: 2204)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.740767
1000,0.270767
1500,0.027362
2000,-0.03925
2500,-0.053696



[Step 500] Sigma Sent: -0.0089 | Sigma Sarc: -0.0083

[Step 1000] Sigma Sent: -0.0171 | Sigma Sarc: -0.0163

[Step 1500] Sigma Sent: -0.0249 | Sigma Sarc: -0.0241

[Step 2000] Sigma Sent: -0.0294 | Sigma Sarc: -0.0286

[Step 2500] Sigma Sent: -0.0315 | Sigma Sarc: -0.0307

[Step 2760] Sigma Sent: -0.0317 | Sigma Sarc: -0.0310
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-IN_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-UK_Google (Samples: 2367)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.222441
1000,0.021846
1500,-0.02782
2000,-0.048592
2500,-0.056112



[Step 500] Sigma Sent: -0.0098 | Sigma Sarc: -0.0098

[Step 1000] Sigma Sent: -0.0180 | Sigma Sarc: -0.0180

[Step 1500] Sigma Sent: -0.0242 | Sigma Sarc: -0.0242

[Step 2000] Sigma Sent: -0.0284 | Sigma Sarc: -0.0284

[Step 2500] Sigma Sent: -0.0309 | Sigma Sarc: -0.0309

[Step 2960] Sigma Sent: -0.0316 | Sigma Sarc: -0.0316
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-UK_Google


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Training combination: en-UK_Reddit (Samples: 1324)


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

BertMultiTaskHF LOAD REPORT from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
sarcasm_classifier.weight                  | MISSING    | 
sentiment_classifier.bias                  | MISSING    | 
sarcasm_classifier.bias                    | MISSING    | 
log_sigma_sarc                             | MISSING    | 
log_sigma_sent                             | MISSING    | 
sentiment_classifier.weight                | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from differe

Step,Training Loss
500,0.59176
1000,0.044408
1500,-0.02202



[Step 500] Sigma Sent: -0.0084 | Sigma Sarc: -0.0081

[Step 1000] Sigma Sent: -0.0170 | Sigma Sarc: -0.0168

[Step 1500] Sigma Sent: -0.0200 | Sigma Sarc: -0.0198

[Step 1660] Sigma Sent: -0.0202 | Sigma Sarc: -0.0200
Salvataggio in: /content/drive/MyDrive/DNLP project - Figurative Language Understanding/Bert_multiTask_dynamic_weighting_modelli/en-UK_Reddit


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


All modells have been saved


### Test the models

In [24]:
class Test_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoding = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )


        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
        }


In [None]:
# EVALUATION
grouped_val = df_test.groupby(['variety', 'source', 'task'])

print(f"Start evaluation on {len(grouped_val)} combinations...")

for (variety, source, task), df_group in grouped_val:
    run_id = f"{variety}_{source}".replace(" ", "_")
    model_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTesting combination: {run_id}_{task}")

    if not os.path.exists(model_path):
        print(f"Model not found in {model_path}")
        report_data.append({
            "variety": variety, "source": source, "task": task,
            "status": "Model Missing"
        })
        continue


    tokenizer_for_eval = BertTokenizer.from_pretrained(model_path)
    eval_dataset = Test_Dataset(df_group, tokenizer_for_eval)

    model = BertMultiTaskHF.from_pretrained(model_path)

    model.set_test_task(task)

    args = TrainingArguments(
    output_dir="tmp",
    report_to="none",   # niente wandb
    logging_strategy="no",
    per_device_eval_batch_size=BATCH_SIZE
    )

    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=eval_dataset,
    )


    results = trainer.predict(eval_dataset)

    predictions = np.argmax(results.predictions, axis=-1)

    labels = df_group["label"]


    from sklearn.metrics import accuracy_score, f1_score
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predictions,
        average="binary",
        pos_label=1
    )


    print(f"   --> Accuracy: {accuracy:.4%} | F1: {f1:.4f}")

    # Saving results report
    report_data.append({
        "variety": variety,
        "source": source,
        "task": task,
        "status": "Success",
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "num_samples": len(df_group)
    })

    # Clean Memory
    del model, trainer, eval_dataset
    torch.cuda.empty_cache()

# Export final report
df_report = pd.DataFrame(report_data)
df_report.to_csv(f"{MODELS_DIR}/report_performance_finale.csv", index=False)
print("\nTesting completed")

Start evaluation on 12 combinations...

Testing combination: en-AU_Google_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 90.0000% | F1: 0.0000

Testing combination: en-AU_Google_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 94.6154% | F1: 0.9648

Testing combination: en-AU_Reddit_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 73.4440% | F1: 0.6923

Testing combination: en-AU_Reddit_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 91.2863% | F1: 0.8645

Testing combination: en-IN_Google_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   --> Accuracy: 99.1111% | F1: 0.0000

Testing combination: en-IN_Google_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 84.0000% | F1: 0.8983

Testing combination: en-IN_Reddit_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 90.8696% | F1: 0.5116

Testing combination: en-IN_Reddit_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 83.4783% | F1: 0.6724

Testing combination: en-UK_Google_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


   --> Accuracy: 100.0000% | F1: 0.0000

Testing combination: en-UK_Google_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 93.1452% | F1: 0.9547

Testing combination: en-UK_Reddit_Sarcasm


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 73.7589% | F1: 0.3273

Testing combination: en-UK_Reddit_Sentiment


Loading weights:   0%|          | 0/397 [00:00<?, ?it/s]

   --> Accuracy: 96.3768% | F1: 0.8276

Testing completed
