In [None]:
# Hub dataset repository, and the value handed to load_dataset as its second argument.
DATASET_NAME = 'UC3M-LCPM/DIPROMATS_2024'
DATASET_SPLIT = 'task1b_bal'

In [None]:
# Hub repository ids under which the fine-tuned models are published
# (used as hub_model_id and as the local output sub-directory).
EN_MODEL_NAME = 'UC3M-LCPM/XLNet_base_cased_task1b_en_bal'
ES_MODEL_NAME = 'UC3M-LCPM/Robertuito_sentiment_analysis_task1b_es_bal'

In [None]:
# Columns kept for training: the input text plus the three label columns.
SELECTED_COLUMNS = [
    "text",
    "1 appeal to commonality",
    "2 discrediting the opponent",
    "3 loaded language",
]

In [None]:
# Label names in the order that defines the classifier's output indices.
labels = [
    "1 appeal to commonality",
    "2 discrediting the opponent",
    "3 loaded language",
]
n_labels = len(labels)

# Index <-> name lookups handed to the model configuration.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

# Login to drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Change the working directory to the project folder on Drive; relative paths
# used later (e.g. "modelos/...") resolve against it.
os.chdir('/content/drive/My Drive/Colab Notebooks/Modelos/')

Mounted at /content/drive


# Install dependencies

In [None]:
!pip install -q transformers[torch] datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m


# Login to HuggingFace

In [None]:
from huggingface_hub import HfFolder, notebook_login

# Interactive Hugging Face Hub login widget; the token is read back later
# through HfFolder.get_token() when the training arguments are built.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load dataset

In [None]:
from datasets import load_dataset, Dataset

# Download the dataset from the Hub. The result is indexed by split name
# ('train', 'val', 'test') in the cells below.
# NOTE(review): despite its name, DATASET_SPLIT is passed as the second positional
# argument and all three splits are still produced — presumably it is the dataset
# configuration name rather than a split selector; confirm.
dataset = load_dataset(DATASET_NAME, DATASET_SPLIT)

Downloading data:   0%|          | 0.00/3.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/418k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/829k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17927 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1826 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [None]:
# Keep only the rows whose '0 not propagandistic' value equals 0, in every split.
for split_name in ('train', 'val', 'test'):
    dataset[split_name] = dataset[split_name].filter(
        lambda example: example['0 not propagandistic'] == 0
    )

Filter:   0%|          | 0/17927 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1826 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [None]:
# Sanity check of the filter above: show the '0 not propagandistic' value of the
# first ten training rows (all are expected to be 0).
dataset['train'][:10]['0 not propagandistic']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# Define metrics function

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw model outputs.

    Args:
        predictions: Array-like of logits of shape (batch_size, num_labels).
        labels: Array-like of reference labels, one column per label
            (assumed to hold 0/1 values — TODO confirm against the dataset).
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1' (micro F1), 'roc_auc' (micro ROC AUC) and
        'accuracy' (exact-match accuracy over whole label rows).
    """
    # Local import: the notebook only imports numpy in a later cell, so the
    # function would otherwise depend on cell execution order.
    import numpy as np

    # Sigmoid turns each logit into an independent per-label probability.
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions)).detach().cpu().numpy()

    # Hard 0/1 decisions for the threshold-based metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = np.asarray(labels)

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded decisions, which would discard the ranking information.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapter handed to the Trainer: extract the logits and score them.

    When ``p.predictions`` is a tuple only its first element is used; the
    reference labels are taken from ``p.label_ids``. Returns the dictionary
    produced by ``multi_label_metrics``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

# SPANISH MODEL

### Clean dataset columns

In [None]:
# Every training-split column that is not listed in SELECTED_COLUMNS.
COLS_TO_REMOVE = [
    name
    for name in dataset['train'].column_names
    if name not in SELECTED_COLUMNS
]

In [None]:
# Spanish training rows, reduced to the text and label columns.
dataset_train_es = (
    dataset['train']
    .filter(lambda example: example['language'] == 'es')
    .remove_columns(COLS_TO_REMOVE)
)
dataset_train_es

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 3495
})

In [None]:
# Spanish validation rows, reduced to the text and label columns.
dataset_val_es = (
    dataset['val']
    .filter(lambda example: example['language'] == 'es')
    .remove_columns(COLS_TO_REMOVE)
)
dataset_val_es

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 164
})

### Load model

In [None]:
from transformers import (AutoConfig,
                          AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

# Fix the random seed before any model weights are created.
set_seed(123)

# Hyper-parameters read by the tokenization and training cells below.
MAX_LENGTH = 60   # passed to the tokenizer as max_length
batch_size = 32
epochs = 4

# Checkpoint that is fine-tuned for Spanish.
model_name_or_path = "pysentimiento/robertuito-sentiment-analysis"

In [None]:
# Configuration for a multi-label head with one output per entry in `labels`.
model_config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=n_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

tokenizer_es = AutoTokenizer.from_pretrained(model_name_or_path)

# NOTE(review): this assignment replaces whatever pad token the tokenizer shipped
# with by its EOS token; the fallback below can only run when the tokenizer has
# no EOS token either.
tokenizer_es.pad_token = tokenizer_es.eos_token
if tokenizer_es.pad_token is None:
    tokenizer_es.add_special_tokens({'pad_token': '[PAD]'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint with the multi-label configuration; ignore_mismatched_sizes
# lets loading proceed when checkpoint weights do not match the configured shapes.
model_es = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    ignore_mismatched_sizes=True,
    config=model_config,
).to(device)

# Resize the embedding matrix to the tokenizer's vocabulary size (matters when a
# new pad token was added to the tokenizer).
model_es.resize_token_embeddings(len(tokenizer_es))

# Take the padding id from the tokenizer itself so model and tokenizer always
# agree, including when the tokenizer had to add a dedicated '[PAD]' token.
model_es.config.pad_token_id = tokenizer_es.pad_token_id

### Tokenize datasets

In [None]:
import numpy as np

def tokenize(examples):
  """Tokenize a batch of rows with the Spanish tokenizer and attach labels.

  Args:
    examples: Batch mapping column name -> list of values; must contain
      "text" and one column per entry in the global `labels` list.

  Returns:
    The tokenizer output for the batch (padded/truncated to MAX_LENGTH) with
    an extra "labels" entry: one list of floats per row, ordered like `labels`.
  """
  text = examples["text"]
  # Return plain Python lists and stay on the CPU: the mapped dataset stores the
  # values itself and is switched to torch format afterwards, so creating CUDA
  # tensors here only added a GPU round-trip and failed on machines without a GPU.
  encoding = tokenizer_es(text, padding="max_length", truncation=True, max_length=MAX_LENGTH)
  # Label matrix of shape (batch_size, num_labels), one column per label.
  labels_matrix = np.zeros((len(text), len(labels)))
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = examples[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [None]:
# Tokenize the Spanish training set in batches; the original columns are dropped so
# only the tokenizer outputs and "labels" remain. The last line displays the result.
encoded_data_train_es = dataset_train_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_train_es

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [None]:
# Tokenize the Spanish validation set in batches; the original columns are dropped so
# only the tokenizer outputs and "labels" remain. The last line displays the result.
encoded_data_val_es = dataset_val_es.map(tokenize, batched=True, remove_columns=dataset_val_es.column_names)
encoded_data_val_es

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [None]:
# Make both encoded Spanish datasets return torch tensors when indexed.
for encoded_split in (encoded_data_train_es, encoded_data_val_es):
    encoded_split.set_format("torch")

### Configure model

In [None]:
from transformers import TrainingArguments

training_args_es = TrainingArguments(
    # Output locations (relative to the current working directory).
    output_dir=f"modelos/{ES_MODEL_NAME}",
    logging_dir=f"modelos/{ES_MODEL_NAME}/logs",
    report_to="tensorboard",
    # Optimisation.
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    # NOTE(review): the recorded run of this notebook ended at global_step=440,
    # i.e. fewer steps than this warm-up length — confirm this is intended.
    warmup_steps=500,
    # Logging, evaluation and checkpointing cadence.
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # NOTE(review): no metric_for_best_model is given; the recorded evaluate()
    # output matches the epoch with the lowest validation loss, not the highest
    # F1 — confirm that is the intended selection criterion.
    load_best_model_at_end=True,
    # Hugging Face Hub upload.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=ES_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model_es,                       # model to be fine-tuned
    args=training_args_es,                # hyper-parameters
    train_dataset=encoded_data_train_es,  # training set
    eval_dataset=encoded_data_val_es,     # validation set
    tokenizer=tokenizer_es,
    compute_metrics=compute_metrics,      # function that computes the metrics
)

In [None]:
# Fine-tune the Spanish model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.3769,0.391504,0.733333,0.789078,0.542683
2,0.2121,0.369428,0.773481,0.820824,0.664634
3,0.1094,0.461833,0.767123,0.816001,0.670732
4,0.0337,0.510246,0.813187,0.85257,0.731707


TrainOutput(global_step=440, training_loss=0.2848302341320298, metrics={'train_runtime': 55.8338, 'train_samples_per_second': 250.386, 'train_steps_per_second': 7.881, 'total_flos': 431053778887200.0, 'train_loss': 0.2848302341320298, 'epoch': 4.0})

In [None]:
# Evaluate the Spanish model on the validation set passed as eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 0.36942845582962036,
 'eval_f1': 0.7734806629834253,
 'eval_roc_auc': 0.8208239327778863,
 'eval_accuracy': 0.6646341463414634,
 'eval_runtime': 0.2177,
 'eval_samples_per_second': 753.499,
 'eval_steps_per_second': 27.567,
 'epoch': 4.0}

### Push model to HuggingFace

In [None]:
# Upload the weights currently held by model_es to the ES_MODEL_NAME Hub repository (private).
model_es.push_to_hub(ES_MODEL_NAME, private=True)

model.safetensors:   0%|          | 0.00/435M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Robertuito_sentiment_analysis_task1b_es_bal/commit/d0072b5c37071bf586c3b198bb7ba2ce0aad56ab', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='d0072b5c37071bf586c3b198bb7ba2ce0aad56ab', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the Spanish tokenizer files to the same private Hub repository.
tokenizer_es.push_to_hub(ES_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Robertuito_sentiment_analysis_task1b_es_bal/commit/868b4bc3796f49bddd7ede2e6c15e5b7e89c64fd', commit_message='Upload tokenizer', commit_description='', oid='868b4bc3796f49bddd7ede2e6c15e5b7e89c64fd', pr_url=None, pr_revision=None, pr_num=None)

# ENGLISH MODEL

### Clean dataset columns

In [None]:
# Every training-split column that is not listed in SELECTED_COLUMNS.
COLS_TO_REMOVE = [
    name
    for name in dataset['train'].column_names
    if name not in SELECTED_COLUMNS
]

In [None]:
# English training rows, reduced to the text and label columns.
dataset_train_en = (
    dataset['train']
    .filter(lambda example: example['language'] == 'en')
    .remove_columns(COLS_TO_REMOVE)
)
dataset_train_en

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 5911
})

In [None]:
# English validation rows, reduced to the text and label columns.
dataset_val_en = (
    dataset['val']
    .filter(lambda example: example['language'] == 'en')
    .remove_columns(COLS_TO_REMOVE)
)
dataset_val_en

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 242
})

### Load model

In [None]:
from transformers import (XLNetConfig,
                          XLNetTokenizer,
                          XLNetForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

# Fix the random seed before any model weights are created.
set_seed(123)

# Hyper-parameters read by the tokenization and training cells below.
MAX_LENGTH = 60   # passed to the tokenizer as max_length
batch_size = 32
epochs = 4

# Checkpoint that is fine-tuned for English.
model_name_or_path = "xlnet/xlnet-base-cased"

In [None]:
# Configuration for a multi-label head with one output per entry in `labels`.
model_config = XLNetConfig.from_pretrained(
    model_name_or_path,
    num_labels=n_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

tokenizer_en = XLNetTokenizer.from_pretrained(model_name_or_path)

# NOTE(review): this assignment replaces whatever pad token the tokenizer shipped
# with by its EOS token; the fallback below can only run when the tokenizer has
# no EOS token either.
tokenizer_en.pad_token = tokenizer_en.eos_token
if tokenizer_en.pad_token is None:
    tokenizer_en.add_special_tokens({'pad_token': '[PAD]'})



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint with the multi-label configuration defined above.
model_en = XLNetForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=model_config,
).to(device)

# Resize the embedding matrix to the tokenizer's vocabulary size (matters when a
# new pad token was added to the tokenizer).
model_en.resize_token_embeddings(len(tokenizer_en))

# Take the padding id from the tokenizer itself so model and tokenizer always
# agree, including when the tokenizer had to add a dedicated '[PAD]' token.
model_en.config.pad_token_id = tokenizer_en.pad_token_id

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [None]:
import numpy as np

def tokenize(examples):
  """Tokenize a batch of rows with the English tokenizer and attach labels.

  Args:
    examples: Batch mapping column name -> list of values; must contain
      "text" and one column per entry in the global `labels` list.

  Returns:
    The tokenizer output for the batch (padded/truncated to MAX_LENGTH) with
    an extra "labels" entry: one list of floats per row, ordered like `labels`.
  """
  text = examples["text"]
  # Return plain Python lists and stay on the CPU: the mapped dataset stores the
  # values itself and is switched to torch format afterwards, so creating CUDA
  # tensors here only added a GPU round-trip and failed on machines without a GPU.
  encoding = tokenizer_en(text, padding="max_length", truncation=True, max_length=MAX_LENGTH)
  # Label matrix of shape (batch_size, num_labels), one column per label.
  labels_matrix = np.zeros((len(text), len(labels)))
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = examples[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [None]:
# Tokenize the ENGLISH training set. This cell previously mapped dataset_train_es,
# which made the English model train on the Spanish rows.
encoded_data_train_en = dataset_train_en.map(tokenize, batched=True, remove_columns=dataset_train_en.column_names)
encoded_data_train_en

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [None]:
# Tokenize the ENGLISH validation set. This cell previously mapped dataset_val_es
# (and took the column list from dataset_train_es), so the English model was
# evaluated on Spanish rows.
encoded_data_val_en = dataset_val_en.map(tokenize, batched=True, remove_columns=dataset_val_en.column_names)
encoded_data_val_en

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [None]:
# Make both encoded English datasets return torch tensors when indexed.
for encoded_split in (encoded_data_train_en, encoded_data_val_en):
    encoded_split.set_format("torch")

### Configure model

In [None]:
from transformers import TrainingArguments

training_args_en = TrainingArguments(
    # Output locations (relative to the current working directory).
    output_dir=f"modelos/{EN_MODEL_NAME}",
    logging_dir=f"modelos/{EN_MODEL_NAME}/logs",
    report_to="tensorboard",
    # Optimisation.
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    # NOTE(review): the recorded run of this notebook ended at global_step=440,
    # below this warm-up length — re-check against the actual number of steps.
    warmup_steps=500,
    # Logging, evaluation and checkpointing cadence.
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # NOTE(review): no metric_for_best_model is given; the recorded evaluate()
    # output matches the epoch with the lowest validation loss — confirm that is
    # the intended selection criterion.
    load_best_model_at_end=True,
    # Hugging Face Hub upload.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=EN_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model_en,                       # model to be fine-tuned
    args=training_args_en,                # hyper-parameters
    train_dataset=encoded_data_train_en,  # training set
    eval_dataset=encoded_data_val_en,     # validation set
    tokenizer=tokenizer_en,
    compute_metrics=compute_metrics,      # function that computes the metrics
)

In [None]:
# Fine-tune the English model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6042,0.594513,0.524691,0.641559,0.47561
2,0.422,0.574244,0.600536,0.680775,0.347561
3,0.3238,0.503037,0.719547,0.778481,0.628049
4,0.211,0.723606,0.694678,0.75894,0.621951


TrainOutput(global_step=440, training_loss=0.43982757898894226, metrics={'train_runtime': 73.7994, 'train_samples_per_second': 189.432, 'train_steps_per_second': 5.962, 'total_flos': 466717890708000.0, 'train_loss': 0.43982757898894226, 'epoch': 4.0})

In [None]:
# Evaluate the English model on the validation set passed as eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 0.5030366778373718,
 'eval_f1': 0.7195467422096317,
 'eval_roc_auc': 0.778481462400739,
 'eval_accuracy': 0.6280487804878049,
 'eval_runtime': 0.2781,
 'eval_samples_per_second': 589.791,
 'eval_steps_per_second': 21.578,
 'epoch': 4.0}

### Push model to HuggingFace

In [None]:
# Upload the weights currently held by model_en to the EN_MODEL_NAME Hub repository (private).
model_en.push_to_hub(EN_MODEL_NAME, private=True)

model.safetensors:   0%|          | 0.00/469M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/XLNet_base_cased_task1b_en_bal/commit/050b09205e8546089162c10a687d4d6bf39ea5d4', commit_message='Upload XLNetForSequenceClassification', commit_description='', oid='050b09205e8546089162c10a687d4d6bf39ea5d4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the English tokenizer files to the same private Hub repository.
tokenizer_en.push_to_hub(EN_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/XLNet_base_cased_task1b_en_bal/commit/af841b85db3819b15d5de1d22596b4dd39b5643e', commit_message='Upload tokenizer', commit_description='', oid='af841b85db3819b15d5de1d22596b4dd39b5643e', pr_url=None, pr_revision=None, pr_num=None)