This notebook fine tunes the GPT2-base model for task1b in DIPROMATS 2024.

Notices that it ignores all the tweets that are non-propagandistic and performs multilabel classification.

In [1]:
DATASET_NAME = "UC3M-LCPM/DIPROMATS_2024"
DATASET_SPLIT = "task1b_bal"

In [2]:
ES_MODEL_NAME = "UC3M-LCPM/Roberta_large_task1b_es_bal"
EN_MODEL_NAME = "UC3M-LCPM/Roberta_large_task1b_en_bal"

In [3]:
SELECTED_COLUMNS = ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language']

In [4]:
labels = ['1 appeal to commonality', '2 discrediting the opponent', '3 loaded language']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
n_labels = len(labels)

# Login to drive

In [5]:
from google.colab import drive
# mount your google drive
drive.mount('/content/drive')

# cambiamos el directorio de trabajo a la carpeta donde está almacenado el dataset
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/Modelos/')

Mounted at /content/drive


# Install dependencies

In [6]:
!pip install -q transformers[torch] datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m


# Login to HuggingFace

In [7]:
from huggingface_hub import HfFolder, notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load dataset

In [8]:
from datasets import load_dataset, Dataset

dataset = load_dataset(DATASET_NAME, DATASET_SPLIT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/418k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/829k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17927 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1826 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [9]:
dataset['train'] = dataset['train'].filter(lambda example: example['0 not propagandistic'] == 0)
dataset['val'] = dataset['val'].filter(lambda example: example['0 not propagandistic'] == 0)
dataset['test'] = dataset['test'].filter(lambda example: example['0 not propagandistic'] == 0)

Filter:   0%|          | 0/17927 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1826 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [10]:
dataset['train'][:10]['0 not propagandistic']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# Define metrics function

In [11]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

# SPANISH MODEL

### Clean dataset columns

In [12]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [13]:
dataset_train_es = dataset['train'].filter(lambda example: example['language'] == 'es')
dataset_train_es = dataset_train_es.remove_columns(COLS_TO_REMOVE)
dataset_train_es

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 3495
})

In [14]:
dataset_val_es = dataset['val'].filter(lambda example: example['language'] == 'es')
dataset_val_es = dataset_val_es.remove_columns(COLS_TO_REMOVE)
dataset_val_es

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 164
})

### Load model

In [15]:
from transformers import (AutoConfig,
                          AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 32

MAX_LENGTH = 60

model_name_or_path = "PlanTL-GOB-ES/roberta-large-bne"

In [16]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                          problem_type="multi_label_classification",
                                          num_labels=n_labels,
                                          id2label=id2label,
                                          label2id=label2id)

tokenizer_es = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_es.pad_token = tokenizer_es.eos_token



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [17]:
model_es = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_es.resize_token_embeddings(len(tokenizer_es))

# fix model padding token id
model_es.config.pad_token_id = model_es.config.eos_token_id

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [18]:
import numpy as np

def tokenize(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer_es(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [19]:
encoded_data_train_es = dataset_train_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_train_es

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [20]:
encoded_data_val_es = dataset_val_es.map(tokenize, batched=True, remove_columns=dataset_val_es.column_names)
encoded_data_val_es

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [21]:
encoded_data_train_es.set_format("torch")
encoded_data_val_es.set_format("torch")

### Configure model

In [22]:
from transformers import TrainingArguments

training_args_es = TrainingArguments(
    output_dir="modelos/" + ES_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{ES_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=ES_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [23]:
from transformers import Trainer

trainer = Trainer(
    model = model_es,            # modelo que será ajustado
    train_dataset = encoded_data_train_es, # conjunto training
    eval_dataset = encoded_data_val_es,   # conjunto de validación
    tokenizer = tokenizer_es,
    args = training_args_es,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.2544,0.380694,0.751445,0.802846,0.682927
2,0.1028,0.477185,0.787709,0.831625,0.707317
3,0.0288,0.682185,0.780899,0.8261,0.70122
4,0.0277,0.716678,0.782123,0.827255,0.713415


TrainOutput(global_step=440, training_loss=0.17707324282012202, metrics={'train_runtime': 196.6628, 'train_samples_per_second': 71.086, 'train_steps_per_second': 2.237, 'total_flos': 1526770837447200.0, 'train_loss': 0.17707324282012202, 'epoch': 4.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.5762932300567627,
 'eval_f1': 0.5333333333333333,
 'eval_roc_auc': 0.6582935105078965,
 'eval_accuracy': 0.4146341463414634,
 'eval_runtime': 0.7193,
 'eval_samples_per_second': 227.984,
 'eval_steps_per_second': 8.341,
 'epoch': 4.0}

### Push model to HuggingFace

In [26]:
model_es.push_to_hub(ES_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1b_es_bal/commit/4613c0884fd42b39a6ecd33fff825437c5263d21', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='4613c0884fd42b39a6ecd33fff825437c5263d21', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
tokenizer_es.push_to_hub(ES_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1b_es_bal/commit/47b4978617eba416b508ee98646dc645fc69815b', commit_message='Upload tokenizer', commit_description='', oid='47b4978617eba416b508ee98646dc645fc69815b', pr_url=None, pr_revision=None, pr_num=None)

# ENGLISH MODEL

### Clean dataset columns

In [28]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [29]:
dataset_train_en = dataset['train'].filter(lambda example: example['language'] == 'en')
dataset_train_en = dataset_train_en.remove_columns(COLS_TO_REMOVE)
dataset_train_en

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 5911
})

In [30]:
dataset_val_en = dataset['val'].filter(lambda example: example['language'] == 'en')
dataset_val_en = dataset_val_en.remove_columns(COLS_TO_REMOVE)
dataset_val_en

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 242
})

### Load model

In [31]:
import torch
from transformers import (
    RobertaConfig,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
                          pipeline,
                          set_seed
)

In [32]:
from transformers import (RobertaConfig,
                          RobertaTokenizerFast,
                          RobertaForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 32

MAX_LENGTH = 60

model_name_or_path = "roberta-large"

In [33]:
model_config = RobertaConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                          problem_type="multi_label_classification",
                                          num_labels=n_labels,
                                          id2label=id2label,
                                          label2id=label2id)

tokenizer_en = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_en.pad_token = tokenizer_en.eos_token



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [34]:
model_en = RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_en.resize_token_embeddings(len(tokenizer_en))

# fix model padding token id
model_en.config.pad_token_id = model_en.config.eos_token_id

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [35]:
import numpy as np

def tokenize(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer_en(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [36]:
encoded_data_train_en = dataset_train_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_train_en

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [37]:
encoded_data_val_en = dataset_val_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_val_en

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [38]:
encoded_data_train_en.set_format("torch")
encoded_data_val_en.set_format("torch")

### Configure model

In [39]:
from transformers import TrainingArguments

training_args_en = TrainingArguments(
    output_dir="modelos/" + EN_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{EN_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=EN_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [40]:
from transformers import Trainer

trainer = Trainer(
    model = model_en,            # modelo que será ajustado
    train_dataset = encoded_data_train_en, # conjunto training
    eval_dataset = encoded_data_val_en,   # conjunto de validación
    tokenizer = tokenizer_en,
    args = training_args_en,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.4661,0.487363,0.70557,0.766117,0.518293
2,0.3041,0.527921,0.699454,0.761951,0.597561
3,0.2011,0.473172,0.775623,0.822432,0.689024
4,0.1804,0.495814,0.78453,0.829564,0.682927


TrainOutput(global_step=440, training_loss=0.3521891677921469, metrics={'train_runtime': 190.1428, 'train_samples_per_second': 73.524, 'train_steps_per_second': 2.314, 'total_flos': 1526770837447200.0, 'train_loss': 0.3521891677921469, 'epoch': 4.0})

In [42]:
trainer.evaluate()

{'eval_loss': 0.4731716513633728,
 'eval_f1': 0.7756232686980609,
 'eval_roc_auc': 0.8224316498196869,
 'eval_accuracy': 0.6890243902439024,
 'eval_runtime': 0.5275,
 'eval_samples_per_second': 310.924,
 'eval_steps_per_second': 11.375,
 'epoch': 4.0}

### Push model to HuggingFace

In [43]:
model_en.push_to_hub(EN_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1b_en_bal/commit/7ef13b8a1c403f95014626b4de2ba5a66ff387a4', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='7ef13b8a1c403f95014626b4de2ba5a66ff387a4', pr_url=None, pr_revision=None, pr_num=None)

In [44]:
tokenizer_en.push_to_hub(EN_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1b_en_bal/commit/edc4cc4e72045507e6f4f9c933c5cb69f95a4e4a', commit_message='Upload tokenizer', commit_description='', oid='edc4cc4e72045507e6f4f9c933c5cb69f95a4e4a', pr_url=None, pr_revision=None, pr_num=None)