This notebook fine tunes the GPT2-base model for task1b in DIPROMATS 2024.

Notices that it ignores all the tweets that are non-propagandistic and performs multilabel classification.

In [80]:
DATASET_NAME = "UC3M-LCPM/DIPROMATS_2024"
DATASET_SPLIT = "task1b_bal"

In [81]:
ES_MODEL_NAME = "UC3M-LCPM/Beto_base_uncased_task1b_es_bal"
EN_MODEL_NAME = "UC3M-LCPM/Bert_base_uncased_task1b_en_bal"

In [82]:
SELECTED_COLUMNS = ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language']

In [83]:
labels = ['1 appeal to commonality', '2 discrediting the opponent', '3 loaded language']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
n_labels = len(labels)

# Login to drive

In [5]:
from google.colab import drive
# mount your google drive
drive.mount('/content/drive')

# cambiamos el directorio de trabajo a la carpeta donde está almacenado el dataset
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/Modelos/')

Mounted at /content/drive


# Install dependencies

In [6]:
!pip install -q transformers[torch] datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m


# Login to HuggingFace

In [7]:
from huggingface_hub import HfFolder, notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load dataset

In [84]:
from datasets import load_dataset, Dataset

dataset = load_dataset(DATASET_NAME, DATASET_SPLIT)

Downloading data:   0%|          | 0.00/3.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/418k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/829k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17927 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1826 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [85]:
dataset['train'] = dataset['train'].filter(lambda example: example['0 not propagandistic'] == 0)
dataset['val'] = dataset['val'].filter(lambda example: example['0 not propagandistic'] == 0)
dataset['test'] = dataset['test'].filter(lambda example: example['0 not propagandistic'] == 0)

Filter:   0%|          | 0/17927 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1826 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [86]:
dataset['train'][:10]['0 not propagandistic']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# Define metrics function

In [87]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

# SPANISH MODEL

### Clean dataset columns

In [88]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [89]:
dataset_train_es = dataset['train'].filter(lambda example: example['language'] == 'es')
dataset_train_es = dataset_train_es.remove_columns(COLS_TO_REMOVE)
dataset_train_es

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 3495
})

In [90]:
dataset_val_es = dataset['val'].filter(lambda example: example['language'] == 'es')
dataset_val_es = dataset_val_es.remove_columns(COLS_TO_REMOVE)
dataset_val_es

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 164
})

### Load model

In [91]:
from transformers import (AutoConfig,
                          AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 32

MAX_LENGTH = 60

model_name_or_path = "dccuchile/bert-base-spanish-wwm-uncased"

In [92]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                          problem_type="multi_label_classification",
                                          num_labels=n_labels,
                                          id2label=id2label,
                                          label2id=label2id)

tokenizer_es = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_es.pad_token = tokenizer_es.eos_token
if tokenizer_es.pad_token is None:
    tokenizer_es.add_special_tokens({'pad_token': '[PAD]'})



In [93]:
model_es = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_es.resize_token_embeddings(len(tokenizer_es))

# fix model padding token id
model_es.config.pad_token_id = model_es.config.eos_token_id

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [94]:
import numpy as np

def tokenize(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer_es(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [95]:
encoded_data_train_es = dataset_train_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_train_es

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [96]:
encoded_data_val_es = dataset_val_es.map(tokenize, batched=True, remove_columns=dataset_val_es.column_names)
encoded_data_val_es

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [97]:
encoded_data_train_es.set_format("torch")
encoded_data_val_es.set_format("torch")

### Configure model

In [98]:
from transformers import TrainingArguments

training_args_es = TrainingArguments(
    output_dir="modelos/" + ES_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{ES_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=ES_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [99]:
from transformers import Trainer

trainer = Trainer(
    model = model_es,            # modelo que será ajustado
    train_dataset = encoded_data_train_es, # conjunto training
    eval_dataset = encoded_data_val_es,   # conjunto de validación
    tokenizer = tokenizer_es,
    args = training_args_es,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [100]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.3601,0.396712,0.777174,0.824288,0.615854
2,0.2209,0.402374,0.794595,0.838553,0.676829
3,0.1687,0.490189,0.77095,0.818515,0.689024
4,0.0307,0.518991,0.8,0.842222,0.707317


TrainOutput(global_step=440, training_loss=0.2716231940822168, metrics={'train_runtime': 69.381, 'train_samples_per_second': 201.496, 'train_steps_per_second': 6.342, 'total_flos': 431053778887200.0, 'train_loss': 0.2716231940822168, 'epoch': 4.0})

In [101]:
trainer.evaluate()

{'eval_loss': 0.3967120051383972,
 'eval_f1': 0.7771739130434783,
 'eval_roc_auc': 0.8242880744701638,
 'eval_accuracy': 0.6158536585365854,
 'eval_runtime': 0.2171,
 'eval_samples_per_second': 755.345,
 'eval_steps_per_second': 27.635,
 'epoch': 4.0}

### Push model to HuggingFace

In [102]:
model_es.push_to_hub(ES_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Beto_base_uncased_task1b_es_bal/commit/96c3d273996bc82a485c61c4d03b6acd39874fd7', commit_message='Upload BertForSequenceClassification', commit_description='', oid='96c3d273996bc82a485c61c4d03b6acd39874fd7', pr_url=None, pr_revision=None, pr_num=None)

In [103]:
tokenizer_es.push_to_hub(ES_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Beto_base_uncased_task1b_es_bal/commit/2ae39f5486067ed3d742f43c1778774ed5d8f191', commit_message='Upload tokenizer', commit_description='', oid='2ae39f5486067ed3d742f43c1778774ed5d8f191', pr_url=None, pr_revision=None, pr_num=None)

# ENGLISH MODEL

### Clean dataset columns

In [104]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [105]:
dataset_train_en = dataset['train'].filter(lambda example: example['language'] == 'en')
dataset_train_en = dataset_train_en.remove_columns(COLS_TO_REMOVE)
dataset_train_en

Filter:   0%|          | 0/9406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 5911
})

In [106]:
dataset_val_en = dataset['val'].filter(lambda example: example['language'] == 'en')
dataset_val_en = dataset_val_en.remove_columns(COLS_TO_REMOVE)
dataset_val_en

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Dataset({
    features: ['text', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language'],
    num_rows: 242
})

### Load model

In [107]:
from transformers import (BertConfig,
                          BertTokenizer,
                          BertForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 32

MAX_LENGTH = 60

model_name_or_path = "google-bert/bert-base-uncased"

In [108]:
model_config = BertConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                          problem_type="multi_label_classification",
                                          num_labels=n_labels,
                                          id2label=id2label,
                                          label2id=label2id)

tokenizer_en = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_en.pad_token = tokenizer_en.eos_token
if tokenizer_en.pad_token is None:
    tokenizer_en.add_special_tokens({'pad_token': '[PAD]'})



In [109]:
model_en = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_en.resize_token_embeddings(len(tokenizer_en))

# fix model padding token id
model_en.config.pad_token_id = model_en.config.eos_token_id

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [110]:
import numpy as np

def tokenize(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer_en(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [111]:
encoded_data_train_en = dataset_train_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_train_en

Map:   0%|          | 0/3495 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3495
})

In [112]:
encoded_data_val_en = dataset_val_es.map(tokenize, batched=True, remove_columns=dataset_train_es.column_names)
encoded_data_val_en

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [113]:
encoded_data_train_en.set_format("torch")
encoded_data_val_en.set_format("torch")

### Configure model

In [114]:
from transformers import TrainingArguments

training_args_en = TrainingArguments(
    output_dir="modelos/" + EN_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{EN_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=EN_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [115]:
from transformers import Trainer

trainer = Trainer(
    model = model_en,            # modelo que será ajustado
    train_dataset = encoded_data_train_en, # conjunto training
    eval_dataset = encoded_data_val_en,   # conjunto de validación
    tokenizer = tokenizer_en,
    args = training_args_en,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [116]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6016,0.599582,0.469314,0.629719,0.365854
2,0.4123,0.462856,0.70317,0.766277,0.597561
3,0.3119,0.51087,0.683196,0.749294,0.567073
4,0.2308,0.553605,0.703601,0.76562,0.609756


TrainOutput(global_step=440, training_loss=0.4475182517008348, metrics={'train_runtime': 62.2047, 'train_samples_per_second': 224.742, 'train_steps_per_second': 7.073, 'total_flos': 431053778887200.0, 'train_loss': 0.4475182517008348, 'epoch': 4.0})

In [117]:
trainer.evaluate()

{'eval_loss': 0.46285614371299744,
 'eval_f1': 0.7031700288184438,
 'eval_roc_auc': 0.7662770247464071,
 'eval_accuracy': 0.5975609756097561,
 'eval_runtime': 0.219,
 'eval_samples_per_second': 748.692,
 'eval_steps_per_second': 27.391,
 'epoch': 4.0}

### Push model to HuggingFace

In [118]:
model_en.push_to_hub(EN_MODEL_NAME, private=True)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Bert_base_uncased_task1b_en_bal/commit/1d06a43c3b7b8d74584bab22dca5e2efe47042a2', commit_message='Upload BertForSequenceClassification', commit_description='', oid='1d06a43c3b7b8d74584bab22dca5e2efe47042a2', pr_url=None, pr_revision=None, pr_num=None)

In [119]:
tokenizer_en.push_to_hub(EN_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Bert_base_uncased_task1b_en_bal/commit/f1b785828139dbbfea020f2f0f451b419e6f7358', commit_message='Upload tokenizer', commit_description='', oid='f1b785828139dbbfea020f2f0f451b419e6f7358', pr_url=None, pr_revision=None, pr_num=None)