In [1]:
DATASET_NAME = "UC3M-LCPM/DIPROMATS_2024"
DATASET_SPLIT = "task1a_bal"

In [2]:
ES_MODEL_NAME = "UC3M-LCPM/Roberta_large_task1a_es_bal"
EN_MODEL_NAME = "UC3M-LCPM/Roberta_large_task1a_en_bal"

In [3]:
SELECTED_COLUMNS = ['label', 'text']

In [4]:
labels_ids = {'False': 0, 'True': 1}
n_labels = len(labels_ids)

# Login to drive

In [5]:
from google.colab import drive
# mount your google drive
drive.mount('/content/drive')

# cambiamos el directorio de trabajo a la carpeta donde está almacenado el dataset
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/Modelos/')

Mounted at /content/drive


# Install dependencies

In [6]:
!pip install -q transformers[torch] datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m


# Login to HuggingFace

In [8]:
from huggingface_hub import HfFolder, notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load dataset

In [9]:
from datasets import load_dataset, Dataset

dataset = load_dataset(DATASET_NAME, DATASET_SPLIT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.87M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/517k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17512 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/2179 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2180 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.rename_column('label_task1', 'label')

In [11]:
dataset["train"]

Dataset({
    features: ['test_case', 'id', 'country', 'username', 'tweet_type', 'tweet_id', 'UTC', 'rts&fav', 'language', 'text', 'label', 'label_task2', '0 not propagandistic', '1 appeal to commonality', '2 discrediting the opponent', '3 loaded language', 'label_task3', '__index_level_0__'],
    num_rows: 17512
})

# Define metrics function

In [27]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# SPANISH MODEL

### Clean dataset columns

In [16]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [17]:
dataset_train_es = dataset['train'].filter(lambda example: example['language'] == 'es')
dataset_train_es = dataset_train_es.remove_columns(COLS_TO_REMOVE)
dataset_train_es

Filter:   0%|          | 0/17512 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 7429
})

In [54]:
dataset_val_es = dataset['val'].filter(lambda example: example['language'] == 'es')
dataset_val_es = dataset_val_es.remove_columns(COLS_TO_REMOVE)
dataset_val_es

Dataset({
    features: ['text', 'label'],
    num_rows: 893
})

### Load model

In [48]:
from transformers import (AutoConfig,
                          AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 32

MAX_LENGTH = 60

model_name_or_path = "PlanTL-GOB-ES/roberta-large-bne"

In [49]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

tokenizer_es = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_es.pad_token = tokenizer_es.eos_token



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [50]:
model_es = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_es.resize_token_embeddings(len(tokenizer_es))

# fix model padding token id
model_es.config.pad_token_id = model_es.config.eos_token_id

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [55]:
tokenized_ids = [labels_ids[str(label)] for label in dataset_train_es["label"]]
print(tokenized_ids)
dataset_train_es = dataset_train_es.remove_columns("label").add_column("label", tokenized_ids)

[1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 

In [56]:
tokenized_ids = [labels_ids[str(label)] for label in dataset_val_es["label"]]
print(tokenized_ids)
dataset_val_es = dataset_val_es.remove_columns("label").add_column("label", tokenized_ids)

[0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 

In [58]:
def tokenize(examples):
    return tokenizer_es(examples["text"], return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")

In [59]:
encoded_data_train = dataset_train_es.map(tokenize, batched=True)
encoded_data_train

Map:   0%|          | 0/7429 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 7429
})

In [60]:
encoded_data_val = dataset_val_es.map(tokenize, batched=True)
encoded_data_val

Map:   0%|          | 0/893 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 893
})

### Configure model

In [61]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="modelos/" + ES_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{ES_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=ES_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [62]:
from transformers import Trainer

trainer = Trainer(
    model = model_es,            # modelo que será ajustado
    train_dataset = encoded_data_train, # conjunto training
    eval_dataset = encoded_data_val,   # conjunto de validación

    args = training_args,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [63]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.217,0.421994,0.851064,0.752231,0.803947,0.724892
2,0.1735,0.873791,0.842105,0.710575,0.818922,0.676979
3,0.0826,0.655931,0.854423,0.784367,0.78883,0.780221
4,0.0117,0.685094,0.863382,0.799893,0.801012,0.798795


TrainOutput(global_step=932, training_loss=0.1772163181356854, metrics={'train_runtime': 1330.7447, 'train_samples_per_second': 22.33, 'train_steps_per_second': 0.7, 'total_flos': 3245305358514240.0, 'train_loss': 0.1772163181356854, 'epoch': 4.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.5291606783866882,
 'eval_accuracy': 0.8376259798432251,
 'eval_f1': 0.7746774207581808,
 'eval_precision': 0.763253493013972,
 'eval_recall': 0.7896304863408778,
 'eval_runtime': 9.6137,
 'eval_samples_per_second': 92.888,
 'eval_steps_per_second': 2.913,
 'epoch': 4.0}

### Push model to HuggingFace

In [64]:
model_es.push_to_hub(ES_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1a_es_bal/commit/dd7d13c79988d8f4681615745fa9013cd3e7ec95', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='dd7d13c79988d8f4681615745fa9013cd3e7ec95', pr_url=None, pr_revision=None, pr_num=None)

In [65]:
tokenizer_es.push_to_hub(ES_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1a_es_bal/commit/aeb9826bbb2ad079ec3c245ab126e13cd2308a4a', commit_message='Upload tokenizer', commit_description='', oid='aeb9826bbb2ad079ec3c245ab126e13cd2308a4a', pr_url=None, pr_revision=None, pr_num=None)

# ENGLISH MODEL

### Clean dataset columns

In [13]:
COLS_TO_REMOVE = [column for column in dataset['train'].column_names if column not in SELECTED_COLUMNS]

In [14]:
dataset_train_en = dataset['train'].filter(lambda example: example['language'] == 'en')
dataset_train_en = dataset_train_en.remove_columns(COLS_TO_REMOVE)
dataset_train_en

Filter:   0%|          | 0/17512 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 10083
})

In [15]:
dataset_val_en = dataset['val'].filter(lambda example: example['language'] == 'en')
dataset_val_en = dataset_val_en.remove_columns(COLS_TO_REMOVE)
dataset_val_en

Filter:   0%|          | 0/2179 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 1286
})

### Load model

In [16]:
from transformers import (RobertaConfig,
                          RobertaTokenizerFast,
                          RobertaForSequenceClassification,
                          TrainingArguments,
                          pipeline,
                          set_seed)

set_seed(123)

epochs = 4
batch_size = 8

MAX_LENGTH = 60

model_name_or_path = "roberta-large"

In [17]:
model_config = RobertaConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

tokenizer_en = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer_en.pad_token = tokenizer_en.eos_token



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
model_en = RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config).to("cuda")

# resize model embedding to match new tokenizer
model_en.resize_token_embeddings(len(tokenizer_en))

# fix model padding token id
model_en.config.pad_token_id = model_en.config.eos_token_id

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [19]:
tokenized_ids = [labels_ids[str(label)] for label in dataset_train_en["label"]]
print(tokenized_ids)
dataset_train_en = dataset_train_en.remove_columns("label").add_column("label", tokenized_ids)

[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 

Flattening the indices:   0%|          | 0/10083 [00:00<?, ? examples/s]

In [20]:
tokenized_ids = [labels_ids[str(label)] for label in dataset_val_en["label"]]
print(tokenized_ids)
dataset_val_en = dataset_val_en.remove_columns("label").add_column("label", tokenized_ids)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 

Flattening the indices:   0%|          | 0/1286 [00:00<?, ? examples/s]

In [21]:
def tokenize(examples):
    return tokenizer_en(examples["text"], return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH).to("cuda")

In [22]:
encoded_data_train = dataset_train_en.map(tokenize, batched=True)
encoded_data_train

Map:   0%|          | 0/10083 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 10083
})

In [23]:
encoded_data_val = dataset_val_en.map(tokenize, batched=True)
encoded_data_val

Map:   0%|          | 0/1286 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1286
})

### Configure model

In [28]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="modelos/" + EN_MODEL_NAME,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_dir=f"modelos/{EN_MODEL_NAME}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=EN_MODEL_NAME,
    hub_token=HfFolder.get_token(),
)

### Train model

In [29]:
from transformers import Trainer

trainer = Trainer(
    model = model_en,            # modelo que será ajustado
    train_dataset = encoded_data_train, # conjunto training
    eval_dataset = encoded_data_val,   # conjunto de validación

    args = training_args,     # hiperparámetros
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6708,0.78414,0.216952,0.047068,0.216952,0.077354
2,0.7223,0.717445,0.216952,0.047068,0.216952,0.077354
3,0.7118,0.78511,0.216952,0.047068,0.216952,0.077354
4,0.6776,0.736504,0.216952,0.047068,0.216952,0.077354


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=5044, training_loss=0.6931827810432682, metrics={'train_runtime': 668.8176, 'train_samples_per_second': 60.303, 'train_steps_per_second': 7.542, 'total_flos': 4404686220204480.0, 'train_loss': 0.6931827810432682, 'epoch': 4.0})

In [31]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7174453735351562,
 'eval_accuracy': 0.21695178849144633,
 'eval_precision': 0.04706807852963727,
 'eval_recall': 0.21695178849144633,
 'eval_f1': 0.07735405621611952,
 'eval_runtime': 4.9085,
 'eval_samples_per_second': 261.996,
 'eval_steps_per_second': 32.8,
 'epoch': 4.0}

### Push model to HuggingFace

In [32]:
model_en.push_to_hub(EN_MODEL_NAME, private=True)

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1a_en_bal/commit/e8203698e8901ff0ce125172203eb26f7fc74c99', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='e8203698e8901ff0ce125172203eb26f7fc74c99', pr_url=None, pr_revision=None, pr_num=None)

In [33]:
tokenizer_en.push_to_hub(EN_MODEL_NAME, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UC3M-LCPM/Roberta_large_task1a_en_bal/commit/a7c1748e13370c4ff9296ada9b881ee13932988c', commit_message='Upload tokenizer', commit_description='', oid='a7c1748e13370c4ff9296ada9b881ee13932988c', pr_url=None, pr_revision=None, pr_num=None)