In [None]:
# Mount Google Drive; every data and model path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Configuración (importar dependencias, librerías, ...)

In [None]:
!pip install --upgrade accelerate



In [None]:
# Hyperparameters
# model_checkpoint = 'xlm-roberta-base'
# model_checkpoint = 'bert-base-uncased'
# model_checkpoint = 'roberta-base'

# Active checkpoint; the commented-out assignments are alternatives.
model_checkpoint = 'dccuchile/bert-base-spanish-wwm-uncased'
# model_checkpoint = 'xlm-roberta-base'
# model_checkpoint = 'PlanTL-GOB-ES/roberta-base-bne'
# model_checkpoint = 'PlanTL-GOB-ES/roberta-large-bne'

BATCH_SIZE = 16  # used as both the per-device train and eval batch size
NUM_TRAIN_EPOCHS = 10  # upper bound; the Trainer also gets an early-stopping callback
LEARNING_RATE = 5e-5
MAX_LENGTH = 128  # max_length passed to the tokenizer (with truncation)
WEIGHT_DECAY = 0.1

In [None]:
# Set the seed value everywhere to make the experiments reproducible.
# This has to run right before the remaining libraries are imported so that
# the seeds are already in place when they are loaded.

!pip install pytorch-lightning
import random
import torch
import numpy as np
import os
from pytorch_lightning import seed_everything

# Single source of truth for the seed: every RNG below is seeded from seed_val.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["TF_DETERMINISTIC_OPS"] = "1" # See: https://github.com/NVIDIA/tensorflow-determinism#confirmed-current-gpu-specific-sources-of-non-determinism-with-solutions
# Pass seed_val (previously a hard-coded 42) so that changing the seed above
# changes it here as well.
seed_everything(seed_val, workers=True)

!pip install transformers datasets
# # !pip install --upgrade accelerate
!pip install sentencepiece
!pip install contractions
!pip install textblob
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback



INFO:lightning_fabric.utilities.seed:Seed set to 42




In [None]:
# !pip install accelerate -U
# !pip install optuna

In [None]:
import torch

# Report which GPU the runtime exposes (if any) and how much memory it has.
if torch.cuda.is_available():
    device_index = 0

    gpu_name = torch.cuda.get_device_name(device_index)
    gpu_capability = torch.cuda.get_device_capability(device_index)
    gpu_memory_info = torch.cuda.get_device_properties(device_index)

    # mem_get_info returns (free_bytes, total_bytes) for the device. The previous
    # "total - torch.cuda.memory_allocated()" only subtracted tensors allocated by
    # this process, so it did not reflect the memory actually free on the device.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device_index)

    print(f"GPU Name: {gpu_name}")
    print(f"GPU Capability: {gpu_capability[0]}.{gpu_capability[1]}")
    print(f"Total GPU Memory: {gpu_memory_info.total_memory / (1024**3):.2f} GB")
    print(f"Free GPU Memory: {free_bytes / (1024**3):.2f} GB")
else:
    print("CUDA not available. Make sure your Jupyter Notebook is running with a GPU kernel.")

GPU Name: Tesla T4
GPU Capability: 7.5
Total GPU Memory: 14.75 GB
Free GPU Memory: 14.75 GB


In [None]:
# Tell the user whether PyTorch sees a GPU or will fall back to the CPU.
if torch.cuda.device_count() == 0:
    print("Currently using CPU, change the type of the runtime in the 'runtime' tab")
else:
    print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')

GPU detected. Currently using: "Tesla T4"


# Preparación de los datos

## Lectura de los ficheros

In [None]:
# Name of the class-label column (used for stratification and by set_labels).
nombre_etiqueta = 'label'
# Name of the column holding the text that gets tokenized.
campo_texto = 'text'

### DIVISIÓN DEL TRAIN (NO EJECUTAR)

In [None]:
# Load the full training CSV.
train_path = '/content/drive/MyDrive/TFG/train.csv'
train_df = pd.read_csv(train_path, encoding = 'UTF-8', sep=',')

# Columns carried over to the per-corpus files.
columns_to_keep = ['id', 'text', 'stereotype_a1', 'stereotype_a2', 'stereotype_a3',
                   'stereotype', 'stereotype_soft', 'implicit_a1', 'implicit_a2',
                   'implicit_a3', 'implicit', 'implicit_soft']

# One frame per value of the "source" column, restricted to columns_to_keep.
is_detests = train_df['source'] == 'detests'
is_stereohoax = train_df['source'] == 'stereohoax'
train_detests_df = train_df.loc[is_detests, columns_to_keep]
train_stereohoax_df = train_df.loc[is_stereohoax, columns_to_keep]

# Write each corpus to its own CSV file.
train_detests_df.to_csv("/content/drive/MyDrive/TFG/Task 1/Detests/detests_train.csv", index=False)
train_stereohoax_df.to_csv("/content/drive/MyDrive/TFG/Task 1/Stereohoax/stereohoax_train.csv", index=False)

In [None]:
train_detests_path = '/content/drive/MyDrive/TFG/Task 1/Detests/detests_train.csv'
train_stereohoax_path = '/content/drive/MyDrive/TFG/Task 1/Stereohoax/stereohoax_train.csv'

# The 'stereotype' column is exposed as 'label' in both frames.
renombrar_estereotipo = {'stereotype': 'label'}

train_detests_full = pd.read_csv(train_detests_path, sep=',', encoding='UTF-8')
train_detests_full = train_detests_full.rename(columns=renombrar_estereotipo)

train_stereohoax_full = pd.read_csv(train_stereohoax_path, sep=',', encoding='UTF-8')
train_stereohoax_full = train_stereohoax_full.rename(columns=renombrar_estereotipo)

In [None]:
# Train/valid/test split
def _split_train_valid_test(full_df, label_col, seed):
    """Split ``full_df`` into stratified train (70%), valid (20%) and test (10%) frames.

    Args:
        full_df: DataFrame to split; must contain ``label_col``.
        label_col: Name of the column used for stratification.
        seed: Value passed as ``random_state`` to both splits, so the partition
            no longer depends on the global NumPy RNG state (and therefore on
            which cells were executed before this one).

    Returns:
        Tuple ``(train_df, valid_df, test_df)``.
    """
    # First cut: 70% train, 30% held out.
    train_part, holdout = train_test_split(
        full_df, test_size=0.3, shuffle=True,
        stratify=full_df[[label_col]], random_state=seed)
    # Second cut: one third of the hold-out (10% overall) becomes the test set.
    valid_part, test_part = train_test_split(
        holdout, test_size=1/3, shuffle=True,
        stratify=holdout[[label_col]], random_state=seed)
    return train_part, valid_part, test_part

# NOTE(review): the CSVs already saved on Drive were presumably produced without an
# explicit random_state, so re-running this cell may not reproduce those exact files.
train_detests_df, valid_detests_df, test_detests_df = _split_train_valid_test(
    train_detests_full, nombre_etiqueta, seed_val)

train_stereohoax_df, valid_stereohoax_df, test_stereohoax_df = _split_train_valid_test(
    train_stereohoax_full, nombre_etiqueta, seed_val)

In [None]:
# Save every split to its CSV file.
particiones_a_guardar = [
    (train_detests_df, '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/train_detests_df.csv'),
    (valid_detests_df, '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/valid_detests_df.csv'),
    (test_detests_df, '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/test_detests_df.csv'),
    (train_stereohoax_df, '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/train_stereohoax_df.csv'),
    (valid_stereohoax_df, '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/valid_stereohoax_df.csv'),
    (test_stereohoax_df, '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/test_stereohoax_df.csv'),
]
for particion_df, ruta_csv in particiones_a_guardar:
    particion_df.to_csv(ruta_csv, index=False)

In [None]:
# Columns kept in the reduced files: id, text and the 'stereotype_a1' label
# (presumably annotator 1's label — confirm against the dataset description).
columns_to_keep = ['id', 'text', 'stereotype_a1']

# DETESTS: location of the full splits
train_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/train_detests_df.csv'
valid_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/valid_detests_df.csv'
test_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Dividido/test_detests_df.csv'

# STEREOHOAX: location of the full splits
train_stereohoax_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/train_stereohoax_df.csv'
valid_stereohoax_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/valid_stereohoax_df.csv'
test_stereohoax_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Dividido/test_stereohoax_df.csv'

# Read each split and project it onto columns_to_keep in one step.
train_detests_df_a1 = pd.read_csv(train_detests_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]
valid_detests_df_a1 = pd.read_csv(valid_detests_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]
test_detests_df_a1 = pd.read_csv(test_detests_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]

train_stereohoax_df_a1 = pd.read_csv(train_stereohoax_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]
valid_stereohoax_df_a1 = pd.read_csv(valid_stereohoax_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]
test_stereohoax_df_a1 = pd.read_csv(test_stereohoax_df_a1_path, encoding='UTF-8', sep=',')[columns_to_keep]

# Save the reduced frames to separate CSV files.
# NOTE(review): these files are written directly under ".../Baseline/Datos/", while the
# loading cell further down reads the Detests files from ".../Baseline/Datos/Train/",
# ".../Valid/" and ".../Test/" — confirm which layout is intended.
reducidos_a_guardar = (
    (train_detests_df_a1, "/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/train_detests_df_a1.csv"),
    (valid_detests_df_a1, "/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/valid_detests_df_a1.csv"),
    (test_detests_df_a1, "/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/test_detests_df_a1.csv"),
    (train_stereohoax_df_a1, "/content/drive/MyDrive/TFG/Task 1/Stereohoax/Baseline/Datos/train_stereohoax_df_a1.csv"),
    (valid_stereohoax_df_a1, "/content/drive/MyDrive/TFG/Task 1/Stereohoax/Baseline/Datos/valid_stereohoax_df_a1.csv"),
    (test_stereohoax_df_a1, "/content/drive/MyDrive/TFG/Task 1/Stereohoax/Baseline/Datos/test_stereohoax_df_a1.csv"),
)
for reducido_df, ruta_destino in reducidos_a_guardar:
    reducido_df.to_csv(ruta_destino, index=False)

### Cargamos los datos de entrenamiento/validacion/test reducidos

In [None]:
train_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/Train/train_detests_df_a1.csv'
valid_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/Valid/valid_detests_df_a1.csv'
test_detests_df_a1_path = '/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Datos/Test/test_detests_df_a1.csv'

# 'stereotype_a1' is exposed as the generic 'label' column in all three frames.
renombrar_a1 = {'stereotype_a1': 'label'}
train_detests_df_a1 = pd.read_csv(train_detests_df_a1_path, sep=',', encoding='UTF-8').rename(columns=renombrar_a1)
valid_detests_df_a1 = pd.read_csv(valid_detests_df_a1_path, sep=',', encoding='UTF-8').rename(columns=renombrar_a1)
test_detests_df_a1 = pd.read_csv(test_detests_df_a1_path, sep=',', encoding='UTF-8').rename(columns=renombrar_a1)

# Report how many examples each split holds.
for descripcion, particion in (
    ("Ejemplos usados para entrenar: ", train_detests_df_a1),
    ("Ejemplos usados para validar: ", valid_detests_df_a1),
    ("Ejemplos usados para test: ", test_detests_df_a1),
):
    print(descripcion, len(particion))

Ejemplos usados para entrenar:  3940
Ejemplos usados para validar:  1126
Ejemplos usados para test:  563


## Limpieza de datos

In [None]:
#@title DETESTS
# Esto solo es para ver la longitud (en palabras) de los tweets
def divide(texto):
  """Split ``texto`` on whitespace and return the resulting list of words."""
  palabras = texto.split()
  return palabras

def cuenta_tokens(lista):
  """Return the number of elements in ``lista``."""
  total = len(lista)
  return total

# Word-count view of the training texts. It is built from train_detests_df_a1 (loaded
# in the "reduced data" cell above) instead of train_detests_df, which is only defined
# when the do-not-run split cells or the later preprocessed-CSV cell have been executed.
train_df_palabras = train_detests_df_a1.copy()
train_df_palabras['text_split'] = train_df_palabras[campo_texto].apply(divide)
train_df_palabras['num_palabras'] = train_df_palabras['text_split'].apply(cuenta_tokens)
train_df_palabras

Unnamed: 0,text,label,text_split,num_palabras
0,Salir a la calle no sirve de nada.,0,"[Salir, a, la, calle, no, sirve, de, nada.]",8
1,Siento ser tan crudo pero es lo que observo.,0,"[Siento, ser, tan, crudo, pero, es, lo, que, o...",9
2,600000 ilegales y no pasa nada.,1,"[600000, ilegales, y, no, pasa, nada.]",6
3,"ese tío no es un delicuente, en su cultura may...",1,"[ese, tío, no, es, un, delicuente,, en, su, cu...",20
4,Y todavía hay gente que vota a estos individuo...,0,"[Y, todavía, hay, gente, que, vota, a, estos, ...",9
...,...,...,...,...
3935,Hay que hacerlo ya!!... hay que exigirlo ya!!.,0,"[Hay, que, hacerlo, ya!!..., hay, que, exigirl...",8
3936,"¿Como que es su único objetivo, o crees que re...",0,"[¿Como, que, es, su, único, objetivo,, o, cree...",16
3937,En un par de años (si es que llega) en la call...,1,"[En, un, par, de, años, (si, es, que, llega), ...",18
3938,Toda la razón.,0,"[Toda, la, razón.]",3


In [None]:
# Length (in words) of the longest text. The value is stored in max_palabras rather
# than `max`, so the built-in max() is not shadowed for the rest of the session.
max_palabras = train_df_palabras['num_palabras'].max()
print(f'El texto de mayor longitud tiene {max_palabras} palabras')

El texto de mayor longitud tiene 181 palabras


In [None]:
# Funciones de limpieza
import re

def remove_links(tweet):
    """Return ``tweet`` with web links and link placeholders removed.

    Removes, in order: anything starting with ``http``, ``bit.ly/...`` links,
    the ``[link]`` and ``[url]`` placeholders, and ``pic.twitter...`` links.
    The dots in ``bit.ly`` and ``pic.twitter`` are escaped so they match a
    literal dot only (unescaped, they matched any character).
    """
    tweet = re.sub(r'http\S+', '', tweet)           # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)       # remove bitly links
    tweet = re.sub(r'\[link\]', '', tweet)          # remove [link]
    tweet = re.sub(r'\[url\]', '', tweet)           # remove [url]
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)   # remove pic.twitter links
    return tweet

def remove_users(tweet):
    """Return ``tweet`` without retweet markers, @mentions and ``[user]`` placeholders.

    Patterns are raw strings (``\\s`` in a normal string is an invalid escape
    sequence), and the retweet pattern accepts a leading underscore in the handle,
    like the plain @mention pattern does. A handle needs at least two characters
    to match.
    """
    tweet = re.sub(r'(RT\s@[A-Za-z_]+[A-Za-z0-9_-]+)', '', tweet)  # remove re-tweet
    tweet = re.sub(r'(@[A-Za-z_]+[A-Za-z0-9_-]+)', '', tweet)      # remove tweeted at
    tweet = re.sub(r'\[user\]', '', tweet)                         # remove [user]
    return tweet

def remove_hashtags(tweet):
    """Return ``tweet`` with hashtags removed.

    A hashtag is ``#`` followed by one or more letters and then one or more
    word characters or hyphens (so at least two characters after ``#``).
    Letters are matched as Unicode letters: with the previous ASCII-only class,
    ``#España`` was cut at the first non-ASCII letter and left ``ña`` behind.
    """
    # [^\W\d_] = any word character that is neither a digit nor an underscore, i.e. a letter.
    tweet = re.sub(r'(#[^\W\d_]+[\w-]+)', '', tweet)      # remove hash tags
    return tweet

def remove_av(tweet):
    """Return ``tweet`` with every 'VIDEO:' and then every 'AUDIO:' marker removed."""
    for marcador in ('VIDEO:', 'AUDIO:'):
        tweet = re.sub(marcador, '', tweet)
    return tweet

def remove_emojis(tweet):
    """Return ``tweet`` without the characters covered by the emoji character class.

    The original list of ranges overlaps heavily; its union is exactly U+231B plus
    everything from U+24C2 up to U+10FFFF, which is what the class below spells out.
    That span is much wider than emoji alone (it also contains, for example, CJK
    characters and the U+FE0F variation selector), so those are stripped as well.
    """
    patron_emojis = re.compile(
        "["
        "\u231B"                    # the single code point listed on its own
        "\U000024C2-\U0010FFFF"     # union of every range in the original list
        "]+",
        re.UNICODE,
    )
    return patron_emojis.sub('', tweet)


# Función de eliminación de contracción
import contractions
def expand_contraction(tweet):
    """Return ``tweet`` after running it through ``contractions.fix``.

    NOTE(review): the contractions package presumably targets English text —
    confirm it has any effect on these Spanish texts.
    """
    return contractions.fix(tweet)

In [None]:
#@title LIMPIEZA DETESTS
# train_detests_df[campo_texto] = train_detests_df[campo_texto].str.lower()
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].str.lower()

# train_detests_df[campo_texto] = train_detests_df[campo_texto].apply(remove_links)
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].apply(remove_links)

# train_detests_df[campo_texto] = train_detests_df[campo_texto].apply(remove_users)
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].apply(remove_users)

# train_detests_df[campo_texto] = train_detests_df[campo_texto].apply(remove_hashtags)
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].apply(remove_hashtags)

# train_detests_df[campo_texto] = train_detests_df[campo_texto].apply(expand_contraction)
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].apply(expand_contraction)

# train_detests_df[campo_texto] = train_detests_df[campo_texto].apply(remove_emojis)
# valid_detests_df[campo_texto] = valid_detests_df[campo_texto].apply(remove_emojis)

# train_detests_df

In [None]:
#@title DETESTS
ruta = '/content/drive/MyDrive/TFG/Training Data/Detests/Preprocesados/'

# NOTE(review): the train file name says "preprocesados" while the valid one says
# "preprocesado", and every cleaning step in the cell above is commented out, so the
# frames are written as they are — confirm both are intended.
ficheros_preprocesados = (
    (train_detests_df, "train_detests_df_preprocesados_expand_hashtags.csv"),
    (valid_detests_df, "valid_detests_df_preprocesado_expand_hashtags.csv"),
)
for frame_limpio, nombre_fichero in ficheros_preprocesados:
    frame_limpio.to_csv(ruta + nombre_fichero, index=False)

In [None]:
# DETESTS, basic preprocessing: read the previously saved train/valid CSV files.
train_detests_df_path = '/content/drive/MyDrive/TFG/Training Data/Detests/Preprocesados/train_detests_df_preprocesado.csv'
valid_detests_df_path = '/content/drive/MyDrive/TFG/Training Data/Detests/Preprocesados/valid_detests_df_preprocesado.csv'

opciones_csv = {'encoding': 'UTF-8', 'sep': ','}
train_detests_df = pd.read_csv(train_detests_df_path, **opciones_csv)
valid_detests_df = pd.read_csv(valid_detests_df_path, **opciones_csv)

# Preparación de los conjuntos para el entrenamiento

In [None]:
#@title Se convierten los dataframes en objetos datasets para que los acepten los transformers DETESTS
# Wrap each dataframe in a datasets.Dataset, in train / valid / test order.
train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1 = (
    Dataset.from_pandas(frame)
    for frame in (train_detests_df_a1, valid_detests_df_a1, test_detests_df_a1)
)

print(train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1)

Dataset({
    features: ['id', 'text', 'label'],
    num_rows: 3940
}) Dataset({
    features: ['id', 'text', 'label'],
    num_rows: 1126
}) Dataset({
    features: ['id', 'text', 'label'],
    num_rows: 563
})


In [None]:
# Count how many examples of each class every split contains.
datasets = [train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1]
for idx, dataset in enumerate(datasets, start=1):
    clases_contadas = {}
    for example in dataset:
        clase = example['label']
        # dict.get supplies 0 the first time a class is seen.
        clases_contadas[clase] = clases_contadas.get(clase, 0) + 1

    print(f"Dataset {idx}: {clases_contadas}")

Dataset 1: {0: 2779, 1: 1161}
Dataset 2: {1: 329, 0: 797}
Dataset 3: {1: 161, 0: 402}


In [None]:
# Dataset objects can also be displayed in pandas format.
# The format change stays in effect on the dataset until reset_format() is called.
train_detests_dataset_a1.set_format("pandas")
train_detests_dataset_a1[:]

Unnamed: 0,id,text,label
0,d_244_03,Salir a la calle no sirve de nada.,0
1,d_1648_03,Siento ser tan crudo pero es lo que observo.,0
2,d_684_01,600000 ilegales y no pasa nada.,1
3,d_848_01,"ese tío no es un delicuente, en su cultura may...",1
4,d_539_02,Y todavía hay gente que vota a estos individuo...,0
...,...,...,...
3935,d_175_11,Hay que hacerlo ya!!... hay que exigirlo ya!!.,1
3936,d_38_01,"¿Como que es su único objetivo, o crees que re...",0
3937,d_424_01,En un par de años (si es que llega) en la call...,1
3938,d_1372_01,Toda la razón.,0


In [None]:
# Nothing is deleted here on purpose. This cell used to `del` the three *_dataset_a1
# objects, but the cells below still call reset_format(), map(set_labels) and the
# tokenization on them, so running the notebook top to bottom failed with a NameError.
# The *_df_a1 dataframes are also printed again in the test-evaluation section.

In [None]:
# Assign a numeric label derived from the main label column.
def set_labels(records):
  """Return ``{'labels': 0}`` when the record's label column equals 0, else ``{'labels': 1}``."""
  es_clase_cero = records[nombre_etiqueta] == 0
  return {'labels': 0 if es_clase_cero else 1}

In [None]:
# Reset the output format of the DETESTS datasets (train was switched to pandas above).
for dataset_a1 in (train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1):
    dataset_a1.reset_format()

In [None]:
# Add the 'labels' column to the DETESTS train and validation datasets
# (the test dataset is mapped later, in the test-evaluation section).
train_detests_dataset_a1, valid_detests_dataset_a1 = (
    particion.map(set_labels)
    for particion in (train_detests_dataset_a1, valid_detests_dataset_a1)
)

print(train_detests_dataset_a1, valid_detests_dataset_a1)

Map:   0%|          | 0/3940 [00:00<?, ? examples/s]

Map:   0%|          | 0/1126 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'text', 'label', 'labels'],
    num_rows: 3940
}) Dataset({
    features: ['id', 'text', 'label', 'labels'],
    num_rows: 1126
})


In [None]:
# Reset the output format again before tokenizing.
for dataset_a1 in (train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1):
    dataset_a1.reset_format()

# Proceso de clasificación

## Tokenización

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Vocabulary size of the tokenizer
tokenizer.vocab_size

31002

In [None]:
#@title Esto es para ver cómo queda el texto una vez tokenizado DETESTS
# Preview of one tokenized training text. tokenizer.tokenize() only splits the text
# into tokens; calling tokenizer() is what produces 'input_ids' and 'attention_mask'.

texto_ejemplo = train_detests_dataset_a1[1][campo_texto]
print(texto_ejemplo)
tokenizado = tokenizer.tokenize(texto_ejemplo)
print(tokenizado)

Siento ser tan crudo pero es lo que observo.
['siento', 'ser', 'tan', 'crudo', 'pero', 'es', 'lo', 'que', 'observ', '##o', '.']


In [None]:
# Tokenization function applied to a dataset through map()
def tokenize_data(examples):
  """Tokenize the text column of ``examples``.

  Texts are truncated to ``MAX_LENGTH`` tokens and padded (``padding=True``).
  Returns whatever the tokenizer call returns.
  """
  textos = examples[campo_texto]
  return tokenizer(textos, padding=True, truncation=True, max_length=MAX_LENGTH)

In [None]:
#@title Celda para para construir los ficheros codificados (encoded) DETESTS
# Train: every column except "labels" is dropped once the text has been tokenized.
columns_train_detests = train_detests_dataset_a1.column_names
columns_train_detests.remove("labels")
encoded_train_dataset_detests = train_detests_dataset_a1.map(
    tokenize_data, batched=True, remove_columns=columns_train_detests)

# Validation: same procedure.
columns_valid_detests = valid_detests_dataset_a1.column_names
columns_valid_detests.remove("labels")
encoded_valid_dataset_detests = valid_detests_dataset_a1.map(
    tokenize_data, batched=True, remove_columns=columns_valid_detests)

encoded_train_dataset_detests

Map:   0%|          | 0/3940 [00:00<?, ? examples/s]

Map:   0%|          | 0/1126 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 3940
})

In [None]:
# Number of input ids stored for training example 3 (after truncation and padding).
len(encoded_train_dataset_detests[3]['input_ids'])

122

In [None]:
# Input ids of training example 4.
encoded_train_dataset_detests[4]['input_ids']


In [None]:
# Full encoded record (labels, input_ids, attention_mask) of training example 4.
encoded_train_dataset_detests[4]

## Carga del modelo

In [None]:
# Number of output classes of the classification head.
n_labels = 2
def model_init():
    """Return a new sequence-classification model loaded from ``model_checkpoint``.

    The model is created with ``n_labels`` output labels. It is passed to the Trainer
    as ``model_init`` so the Trainer builds the model itself.
    """
    modelo = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=n_labels,
        # use_auth_token = '<your own Hugging Face token>'
    )
    return modelo

## Definición de la métricas

In [None]:
# Metrics computed at every evaluation during training

def compute_metrics(eval_pred):
  """Compute the evaluation metrics reported by the Trainer.

  Args:
      eval_pred: Object with ``label_ids`` (true labels) and ``predictions``
          (the model outputs, indexed below as ``[:, 0]`` and ``[:, 1]`` for the
          two classes).

  Returns:
      Dict with accuracy, macro precision/recall/F1, per-class F1
      (``f1_minoritaria`` for label 1, ``f1_mayoritaria`` for label 0),
      ROC AUC (``AUC``) and average precision (``PREC_REC``).
  """
  labels = eval_pred.label_ids
  logits = eval_pred.predictions
  preds = logits.argmax(-1)

  # Threshold-based metrics use the hard predictions.
  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  f1_minoritaria = f1_score(labels, preds, pos_label=1)   # minority class (label = 1)
  f1_mayoritaria = f1_score(labels, preds, pos_label=0)   # majority class (label = 0)
  acc = sk.metrics.accuracy_score(labels, preds)

  # Ranking metrics need a continuous score. They were previously fed the argmax
  # labels, which reduces the ROC curve to a single operating point (AUC came out
  # equal to the macro recall). For two classes, logit_1 - logit_0 is the log-odds
  # of class 1, so it orders the examples exactly like the class-1 probability.
  pos_scores = logits[:, 1] - logits[:, 0]
  AUC = roc_auc_score(labels, pos_scores)
  PREC_REC = average_precision_score(labels, pos_scores)

  # The debugging print() of labels/preds was removed; it flooded the training log.
  return {
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall,
      'AUC': AUC,
      'f1_minoritaria': f1_minoritaria,
      'f1_mayoritaria': f1_mayoritaria,
      'PREC_REC': PREC_REC
  }

## Fine-tuning

In [None]:
# Keep whatever follows the local models-folder prefix; when the checkpoint does not
# contain that prefix (e.g. a Hub id), the checkpoint string is kept unchanged.
prefijo_modelos = "/content/drive/MyDrive/TFG/Training Data/Modelos/"
model_name = model_checkpoint.rpartition(prefijo_modelos)[2]
model_name

'PlanTL-GOB-ES/roberta-large-bne'

In [None]:
#@title DETESTS
def maximum(a, b):
    """Return the larger of ``a`` and ``b`` (``a`` when they compare equal)."""
    return a if a >= b else b

# Parameters for the Trainer
num_train_samples = int(len(encoded_train_dataset_detests))
num_evaluation_samples = int(len(encoded_valid_dataset_detests))

# Logging interval, never below 1 (some models reject logging_steps = 0). maximum() is
# used because calling max raised an error here. The value is currently not passed to
# TrainingArguments.
value = len(encoded_train_dataset_detests) // (2 * BATCH_SIZE * NUM_TRAIN_EPOCHS)
logging_steps = maximum(1, value)

# Optimizer names to choose from; index 1 ("adamw_torch") is the one in use.
optim = ["adamw_hf", "adamw_torch", "adamw_apex_fused","adafactor","adamw_torch_xla"]

# NOTE(review): newer transformers releases may expect `eval_strategy` instead of
# `evaluation_strategy` — confirm against the installed version.
opciones_entrenamiento = dict(
    output_dir='results',
    # optimization
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    optim=optim[1],
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluate and checkpoint once per epoch, keep at most 3 checkpoints
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    # restore the checkpoint with the best macro F1 when training ends
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False,
)
training_args = TrainingArguments(**opciones_entrenamiento)

In [None]:
# Create the Trainer. model_init (rather than a model instance) lets the Trainer
# build the model itself; training stops early after 3 evaluations without
# improvement of metric_for_best_model.
trainer = Trainer(
    model_init = model_init,
    #model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    train_dataset = encoded_train_dataset_detests,
    eval_dataset = encoded_valid_dataset_detests,
    tokenizer = tokenizer
)

# The former `dataloader_config = DataLoaderConfiguration(...)` line was removed:
# DataLoaderConfiguration is never imported in this notebook (NameError on a fresh
# kernel) and the resulting object was not passed to anything.


config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run the fine-tuning
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc,F1 Minoritaria,F1 Mayoritaria,Prec Rec
1,No log,0.408161,0.831261,0.79781,0.795328,0.800487,0.800487,0.715569,0.880051,0.592083
2,No log,0.51279,0.83659,0.793151,0.810365,0.781048,0.781048,0.698361,0.887942,0.593766
3,0.359000,0.706523,0.837478,0.796961,0.808645,0.787922,0.787922,0.70626,0.887661,0.597185
4,0.359000,0.865106,0.825044,0.787557,0.788884,0.786279,0.786279,0.698315,0.876798,0.577371


[1 0 0 ... 0 0 0]
---
[0 0 1 ... 1 0 0]
[1 0 0 ... 0 0 0]
---
[1 0 1 ... 1 0 0]
[1 0 0 ... 0 0 0]
---
[1 0 1 ... 1 0 0]
[1 0 0 ... 0 0 0]
---
[1 0 0 ... 1 0 0]


TrainOutput(global_step=988, training_loss=0.21826933656144237, metrics={'train_runtime': 431.1906, 'train_samples_per_second': 91.375, 'train_steps_per_second': 5.728, 'total_flos': 1036657558118400.0, 'train_loss': 0.21826933656144237, 'epoch': 4.0})

# EVALUACIÓN

# EVALUACIÓN DETESTS

In [None]:
# Evaluate on the validation set. The result is stored in eval_results rather than
# `eval`, so the built-in eval() is not shadowed.
eval_results = trainer.evaluate()
# Turn the metrics dict into a two-column dataframe for display.
dfeval = pd.DataFrame(list(eval_results.items()), columns = ['Nombre','Valor'])
dfeval

[1 0 0 ... 0 0 0]
---
[0 0 1 ... 1 0 0]


Unnamed: 0,Nombre,Valor
0,eval_loss,0.408161
1,eval_accuracy,0.831261
2,eval_f1,0.79781
3,eval_precision,0.795328
4,eval_recall,0.800487
5,eval_AUC,0.800487
6,eval_f1_minoritaria,0.715569
7,eval_f1_mayoritaria,0.880051
8,eval_PREC_REC,0.592083
9,eval_runtime,8.319


In [None]:
# Save the trained model
# trainer.save_model('./models/bert_hp')
# The folder name is derived from model_checkpoint ('/' replaced by '-', plus the
# "-detests-a1" suffix) instead of being hard-coded: the previous literal named
# roberta-base-bne even when a different checkpoint had been trained.
nombre_modelo_guardado = model_checkpoint.replace('/', '-') + '-detests-a1'
trainer.save_model('/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Modelos/' + nombre_modelo_guardado)

## Evaluar el modelo con el test

In [None]:
# Print the three DETESTS dataframes (train, valid, test) as plain text.
print(train_detests_df_a1, valid_detests_df_a1, test_detests_df_a1)

             id                                               text  label
0      d_244_03                 Salir a la calle no sirve de nada.      0
1     d_1648_03       Siento ser tan crudo pero es lo que observo.      0
2      d_684_01                    600000 ilegales y no pasa nada.      1
3      d_848_01  ese tío no es un delicuente, en su cultura may...      1
4      d_539_02  Y todavía hay gente que vota a estos individuo...      0
...         ...                                                ...    ...
3935   d_175_11     Hay que hacerlo ya!!... hay que exigirlo ya!!.      1
3936    d_38_01  ¿Como que es su único objetivo, o crees que re...      0
3937   d_424_01  En un par de años (si es que llega) en la call...      1
3938  d_1372_01                                     Toda la razón.      0
3939  d_1635_04  De todas formas estos que defienden incluso qu...      1

[3940 rows x 3 columns]              id                                               text  label
0     d_1323_

In [None]:
# Map the labels of the test set
test_detests_dataset_a1 = test_detests_dataset_a1.map(set_labels)  # set_labels was already defined in the training section
print(train_detests_dataset_a1, valid_detests_dataset_a1, test_detests_dataset_a1)

Map:   0%|          | 0/563 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'text', 'label', 'labels'],
    num_rows: 3940
}) Dataset({
    features: ['id', 'text', 'label', 'labels'],
    num_rows: 1126
}) Dataset({
    features: ['id', 'text', 'label', 'labels'],
    num_rows: 563
})


In [None]:
# Look at record 5 of the test and validation datasets. Only the value of the last
# expression (the validation record) is displayed by the notebook.
test_detests_dataset_a1[5]
valid_detests_dataset_a1[5]

{'id': 'd_888_02',
 'text': 'Lo máximo es que te echen de la casa Real.',
 'label': 0,
 'labels': 0}

## Hacer las predicciones

In [None]:
# Because the Trainer was built with model_init, the trained model has to be loaded from disk
# model_path = './models/bert_hp'
# NOTE(review): this path points to a Stereohoax model, while the cells below run the
# predictions on the Detests test set and the training section saves under
# ".../Detests/Baseline/Modelos/" — confirm this is the intended model.
model_path = '/content/drive/MyDrive/TFG/Task 1/Stereohoax/Baseline/Modelos/dccuchile-bert-base-spanish-wwm-uncased-stereohoax-a1'

model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Prediction through a pipeline
# device=0 runs the predictions on the first GPU; device=-1 falls back to the CPU
# when no GPU is available (a hard-coded 0 fails on a CPU-only runtime).
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer,
                device=0 if torch.cuda.is_available() else -1)

In [None]:
# Prediction function applied to each record through map()
def get_predictions(records):
  """Classify the text of one record with ``pipe``.

  Returns a dict with ``pred_label`` (0 when the pipeline answers 'LABEL_0',
  1 for any other label) and ``score_label`` (the score the pipeline reports
  for that predicted label).
  """
  mejor = pipe(records[campo_texto], truncation=True)[0]
  pred_label = 0 if mejor['label'] == 'LABEL_0' else 1
  return {'pred_label': pred_label, 'score_label': mejor['score']}

In [None]:
# Add 'pred_label' and 'score_label' to every test record, then show the first one.
test_dataset_predicted_detests_a1 = test_detests_dataset_a1.map(get_predictions)
#valid_dataset_predicted = valid_dataset.map(get_predictions)
test_dataset_predicted_detests_a1[0]
#valid_dataset_predicted[0]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]



{'id': 'd_2001_01',
 'text': 'Me encanta eso de defender siempre al “inmigrante” solo por el hecho de serlo.',
 'label': 1,
 'labels': 1,
 'pred_label': 0,
 'score_label': 0.9737274050712585}

In [None]:
# Switch the predicted dataset to pandas format and pull it out as a dataframe.
test_dataset_predicted_detests_a1.set_format('pandas')
df_test_detests = test_dataset_predicted_detests_a1[:]
df_test_detests

#valid_dataset_predicted.set_format('pandas')
#df_valid = valid_dataset_predicted[:]

Unnamed: 0,id,text,label,labels,pred_label,score_label
0,d_2001_01,Me encanta eso de defender siempre al “inmigra...,1,1,0,0.973727
1,d_849_01,servicios del menor puede hacerse cargo de un ...,0,0,0,0.856839
2,d_2129_01,De nuevo miles de madres marroquíes llorando p...,0,0,1,0.861448
3,d_836_03,"Por supuesto, el año que viene regularizamos a...",0,0,1,0.829816
4,d_2216_01,Y lo que haga falta y más para eso está la car...,1,1,1,0.663168
...,...,...,...,...,...,...
558,d_2493_03,"Los españoles, con sus votos, se han dejado ll...",1,1,1,0.724508
559,d_2244_01,¿Pero que alma se te va a caer a tí?,0,0,0,0.991724
560,d_1799_01,"Vd y el gob. no entienden nada, lea mis com.",0,0,0,0.989604
561,d_1147_01,¡¡¡Qué asco de gentuza!!!,1,1,0,0.990481


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Añadimos la función de evaluación

def compute_metrics(pred):
  """Compute classification metrics for a labelled evaluation set.

  Args:
    pred: sequence with the gold labels at index 0 and the predicted class ids
      at index 1. An optional third element may hold continuous scores for the
      positive class (label 1); when present it is used for the ranking
      metrics ('AUC' and 'PREC_REC') instead of the hard predictions.

  Returns:
    dict with the keys 'accuracy', 'f1', 'precision', 'recall' (the last three
    macro-averaged), 'AUC' (ROC AUC), 'f1_minoritaria' (F1 with pos_label=1),
    'f1_mayoritaria' (F1 with pos_label=0) and 'PREC_REC' (average precision).
  """
  labels = pred[0]
  preds = pred[1]
  # ROC AUC and average precision rank examples by score. With hard 0/1
  # predictions the curve has a single operating point, so prefer real scores
  # when the caller supplies them; otherwise keep the previous behaviour and
  # fall back to the hard predictions.
  scores = pred[2] if len(pred) > 2 else preds

  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  f1_minoritaria = f1_score(labels, preds, pos_label=1)
  f1_mayoritaria = f1_score(labels, preds, pos_label=0)
  acc = sk.metrics.accuracy_score(labels, preds)
  AUC = roc_auc_score(labels, scores)
  PREC_REC = average_precision_score(labels, scores)
  return { 'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'AUC': AUC,
           'f1_minoritaria': f1_minoritaria, 'f1_mayoritaria': f1_mayoritaria, 'PREC_REC': PREC_REC }

In [None]:
### ONLY WHEN EVALUATING A LABELLED TEST SET
# Pull the gold labels and the predicted class ids out of the DataFrame as plain
# Python lists, then pack them in the [labels, predictions] order that
# compute_metrics reads.
test_labels_detests, test_predictions_detests = (
    df_test_detests[column].values.tolist() for column in ('labels', 'pred_label')
)
eval_pred_test_detests = [test_labels_detests, test_predictions_detests]


In [None]:
### ONLY WHEN EVALUATING A LABELLED TEST SET
# Results report: one (Name, Value) row per metric returned by compute_metrics.
p_test_detests = compute_metrics(eval_pred_test_detests)
dftest_detests = pd.DataFrame(list(p_test_detests.items()), columns=['Name', 'Value'])


dftest_detests

Unnamed: 0,Name,Value
0,accuracy,0.811723
1,f1,0.766846
2,precision,0.770082
3,recall,0.763898
4,AUC,0.763898
5,f1_minoritaria,0.664557
6,f1_mayoritaria,0.869136
7,PREC_REC,0.541262


In [None]:
### ONLY WHEN EVALUATING A LABELLED TEST SET
# NOTE(review): both metrics are computed from the hard 0/1 predictions, not
# from continuous scores.
print(
    f'AUC del test: {roc_auc_score(test_labels_detests, test_predictions_detests)}',
    '*********************************',
    f'PREC_REC del test: {average_precision_score(test_labels_detests, test_predictions_detests)}',
    sep='\n',
)
# print(f'AUC del valid: {roc_auc_score(valid_labels, valid_predictions)}')
# print(f'PREC_REC del valid: {average_precision_score(valid_labels, valid_predictions)}')

AUC del test: 0.7638979017953711
*********************************
PREC_REC del test: 0.5412623717362656


In [None]:
### ONLY WHEN EVALUATING A LABELLED TEST SET
### Results for the test set
# Echo the run configuration so the report can be traced back to its settings.
print("epoch ", NUM_TRAIN_EPOCHS)
print("batch size:", BATCH_SIZE)
print("max_len :", MAX_LENGTH)

# Per-class report, confusion matrix and the two summary metrics, one per line.
print(
    classification_report(test_labels_detests, test_predictions_detests),
    'Matriz de confusión',
    confusion_matrix(test_labels_detests, test_predictions_detests),
    f'AUC: {roc_auc_score(test_labels_detests, test_predictions_detests)}',
    f'PREC_REC: {average_precision_score(test_labels_detests, test_predictions_detests)}',
    sep='\n',
)

epoch  10
batch size: 16
max_len : 128
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       402
           1       0.68      0.65      0.66       161

    accuracy                           0.81       563
   macro avg       0.77      0.76      0.77       563
weighted avg       0.81      0.81      0.81       563

Matriz de confusión
[[352  50]
 [ 56 105]]
AUC: 0.7638979017953711
PREC_REC: 0.5412623717362656


In [None]:
# ### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# ### Resultados para el valid
# print("epoch ", NUM_TRAIN_EPOCHS)
# print("batch size:", BATCH_SIZE)
# print("max_len :", MAX_LENGTH)

# print(classification_report(valid_labels, valid_predictions))

# print('Matriz de confusión')
# print(confusion_matrix(valid_labels, valid_predictions))
# print(f'AUC: {roc_auc_score(valid_labels, valid_predictions)}')
# print(f'PREC_REC: {average_precision_score(valid_labels, valid_predictions)}')

In [None]:
# Build the output file required by each competition

# predicciones = df.drop(['id','reply_to','sentence','stereotype','labels','score_label'], axis=1)
# predicciones = df.drop([campo_texto,'label_sexist','label_vector','label_category','__index_level_0__','labels','score_label'], axis=1)
# Drop the text and score columns and expose the prediction as 'label_pred'.
predicciones = (
    df_test_detests
    .drop(columns=[campo_texto, 'score_label'])
    .rename(columns={'pred_label': 'label_pred'})
)
predicciones

Unnamed: 0,id,label,labels,label_pred
0,d_2001_01,1,1,0
1,d_849_01,0,0,0
2,d_2129_01,0,0,1
3,d_836_03,0,0,1
4,d_2216_01,1,1,1
...,...,...,...,...
558,d_2493_03,1,1,1
559,d_2244_01,0,0,0
560,d_1799_01,0,0,0
561,d_1147_01,1,1,0


In [None]:
# Add a human-readable class name next to each predicted class id.
predicciones['pred_stereotype'] = predicciones['label_pred'].map(
    {0: 'not stereotype', 1: 'stereotype'},
    na_action=None,
)
predicciones

Unnamed: 0,id,label,labels,label_pred,pred_stereotype
0,d_2001_01,1,1,0,not stereotype
1,d_849_01,0,0,0,not stereotype
2,d_2129_01,0,0,1,stereotype
3,d_836_03,0,0,1,stereotype
4,d_2216_01,1,1,1,stereotype
...,...,...,...,...,...
558,d_2493_03,1,1,1,stereotype
559,d_2244_01,0,0,0,not stereotype
560,d_1799_01,0,0,0,not stereotype
561,d_1147_01,1,1,0,not stereotype


In [None]:
# Save the predictions file.
# The file name is derived from model_checkpoint ('/' replaced by '-') instead of
# being hard-coded, so predictions from one checkpoint are not written under
# another model's name when the configuration cell is changed.
# NOTE(review): assumes model_checkpoint still names the model behind `pipe` at
# this point in the notebook — confirm.
directorio_pred = '/content/drive/MyDrive/TFG/Task 1/Detests/Baseline/Predicciones'
os.makedirs(directorio_pred, exist_ok=True)  # to_csv does not create missing folders
nombre_modelo = model_checkpoint.replace('/', '-')
fichero_pred = os.path.join(directorio_pred, f'{nombre_modelo}-detests-a1.csv')
predicciones.to_csv(fichero_pred, index=False, encoding='utf-8',header=True, sep=',')