# Installing

In [1]:
!pip install pytorch_lightning torchmetrics
!pip install torch
!pip install transformers[torch]
!pip install accelerate -U

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.2.5-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.3/802.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch_lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pytorch_lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->pytorch_lightning)
  Using

# Mounting

In [2]:
import os
import pandas as pd
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
os.chdir("/content/drive/MyDrive/DPB2_Claudia Camacho")
data = pd.read_excel('Datasets/dataset_English.xlsx')
data.info()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3316 entries, 0 to 3315
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Dataset     3316 non-null   object
 1   Texto       3316 non-null   object
 2   Distorsión  3316 non-null   object
dtypes: object(3)
memory usage: 77.8+ KB


# Data Stabilize

In [3]:
def convert_labels(labels):
    return [label.split(',') if ',' in label else [label] for label in labels]

data['labels_str'] = data['Distorsión'].apply(lambda x: ''.join(x))

# Filtrar las filas que contienen la etiqueta 'No distorsion'
no_distorsion_rows = data[data['labels_str'] == 'No distorsion']

# Calcular el número de filas a eliminar para balancear
num_to_remove = 650

# Si el número de filas a eliminar es positivo, eliminar filas aleatorias
if num_to_remove > 0:
    rows_to_remove = no_distorsion_rows.sample(n=num_to_remove, random_state=42)
    df = data.drop(rows_to_remove.index)

# Eliminar la columna auxiliar 'labels_str'
df = df.drop(columns=['labels_str'])
df.reset_index(drop=True, inplace=True)
# Actualizar las variables texts y labels con los datos balanceados
texts = df['Texto'].tolist()
labels = df['Distorsión'].tolist()
labels = convert_labels(labels)
print(f"Longitud de data en texts: {len(texts)}")
print(f"Longitud de data en labels: {len(labels)}")

Longitud de data en texts: 2666
Longitud de data en labels: 2666


In [4]:
lab = df['Distorsión'].str.split(',', expand=True).stack()
frecuencia = lab.value_counts().reset_index()
frecuencia.columns = ['Distorsión', 'Frecuencia']
frecuencia["Frecuencia (%)"] = frecuencia["Frecuencia"]/frecuencia["Frecuencia"].sum()*100
frecuencia

Unnamed: 0,Distorsión,Frecuencia,Frecuencia (%)
0,Labeling,467,14.011401
1,Fortune telling,399,11.971197
2,Overgeneralization,371,11.131113
3,Mental filter,347,10.411041
4,Mind reading,334,10.021002
5,Magnification,325,9.750975
6,No distorsion,290,8.70087
7,Personalization,237,7.110711
8,Emotional reasoning,195,5.850585
9,Should statements,193,5.790579


# Training model

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments

# Preprocesamiento y tokenización de los textos
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

# Dividir datos en conjuntos de entrenamiento, validación y prueba
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Tokenizar los textos
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

# Convertir las etiquetas a formato binario para la clasificación multietiqueta
mlb = MultiLabelBinarizer()
train_labels_bin = mlb.fit_transform(train_labels)
val_labels_bin = mlb.transform(val_labels)
test_labels_bin = mlb.transform(test_labels)

# Crear un Dataset personalizado
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels_bin)
val_dataset = CustomDataset(val_encodings, val_labels_bin)
test_dataset = CustomDataset(test_encodings, test_labels_bin)

class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, dropout_prob=0.3):
        super().__init__(config)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return {'loss': loss, 'logits': logits, 'hidden_states': outputs.hidden_states, 'attentions': outputs.attentions}

config = BertConfig.from_pretrained('bert-large-uncased', num_labels=len(mlb.classes_))
model = BertForMultilabelSequenceClassification.from_pretrained('bert-large-uncased', config=config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
results = trainer.evaluate(test_dataset)
print(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForMultilabelSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3675,0.348135
2,0.3178,0.318622
3,0.2842,0.289898
4,0.2567,0.273976
5,0.2089,0.242951


{'eval_loss': 0.23192133009433746, 'eval_runtime': 36.7835, 'eval_samples_per_second': 10.874, 'eval_steps_per_second': 0.68, 'epoch': 5.0}


## Reporte Test

In [6]:
from sklearn.metrics import classification_report
import numpy as np

predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = np.argmax(test_labels_bin, axis=1)
report = classification_report(y_true, y_pred, target_names=mlb.classes_)
print(report)

                     precision    recall  f1-score   support

     All or nothing       0.00      0.00      0.00        29
Emotional reasoning       0.17      0.04      0.06        26
    Fortune telling       0.50      0.56      0.53        52
           Labeling       0.60      0.74      0.66        50
      Magnification       0.47      0.42      0.44        38
      Mental filter       0.42      0.58      0.49        38
       Mind reading       0.53      0.77      0.62        39
      No distorsion       0.89      0.89      0.89        46
 Overgeneralization       0.40      0.41      0.41        41
    Personalization       0.39      0.36      0.37        25
  Should statements       0.40      0.38      0.39        16

           accuracy                           0.52       400
          macro avg       0.43      0.47      0.44       400
       weighted avg       0.47      0.52      0.49       400



## Reporte Validación

In [11]:
predictions = trainer.predict(val_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = np.argmax(val_labels_bin, axis=1)
report = classification_report(y_true, y_pred, target_names=mlb.classes_)
print(report)

                     precision    recall  f1-score   support

     All or nothing       0.00      0.00      0.00        28
Emotional reasoning       0.00      0.00      0.00        26
    Fortune telling       0.54      0.54      0.54        52
           Labeling       0.63      0.65      0.64        72
      Magnification       0.33      0.24      0.28        37
      Mental filter       0.28      0.39      0.32        28
       Mind reading       0.52      0.71      0.60        45
      No distorsion       1.00      0.96      0.98        46
 Overgeneralization       0.32      0.47      0.38        34
    Personalization       0.38      0.53      0.44        19
  Should statements       0.47      0.62      0.53        13

           accuracy                           0.51       400
          macro avg       0.41      0.46      0.43       400
       weighted avg       0.47      0.51      0.48       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Accuracy per label validation

In [22]:
pred_labels = (predictions.predictions > 0).astype(int)
true_labels = predictions.label_ids
accuracy_per_label = (pred_labels == true_labels).mean(axis=0)
print("Accuracy per label:")
for label, accuracy in zip(mlb.classes_, accuracy_per_label):
    print(f"{label}: {accuracy}")

Accuracy per label:
All or nothing: 0.93
Emotional reasoning: 0.9275
Fortune telling: 0.89
Labeling: 0.8775
Magnification: 0.8825
Mental filter: 0.8625
Mind reading: 0.89
No distorsion: 0.9925
Overgeneralization: 0.8775
Personalization: 0.9425
Should statements: 0.955


## Attention mask validation

In [13]:
for i in range(5):  # Imprime las primeras 5 máscaras de atención
    print(f"Máscara de atención para la muestra {i}: {val_encodings['attention_mask'][i]}")

Máscara de atención para la muestra 0: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Save model

In [7]:
# Guardar el modelo y el tokenizador
model.save_pretrained('Models and Tokenizers/saved_model')
tokenizer.save_pretrained('Models and Tokenizers/saved_model')

('Models and Tokenizers/saved_model/tokenizer_config.json',
 'Models and Tokenizers/saved_model/special_tokens_map.json',
 'Models and Tokenizers/saved_model/vocab.txt',
 'Models and Tokenizers/saved_model/added_tokens.json')

## Load model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Cargar el modelo y el tokenizador
tokenizer = BertTokenizer.from_pretrained('Models and Tokenizers/saved_model')
model = BertForSequenceClassification.from_pretrained('Models and Tokenizers/saved_model')