<a href="https://colab.research.google.com/github/Grimalo/cognitive_distortion/blob/main/Entrenamiento_multilabel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing

In [None]:
!pip install pytorch_lightning torchmetrics
!pip install torch
!pip install transformers[torch]
!pip install accelerate -U

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.3.0-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.2/812.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->pytorch_lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->pytorch_lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0.0->pytorch_lightning)
  Using ca

# Mounting

In [None]:
import os
import pandas as pd
from google.colab import drive
# Mount Google Drive, switch to the project folder so that the relative paths
# used by the notebook resolve, then load the dataset and print its summary.
DRIVE_MOUNT_POINT = "/content/drive"
PROJECT_DIR = "/content/drive/MyDrive/DPB2_Claudia Camacho"
DATASET_PATH = 'Datasets/dataset.xlsx'

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(PROJECT_DIR)
data = pd.read_excel(DATASET_PATH)
data.info()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Dataset     2772 non-null   object
 1   Texto       2772 non-null   object
 2   Distorsión  2772 non-null   object
dtypes: object(3)
memory usage: 65.1+ KB


# Data Preparation

In [None]:
# Disabled cell (everything below is commented out, kept for reference only).
# It undersamples the 'No distorsion' class by dropping 650 randomly chosen
# rows and then rebuilds `texts` / `labels` from the reduced frame.
# NOTE(review): if this is ever re-enabled, `df` is only assigned inside the
# `if num_to_remove > 0` branch but is used unconditionally afterwards.

# def convert_labels(labels):
#     return [label.split(',') if ',' in label else [label] for label in labels]

# data['labels_str'] = data['Distorsión'].apply(lambda x: ''.join(x))

# # Select the rows labelled 'No distorsion'
# no_distorsion_rows = data[data['labels_str'] == 'No distorsion']

# # Number of rows to drop in order to balance the classes
# num_to_remove = 650

# # If the number of rows to drop is positive, drop that many random rows
# if num_to_remove > 0:
#     rows_to_remove = no_distorsion_rows.sample(n=num_to_remove, random_state=42)
#     df = data.drop(rows_to_remove.index)

# # Drop the auxiliary 'labels_str' column
# df = df.drop(columns=['labels_str'])
# df.reset_index(drop=True, inplace=True)
# # Refresh the texts and labels variables with the balanced data
# texts = df['Texto'].tolist()
# labels = df['Distorsión'].tolist()
# labels = convert_labels(labels)
# print(f"Longitud de data en texts: {len(texts)}")
# print(f"Longitud de data en labels: {len(labels)}")

In [None]:
# Encode every distinct distortion name as a single character starting at 'a'
# (code point 97), assigned in the order returned by `unique()`. The encoded
# column is stored next to the original one in `data`.
texts = data['Texto'].tolist()
unique_strings = data['Distorsión'].unique()
mapping = dict(zip(unique_strings, map(chr, range(97, 97 + len(unique_strings)))))
data['Distorsión_letra'] = data['Distorsión'].map(mapping)
labels = data['Distorsión_letra'].tolist()

# Sanity check: both lists must have one entry per dataset row.
print(f"Longitud de data en texts: {len(texts)}")
print(f"Longitud de data en labels: {len(labels)}")

Longitud de data en texts: 2772
Longitud de data en labels: 2772


In [None]:
# Class distribution of the letter-encoded labels: absolute count per label
# plus its share of the whole dataset in percent.
lab = data['Distorsión_letra']
frecuencia = lab.value_counts().rename_axis('Distorsión').reset_index(name='Frecuencia')
frecuencia["Frecuencia (%)"] = (
    frecuencia["Frecuencia"].div(frecuencia["Frecuencia"].sum()).mul(100)
)
labels_model = frecuencia['Distorsión']
frecuencia

Unnamed: 0,Distorsión,Frecuencia,Frecuencia (%)
0,a,231,8.333333
1,b,231,8.333333
2,c,231,8.333333
3,d,231,8.333333
4,e,231,8.333333
5,f,231,8.333333
6,g,231,8.333333
7,h,231,8.333333
8,i,231,8.333333
9,j,231,8.333333


# Training model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def tokenize_function(texts, max_length=512):
    """Tokenize a batch of texts with the notebook-level `tokenizer`.

    Args:
        texts: the texts to encode; passed unchanged to `tokenizer`.
        max_length: length every sequence is padded and truncated to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The object returned by `tokenizer(...)` for the batch.
    """
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

# 70 / 15 / 15 split: 30 % is held out first, then halved into validation and
# test. Both calls use a fixed random_state so the split is repeatable.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

# MultiLabelBinarizer expects one *collection* of labels per sample. Passing
# the bare label strings only worked because it iterates every string
# character by character and each encoded label is exactly one character.
# Wrapping each label in a list makes the intent explicit and keeps the
# binarizer correct if labels ever become longer than one character.
mlb = MultiLabelBinarizer()
train_labels_bin = mlb.fit_transform([[label] for label in train_labels])
val_labels_bin = mlb.transform([[label] for label in val_labels])
test_labels_bin = mlb.transform([[label] for label in test_labels])

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is a sequence with one label vector per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a float 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are cast to float after conversion to a tensor.
        sample['labels'] = torch.tensor(self.labels[idx]).float()
        return sample

# Wrap each split's encodings and binarized labels in a CustomDataset.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels_bin),
        (val_encodings, val_labels_bin),
        (test_encodings, test_labels_bin),
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [None]:
class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose loss is `BCEWithLogitsLoss`.

    After the parent constructor has run, `self.dropout` and `self.classifier`
    are (re)assigned to a dropout layer and a linear layer sized from the
    config.
    """

    def __init__(self, config, dropout_prob=0.5):
        """Build the model.

        Args:
            config: model configuration; `hidden_size` and `num_labels` set
                the input and output sizes of the linear head.
            dropout_prob: dropout probability applied before the head.
        """
        super().__init__(config)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and the head; compute the loss when labels are given.

        Returns:
            When `return_dict` is truthy, a plain dict with the keys 'loss',
            'logits', 'hidden_states' and 'attentions'. Otherwise (including
            the default `return_dict=None`) a tuple made of the logits followed
            by the encoder outputs from index 2 on, prefixed with the loss when
            `labels` was provided.

        NOTE(review): because the default `None` is falsy, callers that do not
        pass `return_dict` always get the tuple form — confirm this is intended.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the encoder output is fed through dropout and the head.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        loss = None if labels is None else torch.nn.BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return {
                'loss': loss,
                'logits': logits,
                'hidden_states': encoder_outputs.hidden_states,
                'attentions': encoder_outputs.attentions,
            }

        tuple_output = (logits,) + encoder_outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output

# One output unit per class known to the label binarizer.
config = BertConfig.from_pretrained('bert-large-uncased', num_labels=len(mlb.classes_))
model = BertForMultilabelSequenceClassification.from_pretrained('bert-large-uncased', config=config)

# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword
# (available from transformers 4.41 onwards).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=7,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune, then report the evaluation metrics on the held-out test split.
trainer.train()
results = trainer.evaluate(test_dataset)
print(results)



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForMultilabelSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3087,0.293181
2,0.2778,0.273022
3,0.2625,0.250402
4,0.2287,0.219247
5,0.1934,0.196378
6,0.1752,0.196993
7,0.1248,0.191464


{'eval_loss': 0.1945771872997284, 'eval_runtime': 39.3295, 'eval_samples_per_second': 10.577, 'eval_steps_per_second': 0.661, 'epoch': 7.0}


In [None]:
# Save the model and the tokenizer into the same folder.
SAVE_DIR = 'Models and Tokenizers/saved_modelf'
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('Models and Tokenizers/saved_modelq/tokenizer_config.json',
 'Models and Tokenizers/saved_modelq/special_tokens_map.json',
 'Models and Tokenizers/saved_modelq/vocab.txt',
 'Models and Tokenizers/saved_modelq/added_tokens.json')

## Reporte Test

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Model outputs for the test split and the matching binarized ground truth.
predictions = trainer.predict(test_dataset)
y_pred, y_true = predictions.predictions, test_labels_bin

def normalize_rows(matrix):
    """Min-max scale every row of a 2-D array-like to the range [0, 1].

    Args:
        matrix: 2-D array-like of numbers, one sample per row.

    Returns:
        A NumPy array of the same shape in which each row's minimum maps to 0
        and its maximum to 1. A row whose values are all equal has no range to
        scale by and is returned as all zeros (the unguarded division used to
        produce NaN for such rows).
    """
    matrix = np.array(matrix)
    min_vals = matrix.min(axis=1, keepdims=True)
    ranges = matrix.max(axis=1, keepdims=True) - min_vals
    # Replace zero ranges by 1 so constant rows divide cleanly to 0 instead of NaN.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    return (matrix - min_vals) / safe_ranges

In [None]:
# Rescale both matrices row by row and print them for visual inspection.
y_pred, y_true = normalize_rows(y_pred), normalize_rows(y_true)

print(y_pred)
print(y_true)

[[0.         0.05079487 0.23872066 ... 0.25295484 0.3486851  0.7909318 ]
 [0.35867888 1.         0.47384202 ... 0.36458245 0.06554691 0.03464605]
 [0.44140348 0.16595836 0.827474   ... 1.         0.5443481  0.        ]
 ...
 [0.         0.1270811  0.10926005 ... 0.01950363 0.01988237 0.08169525]
 [0.18620071 0.0381247  0.08240093 ... 0.05670228 0.09370707 1.        ]
 [0.18650596 0.12530571 0.10020442 ... 0.13215548 0.14577924 0.3438756 ]]
[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Reduce every row to the column index of its largest entry.
y_pred_labels = y_pred.argmax(axis=1)
y_true_labels = y_true.argmax(axis=1)

print(y_pred_labels.shape)
print(y_true_labels.shape)

(416,)
(416,)


In [None]:
# Boolean mask that is True where the predicted class index matches the true one.
y = y_pred_labels == y_true_labels
print(y)

# Accuracy is the share of matches, expressed in percent.
predicciones_flat = y.flatten()
accuracy = predicciones_flat.mean() * 100
print(f"El accuracy del modelo es: {accuracy:.2f}%")

[False False False False False False False  True  True False  True False
  True  True False False  True False False  True False False False  True
  True  True  True False  True False  True False  True  True False  True
 False False False False  True  True  True False False  True  True False
 False False  True  True False  True False  True  True False  True  True
  True False  True  True False False  True  True False False False False
 False False False False False  True False  True False False  True  True
 False  True False False  True  True False False False  True  True  True
  True False False False False False False  True False  True  True False
 False  True  True  True  True False False  True False  True  True False
 False  True False  True False  True False False False  True False  True
 False  True False  True False False False False  True  True  True  True
  True  True  True  True False False False False False  True  True  True
 False False False  True  True  True  True  True Fa

In [None]:
# Per-class precision / recall / F1 / support for the test split.
report = classification_report(y_true_labels, y_pred_labels, target_names=mlb.classes_, output_dict=True)
df_report = pd.DataFrame(report).transpose()

print("Reporte de clasificación original:")

# Translate the letter codes back to the distortion names by inverting
# `mapping` (distortion name -> letter). The previous hand-written dict indexed
# `unique_strings[0..11]` and therefore raised IndexError whenever the dataset
# had fewer than 12 distinct labels. Summary rows such as 'accuracy' are not
# keys of the inverted mapping and keep their names.
letter_to_name = {letter: name for name, letter in mapping.items()}
df_report = df_report.rename(index=letter_to_name)

df_report

Reporte de clasificación original:


Unnamed: 0,precision,recall,f1-score,support
Personalization,0.625,0.465116,0.533333,43.0
Mind reading,0.447368,0.607143,0.515152,28.0
Overgeneralization,0.461538,0.139535,0.214286,43.0
No distorsion,1.0,0.870968,0.931034,31.0
Should statements,0.647059,0.758621,0.698413,29.0
Magnification,0.357143,0.357143,0.357143,28.0
Fortune telling,0.444444,0.512821,0.47619,39.0
Labeling,0.510204,0.625,0.561798,40.0
All or nothing,0.243243,0.272727,0.257143,33.0
Emotional reasoning,0.333333,0.424242,0.373333,33.0


## Reporte Validación

In [None]:
# Model outputs for the validation split, rescaled row by row together with the
# binarized ground truth, then printed for visual inspection.
predictions = trainer.predict(val_dataset)
y_pred = normalize_rows(predictions.predictions)
y_true = normalize_rows(val_labels_bin)

print(y_pred)
print(y_true)

[[0.84497625 1.         0.4796849  ... 0.8445947  0.06448225 0.        ]
 [0.20247428 0.0417389  0.04266556 ... 0.02299024 0.0211235  1.        ]
 [0.6816217  0.5069228  1.         ... 0.46846005 0.667189   0.42657968]
 ...
 [0.         0.09594703 0.08537506 ... 0.10039138 0.002163   0.09915411]
 [0.5324027  0.35436365 0.79647607 ... 0.82658184 0.19595681 0.        ]
 [0.         0.03620381 0.39511767 ... 0.42036593 0.55138016 0.32075354]]
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Reduce every validation row to the column index of its largest entry.
y_pred_labels = y_pred.argmax(axis=1)
y_true_labels = y_true.argmax(axis=1)

print(y_pred_labels.shape)
print(y_true_labels.shape)

(416,)
(416,)


In [None]:
# Boolean mask that is True where the predicted class index matches the true one.
y = y_pred_labels == y_true_labels
print(y)

# Accuracy is the share of matches, expressed in percent.
predicciones_flat = y.flatten()
accuracy = predicciones_flat.mean() * 100
print(f"El accuracy del modelo es: {accuracy:.2f}%")

[ True False False  True False False  True  True False False False False
 False False  True  True  True False  True  True  True False  True  True
  True False False False  True False False  True  True False  True  True
 False False  True  True  True  True False False False False  True  True
  True False  True  True False False  True  True False False  True  True
 False  True False  True False  True  True False False  True  True False
 False False False False False False False  True  True  True False  True
  True  True  True  True False  True  True False False  True False False
  True False  True False False False False False False False  True False
  True  True False False  True  True  True False False False  True  True
 False  True False False  True False False  True  True  True False  True
  True False False False  True  True False  True  True False  True False
 False False False False  True False False False False False  True  True
  True False False False False  True False False Fa

In [None]:
# Per-class precision / recall / F1 / support for the validation split.
report = classification_report(y_true_labels, y_pred_labels, target_names=mlb.classes_, output_dict=True)
df_report = pd.DataFrame(report).transpose()

print("Reporte de clasificación original:")

# Translate the letter codes back to the distortion names by inverting
# `mapping` (distortion name -> letter). The previous hand-written dict indexed
# `unique_strings[0..11]` and therefore raised IndexError whenever the dataset
# had fewer than 12 distinct labels. Summary rows such as 'accuracy' are not
# keys of the inverted mapping and keep their names.
letter_to_name = {letter: name for name, letter in mapping.items()}
df_report = df_report.rename(index=letter_to_name)

df_report

Reporte de clasificación original:


Unnamed: 0,precision,recall,f1-score,support
Personalization,0.37037,0.30303,0.333333,33.0
Mind reading,0.526316,0.555556,0.540541,36.0
Overgeneralization,0.125,0.068966,0.088889,29.0
No distorsion,0.92,0.958333,0.938776,48.0
Should statements,0.435897,0.586207,0.5,29.0
Magnification,0.352941,0.342857,0.347826,35.0
Fortune telling,0.35,0.451613,0.394366,31.0
Labeling,0.6,0.545455,0.571429,33.0
All or nothing,0.444444,0.444444,0.444444,36.0
Emotional reasoning,0.348837,0.384615,0.365854,39.0


In [None]:
import scipy.io

# Export the three text splits and the binarized validation / test labels to a
# MATLAB .mat file. The payload gets its own name: it used to be assigned to
# `data`, which overwrote the dataset DataFrame loaded earlier in the notebook
# and broke re-running the earlier cells.
# NOTE(review): `train_labels_bin` is not exported although `train_texts` is —
# confirm this is intentional.
MAT_PATH = "datasf.mat"
export_vars = {
    "train_texts": train_texts,
    "val_texts": val_texts,
    "test_texts": test_texts,
    "val_labels_bin": val_labels_bin,
    "test_labels_bin": test_labels_bin
}

scipy.io.savemat(MAT_PATH, export_vars)

# Report the file that was actually written (the old message named "datafs.mat").
print(f"Variables exportadas a {MAT_PATH}")

Variables exportadas a datas.mat
