In [1]:
# --- Librerías ---
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from transformers.trainer_utils import IntervalStrategy
import torch
import numpy as np
from imblearn.over_sampling import RandomOverSampler

import sys
import os

notebook_dir = os.path.abspath('.')
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)
from src.utils import compute_metrics
# --- Cargar DF ---
df = pd.read_csv("../data/processed/df_final_2.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# --- Dividir el dataset en conjuntos de entrenamiento, validación y prueba ---
# --- IMPORTANTE: Aseguramos que la columna 'text_cleaned' no tenga NaN y sea tipo string ---
# Esto es crucial antes de la tokenización para evitar el TypeError
df['text_cleaned'] = df['text_cleaned'].fillna('').astype(str)

X = df['text_cleaned']
y = df['binary_label']

# Una buena práctica es dividir primero en entrenamiento+validación y luego el de prueba.
# Luego, el conjunto de entrenamiento+validación se divide nuevamente en entrenamiento y validación.

# Dividir en Entrenamiento+Validación (80%) y Prueba (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Dividir el conjunto Entrenamiento+Validación en Entrenamiento (80% del 80% = 64%) y Validación (20% del 80% = 16%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

In [11]:
# --- Carga y Preparación del Tokenizer de Hugging Face ---
# Vamos a usar un modelo popular y eficiente, como DistilBERT.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
max_length = 256
train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

val_encodings = tokenizer(
    list(X_val),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# --- CONVERTIR ETIQUETAS Y DATOS A PyTorch Dataset ---
# Esta clase es estándar cuando se usa el Trainer.
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Las etiquetas deben ser tensores de PyTorch.
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __getitem__(self, idx):
        # Aseguramos que los valores de encodings sean también tensores de PyTorch
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Crear los objetos Dataset para entrenamiento, validación y prueba
train_dataset = CustomDataset(train_encodings, y_train)
val_dataset = CustomDataset(val_encodings, y_val)
test_dataset = CustomDataset(test_encodings, y_test)

In [13]:
# --- Cargar el modelo pre-entrenado DistilBERT para clasificación ---
num_labels = len(df['binary_label'].unique())
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# Mover el modelo a la GPU si está disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
#Lo usarás para acelerar drásticamente el entrenamiento de tu modelo 
#evitar errores de memoria agotada con datasets y modelos grandes.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:

# --- Configurar los Argumentos de Entrenamiento (Reemplaza model.compile) ---
# Usamos TrainingArguments de Hugging Face
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy=IntervalStrategy.EPOCH,   
    save_strategy=IntervalStrategy.EPOCH,   
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)


# --- Crear una instancia del Trainer (Reemplaza model.fit) ---
# Se crea el Trainer con el modelo, argumentos y datasets
trainer = Trainer(
    model=model,                         # El modelo AutoModelForSequenceClassification
    args=training_args,                  # Los argumentos de entrenamiento definidos arriba
    train_dataset=train_dataset,         # Tus datos de entrenamiento (Dataset de PyTorch)
    eval_dataset=val_dataset,            # Tus datos de validación (Dataset de PyTorch)
    compute_metrics=compute_metrics      
)

# --- Entrenar el modelo ---
# Inicia el entrenamiento llamando a trainer.train()
trainer.train()

# --- Evaluar el modelo con el conjunto de prueba ---
print("\nEvaluando el modelo en el conjunto de prueba (test_dataset)...")
eval_results = trainer.evaluate(test_dataset)

# Los resultados de la evaluación están en un diccionario.
print(f"Resultados de la evaluación en el conjunto de prueba: {eval_results}")

Epoch,Training Loss,Validation Loss,Accuracy,F1 Overall,Recall Overall,Precision Overall,F1 Class 0 No Toxic,Recall Class 0 No Toxic,Precision Class 0 No Toxic,F1 Class 1 Toxic,Recall Class 1 Toxic,Precision Class 1 Toxic,Classification Report
1,0.1807,0.166419,0.946424,0.945354,0.946424,0.945884,0.965735,0.980909,0.951024,0.877235,0.831169,0.928707,"{'0': {'precision': 0.951023751023751, 'recall': 0.9809089373204933, 'f1-score': 0.9657351962741184, 'support': 5919.0}, '1': {'precision': 0.9287066246056782, 'recall': 0.8311688311688312, 'f1-score': 0.8772348033373063, 'support': 1771.0}, 'accuracy': 0.9464239271781535, 'macro avg': {'precision': 0.9398651878147146, 'recall': 0.9060388842446623, 'f1-score': 0.9214849998057124, 'support': 7690.0}, 'weighted avg': {'precision': 0.9458841371243483, 'recall': 0.9464239271781535, 'f1-score': 0.945353636340296, 'support': 7690.0}}"
2,0.1408,0.166744,0.949415,0.948854,0.949415,0.948808,0.967434,0.976178,0.958845,0.886754,0.859966,0.915264,"{'0': {'precision': 0.9588450049784268, 'recall': 0.9761784085149519, 'f1-score': 0.9674340728338217, 'support': 5919.0}, '1': {'precision': 0.9152644230769231, 'recall': 0.859966120835686, 'f1-score': 0.8867540029112082, 'support': 1771.0}, 'accuracy': 0.9494148244473342, 'macro avg': {'precision': 0.937054714027675, 'recall': 0.918072264675319, 'f1-score': 0.927094037872515, 'support': 7690.0}, 'weighted avg': {'precision': 0.9488084366367412, 'recall': 0.9494148244473342, 'f1-score': 0.9488535261715397, 'support': 7690.0}}"
3,0.0966,0.194868,0.947204,0.947183,0.947204,0.947163,0.965715,0.966042,0.965389,0.885246,0.884246,0.886248,"{'0': {'precision': 0.9653891608981935, 'recall': 0.9660415610745058, 'f1-score': 0.9657152508022293, 'support': 5919.0}, '1': {'precision': 0.8862478777589134, 'recall': 0.8842461885940147, 'f1-score': 0.8852459016393442, 'support': 1771.0}, 'accuracy': 0.9472041612483745, 'macro avg': {'precision': 0.9258185193285535, 'recall': 0.9251438748342602, 'f1-score': 0.9254805762207867, 'support': 7690.0}, 'weighted avg': {'precision': 0.947162995431397, 'recall': 0.9472041612483745, 'f1-score': 0.9471832329390993, 'support': 7690.0}}"
4,0.0558,0.245786,0.947854,0.947564,0.947854,0.947406,0.966283,0.970772,0.961835,0.885001,0.871259,0.899184,"{'0': {'precision': 0.9618346166722465, 'recall': 0.9707720898800473, 'f1-score': 0.9662826872950475, 'support': 5919.0}, '1': {'precision': 0.8991841491841492, 'recall': 0.8712591756070017, 'f1-score': 0.885001433897333, 'support': 1771.0}, 'accuracy': 0.9478543563068921, 'macro avg': {'precision': 0.9305093829281978, 'recall': 0.9210156327435245, 'f1-score': 0.9256420605961903, 'support': 7690.0}, 'weighted avg': {'precision': 0.9474062710387717, 'recall': 0.9478543563068921, 'f1-score': 0.947563688625691, 'support': 7690.0}}"
5,0.0467,0.297895,0.946944,0.946975,0.946944,0.947008,0.965517,0.965028,0.966007,0.885006,0.886505,0.883512,"{'0': {'precision': 0.9660071029934043, 'recall': 0.9650278763304613, 'f1-score': 0.9655172413793104, 'support': 5919.0}, '1': {'precision': 0.88351153629713, 'recall': 0.8865047995482778, 'f1-score': 0.8850056369785795, 'support': 1771.0}, 'accuracy': 0.9469440832249675, 'macro avg': {'precision': 0.9247593196452671, 'recall': 0.9257663379393695, 'f1-score': 0.9252614391789449, 'support': 7690.0}, 'weighted avg': {'precision': 0.947008449076746, 'recall': 0.9469440832249675, 'f1-score': 0.9469754921733684, 'support': 7690.0}}"



Evaluando el modelo en el conjunto de prueba (test_dataset)...


Resultados de la evaluación en el conjunto de prueba: {'eval_loss': 0.17746470868587494, 'eval_accuracy': 0.9421326397919376, 'eval_f1_overall': 0.9407651667020986, 'eval_recall_overall': 0.9421326397919376, 'eval_precision_overall': 0.941584774447451, 'eval_f1_class_0_no_toxic': 0.9630858564910826, 'eval_recall_class_0_no_toxic': 0.9807399898631526, 'eval_precision_class_0_no_toxic': 0.9460560625814863, 'eval_f1_class_1_toxic': 0.8661654135338346, 'eval_recall_class_1_toxic': 0.8130999435347261, 'eval_precision_class_1_toxic': 0.9266409266409267, 'eval_classification_report': {'0': {'precision': 0.9460560625814863, 'recall': 0.9807399898631526, 'f1-score': 0.9630858564910826, 'support': 5919.0}, '1': {'precision': 0.9266409266409267, 'recall': 0.8130999435347261, 'f1-score': 0.8661654135338346, 'support': 1771.0}, 'accuracy': 0.9421326397919376, 'macro avg': {'precision': 0.9363484946112065, 'recall': 0.8969199666989394, 'f1-score': 0.9146256350124586, 'support': 7690.0}, 'weighted av

In [2]:
df['text_cleaned'] = df['text_cleaned'].fillna('').astype(str)

In [3]:
# --- Modelo para multi-clase ---
X = df['text_cleaned']
y = df['multi_label']

# Dividir en Entrenamiento+Validación (80%) y Prueba (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Dividir el conjunto Entrenamiento+Validación en Entrenamiento (80% del 80% = 64%) y Validación (20% del 80% = 16%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)


In [4]:
# 1. Mapear etiquetas de texto a IDs numéricos
label_column_name = 'multi_label' # Tu columna de etiquetas se llama 'multi_label'
unique_labels = sorted(y.unique())
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for i, label in enumerate(unique_labels)}

print("--- Mapeo de Etiquetas ---")
print("Etiquetas a IDs:", label_to_id)
print("IDs a Etiquetas:", id_to_label)

# Convertir las etiquetas de tus conjuntos a IDs numéricos
y_train_ids = y_train.map(label_to_id).values
y_val_ids = y_val.map(label_to_id).values
y_test_ids = y_test.map(label_to_id).values

# Definir el número total de clases (necesario para el modelo)
num_classes = len(unique_labels)
print(f"\nNúmero total de clases: {num_classes}")

X_train_np = np.array(list(X_train)).reshape(-1, 1)

oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train_resampled_np, y_train_resampled_ids = oversampler.fit_resample(
    X_train_np, 
    y_train_ids
)

X_train = X_train_resampled_np.flatten()
y_train_ids = y_train_resampled_ids

--- Mapeo de Etiquetas ---
Etiquetas a IDs: {'Acción/Juego': 0, 'Gravemente Tóxico': 1, 'Levemente Tóxico': 2, 'No Tóxico': 3}
IDs a Etiquetas: {0: 'Acción/Juego', 1: 'Gravemente Tóxico', 2: 'Levemente Tóxico', 3: 'No Tóxico'}

Número total de clases: 4


In [5]:
# --- Carga y Preparación del Tokenizer de Hugging Face ---
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
max_length = 256
train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

val_encodings = tokenizer(
    list(X_val),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

# --- CONVERTIR ETIQUETAS Y DATOS A PyTorch Dataset ---
# Esta clase es estándar cuando se usa el Trainer.
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels_ids):
        self.encodings = encodings
        # Las etiquetas deben ser tensores de PyTorch.
        self.labels = torch.tensor(labels_ids, dtype=torch.long)

    def __getitem__(self, idx):
        # Aseguramos que los valores de encodings sean también tensores de PyTorch
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Crear los objetos Dataset para entrenamiento, validación y prueba
train_dataset = CustomDataset(train_encodings, y_train_ids) 
val_dataset = CustomDataset(val_encodings, y_val_ids)    
test_dataset = CustomDataset(test_encodings, y_test_ids) 

In [7]:
# --- Cargar el modelo pre-entrenado DistilBERT para clasificación ---
num_labels = num_classes
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# Mover el modelo a la GPU si está disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
# --- Configurar los Argumentos de Entrenamiento (Reemplaza model.compile) ---
# Usamos TrainingArguments de Hugging Face
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy=IntervalStrategy.EPOCH,   
    save_strategy=IntervalStrategy.EPOCH,   
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_overall",
    greater_is_better=True,                  
    remove_unused_columns=False
)


# --- Crear una instancia del Trainer (Reemplaza model.fit) ---
# Se crea el Trainer con el modelo, argumentos y datasets
trainer = Trainer(
    model=model,                         # El modelo AutoModelForSequenceClassification
    args=training_args,                  # Los argumentos de entrenamiento definidos arriba
    train_dataset=train_dataset,         # Tus datos de entrenamiento (Dataset de PyTorch)
    eval_dataset=val_dataset,            # Tus datos de validación (Dataset de PyTorch)
    compute_metrics=lambda p: compute_metrics(p, id_to_label),
)

# --- Entrenar el modelo ---
# Inicia el entrenamiento llamando a trainer.train()
trainer.train()

# --- Evaluar el modelo con el conjunto de prueba ---
print("\nEvaluando el modelo en el conjunto de prueba (test_dataset)...")
eval_results = trainer.evaluate(test_dataset)

# Los resultados de la evaluación están en un diccionario.
print(f"Resultados de la evaluación en el conjunto de prueba: {eval_results}")

Epoch,Training Loss,Validation Loss,Accuracy,F1 Overall,Recall Overall,Precision Overall,F1 Class 0 Acción/juego,Recall Class 0 Acción/juego,Precision Class 0 Acción/juego,F1 Class 1 Gravemente Tóxico,Recall Class 1 Gravemente Tóxico,Precision Class 1 Gravemente Tóxico,F1 Class 2 Levemente Tóxico,Recall Class 2 Levemente Tóxico,Precision Class 2 Levemente Tóxico,F1 Class 3 No Tóxico,Recall Class 3 No Tóxico,Precision Class 3 No Tóxico,Classification Report Dict
1,0.31,0.577349,0.897269,0.899394,0.897269,0.903557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'Acción/Juego': {'precision': 0.6582064297800339, 'recall': 0.8644444444444445, 'f1-score': 0.7473583093179635, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8552631578947368, 'recall': 0.8673894912427023, 'f1-score': 0.8612836438923396, 'support': 1199.0}, 'Levemente Tóxico': {'precision': 0.7038917089678511, 'recall': 0.7272727272727273, 'f1-score': 0.7153912295786758, 'support': 572.0}, 'No Tóxico': {'precision': 0.9552154195011338, 'recall': 0.9243006034009874, 'f1-score': 0.9395037635907444, 'support': 5469.0}, 'accuracy': 0.8972691807542262, 'macro avg': {'precision': 0.7931441790359389, 'recall': 0.8458518165902154, 'f1-score': 0.8158842365949308, 'support': 7690.0}, 'weighted avg': {'precision': 0.90355690591653, 'recall': 0.8972691807542262, 'f1-score': 0.8993940435132358, 'support': 7690.0}}"
2,0.156,0.664243,0.907022,0.907284,0.907022,0.909019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'Acción/Juego': {'precision': 0.706766917293233, 'recall': 0.8355555555555556, 'f1-score': 0.7657841140529531, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8412698412698413, 'recall': 0.8840700583819849, 'f1-score': 0.8621390809272061, 'support': 1199.0}, 'Levemente Tóxico': {'precision': 0.8089430894308943, 'recall': 0.6958041958041958, 'f1-score': 0.7481203007518797, 'support': 572.0}, 'No Tóxico': {'precision': 0.9509803921568627, 'recall': 0.9400255988297678, 'f1-score': 0.9454712643678161, 'support': 5469.0}, 'accuracy': 0.9070221066319896, 'macro avg': {'precision': 0.8269900600377078, 'recall': 0.8388638521428761, 'f1-score': 0.8303786900249637, 'support': 7690.0}, 'weighted avg': {'precision': 0.9090188380136343, 'recall': 0.9070221066319896, 'f1-score': 0.9072841048391692, 'support': 7690.0}}"
3,0.0662,0.747953,0.904421,0.904824,0.904421,0.905901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'Acción/Juego': {'precision': 0.7037037037037037, 'recall': 0.8022222222222222, 'f1-score': 0.7497403946002077, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8438511326860841, 'recall': 0.8698915763135947, 'f1-score': 0.8566735112936344, 'support': 1199.0}, 'Levemente Tóxico': {'precision': 0.7790476190476191, 'recall': 0.715034965034965, 'f1-score': 0.7456700091157703, 'support': 572.0}, 'No Tóxico': {'precision': 0.9494091580502215, 'recall': 0.9402084476138234, 'f1-score': 0.9447864033073037, 'support': 5469.0}, 'accuracy': 0.9044213263979194, 'macro avg': {'precision': 0.8190029033719072, 'recall': 0.8318393027961514, 'f1-score': 0.824217579579229, 'support': 7690.0}, 'weighted avg': {'precision': 0.9059009230467077, 'recall': 0.9044213263979194, 'f1-score': 0.9048237714581308, 'support': 7690.0}}"
4,0.0175,0.879613,0.905462,0.905562,0.905462,0.906266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'Acción/Juego': {'precision': 0.7190569744597249, 'recall': 0.8133333333333334, 'f1-score': 0.7632950990615224, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8539786710418376, 'recall': 0.8682235195996664, 'f1-score': 0.8610421836228288, 'support': 1199.0}, 'Levemente Tóxico': {'precision': 0.7677543186180422, 'recall': 0.6993006993006993, 'f1-score': 0.7319304666056725, 'support': 572.0}, 'No Tóxico': {'precision': 0.9476199228083073, 'recall': 0.9427683305906016, 'f1-score': 0.9451879010082493, 'support': 5469.0}, 'accuracy': 0.9054616384915475, 'macro avg': {'precision': 0.822102471731978, 'recall': 0.8309064707060752, 'f1-score': 0.8253639125745682, 'support': 7690.0}, 'weighted avg': {'precision': 0.9062659158874111, 'recall': 0.9054616384915475, 'f1-score': 0.9055616684335523, 'support': 7690.0}}"
5,0.0411,0.910525,0.904941,0.904134,0.904941,0.904497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'Acción/Juego': {'precision': 0.7163265306122449, 'recall': 0.78, 'f1-score': 0.7468085106382979, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8637510513036165, 'recall': 0.8565471226021685, 'f1-score': 0.8601340033500837, 'support': 1199.0}, 'Levemente Tóxico': {'precision': 0.8189473684210526, 'recall': 0.6800699300699301, 'f1-score': 0.7430754536771729, 'support': 572.0}, 'No Tóxico': {'precision': 0.9378612716763006, 'recall': 0.9493508868166026, 'f1-score': 0.9435711040436165, 'support': 5469.0}, 'accuracy': 0.9049414824447334, 'macro avg': {'precision': 0.8342215555033037, 'recall': 0.8164919848721753, 'f1-score': 0.8233972679272927, 'support': 7690.0}, 'weighted avg': {'precision': 0.9044974822916874, 'recall': 0.9049414824447334, 'f1-score': 0.9041344638910099, 'support': 7690.0}}"



Evaluando el modelo en el conjunto de prueba (test_dataset)...


Resultados de la evaluación en el conjunto de prueba: {'eval_loss': 0.6021584868431091, 'eval_accuracy': 0.9174252275682705, 'eval_f1_overall': 0.9175157609422612, 'eval_recall_overall': 0.9174252275682705, 'eval_precision_overall': 0.9182924623444785, 'eval_f1_class_0_Acción/Juego': 0.0, 'eval_recall_class_0_Acción/Juego': 0.0, 'eval_precision_class_0_Acción/Juego': 0.0, 'eval_f1_class_1_Gravemente_Tóxico': 0.0, 'eval_recall_class_1_Gravemente_Tóxico': 0.0, 'eval_precision_class_1_Gravemente_Tóxico': 0.0, 'eval_f1_class_2_Levemente_Tóxico': 0.0, 'eval_recall_class_2_Levemente_Tóxico': 0.0, 'eval_precision_class_2_Levemente_Tóxico': 0.0, 'eval_f1_class_3_No_Tóxico': 0.0, 'eval_recall_class_3_No_Tóxico': 0.0, 'eval_precision_class_3_No_Tóxico': 0.0, 'eval_classification_report_dict': {'Acción/Juego': {'precision': 0.7392996108949417, 'recall': 0.8444444444444444, 'f1-score': 0.7883817427385892, 'support': 450.0}, 'Gravemente Tóxico': {'precision': 0.8733552631578947, 'recall': 0.8857381

In [None]:
#output_dir = './models/modelo_toxicidad_guardado'
#os.makedirs(output_dir, exist_ok=True) 

# Guarda el modelo
#model.save_pretrained(output_dir)

# Guarda el tokenizador
#tokenizer.save_pretrained(output_dir)

<h3 style="color:#6B7A8F; font-weight:bold;">Conclusiones Generales sobre el Rendimiento del Modelo</h3>

<p>Tras el proceso de entrenamiento y evaluación, nuestro modelo de clasificación de toxicidad ha demostrado un rendimiento sólido y prometedor, alcanzando una <strong>precisión general del ~91.74%</strong>. Esto indica que el sistema es altamente efectivo en identificar y clasificar correctamente la inmensa mayoría de los mensajes de chat.</p>

<h4 style="color:#2E4053;">💪 Fortalezas del Modelo</h4>
<ul>
  <li><strong>Alta Fiabilidad en lo Crucial:</strong> El modelo brilla especialmente en las categorías más críticas:</li>
  <ul>
    <li><strong>Mensajes "No Tóxicos":</strong> F1-score de ~95.4%. Excepcional para preservar conversaciones saludables sin sobre-moderar.</li>
    <li><strong>Mensajes "Gravemente Tóxicos":</strong> F1-score de ~87.9% y recall de ~88.6%. Detecta eficazmente la toxicidad más severa, clave para proteger a la comunidad.</li>
  </ul>
</ul>

<h4 style="color:#B03A2E;">⚠️ Áreas de Oportunidad</h4>
<ul>
  <li><strong>Manejo de la Toxicidad Sutil:</strong> F1-score de ~75.3%, con un recall de ~71.8%. El sarcasmo y la ironía siguen siendo un reto.</li>
  <li><strong>Clasificación de "Acción/Juego":</strong> F1-score de ~78.8%, con precisión de ~73.9%. Podría haber confusiones con otras categorías.</li>
</ul>

<div style="background-color: #eaf6fd; border-left: 5px solid #3498db; padding: 15px; margin-top: 20px; border-radius: 5px;">
  <h4 style="color:#6B7A8F;">🔎 Implicaciones para Empresas</h4>
  <p>Los resultados posicionan a este modelo como una herramienta <strong>robusta y escalable</strong> para la moderación de contenido. Su precisión en detectar toxicidad grave y su fiabilidad con mensajes no tóxicos, lo hacen ideal para empresas que buscan proteger su marca y fomentar comunidades online más sanas y atractivas.</p>
</div>