In [1]:
import time

import pandas as pd
import numpy as np
from tqdm import tqdm


In [2]:
# Load the labelled comments dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("comentarios_todos_modelar.csv")

In [3]:
# Count rows per category to check class balance.
df['category'].value_counts()

ideología política    357
racismo               337
machismo              314
Name: category, dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder
# Encode the string categories as integer ids and store them in a new `num_cat` column.
label_encoder = LabelEncoder()

df['num_cat']= label_encoder.fit_transform(df['category'])


In [5]:
# Display the frame to inspect the new `num_cat` column.
df

Unnamed: 0,index,category,comment,num_cat
0,6654,ideología política,Feaaaaaaaaaaaaa comunista,0
1,6019,racismo,Es una gallega oligarca 🤦,2
2,124,machismo,Esta tipa es una vulgar,1
3,6231,ideología política,Cállate petarda que eres una corderita una co...,0
4,5409,ideología política,"TODA RIDICULA GERRA ,,,, SOLAMENTE TRAE MAS PE...",0
...,...,...,...,...
1003,568,racismo,Esta mujer venezolana debio pensarlo y meditar...,2
1004,2143,ideología política,En España preferimos inmigrantes hispaoamerica...,0
1005,712,racismo,La mayoria de los centroamericanos y sudameric...,2
1006,1436,ideología política,Todos los inmigrantes que paguen lo que paga u...,0


In [6]:
# Distinct encoded ids present in `num_cat`.
df.num_cat.unique() # ideología política = 0, racismo = 2 and machismo = 1

array([0, 2, 1])

In [7]:
# Apply one-hot encoding to the category column
one_hot = pd.get_dummies(df['category'])

# Append the one-hot columns to the original DataFrame.
# NOTE(review): `df` is rebuilt from itself, so re-running this cell appends
# the dummy columns a second time.
df = pd.concat([df, one_hot], axis=1)


In [8]:
# Display the frame to inspect the appended one-hot columns.
df

Unnamed: 0,index,category,comment,num_cat,ideología política,machismo,racismo
0,6654,ideología política,Feaaaaaaaaaaaaa comunista,0,1,0,0
1,6019,racismo,Es una gallega oligarca 🤦,2,0,0,1
2,124,machismo,Esta tipa es una vulgar,1,0,1,0
3,6231,ideología política,Cállate petarda que eres una corderita una co...,0,1,0,0
4,5409,ideología política,"TODA RIDICULA GERRA ,,,, SOLAMENTE TRAE MAS PE...",0,1,0,0
...,...,...,...,...,...,...,...
1003,568,racismo,Esta mujer venezolana debio pensarlo y meditar...,2,0,0,1
1004,2143,ideología política,En España preferimos inmigrantes hispaoamerica...,0,1,0,0
1005,712,racismo,La mayoria de los centroamericanos y sudameric...,2,0,0,1
1006,1436,ideología política,Todos los inmigrantes que paguen lo que paga u...,0,1,0,0


separado

In [9]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Convert the pandas Series to plain Python lists
comentarios = df['comment'].tolist()
categorias = df['num_cat'].tolist()

# Label encoding
# NOTE(review): `num_cat` already holds integer ids, so this rebinds
# `label_encoder` to a new encoder fitted on those ids (its `classes_` are the
# ids, not the category names).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(categorias)

# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Encode the comments: every comment is padded/truncated to 128 tokens and
# the result is returned as NumPy arrays
encoded_data = tokenizer.batch_encode_plus(
    comentarios,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='np'
)

# Model inputs: token ids and attention masks
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# Check the shapes
print(f"Dimensiones de input_ids: {input_ids.shape}")
print(f"Dimensiones de encoded_labels: {encoded_labels.shape}")



Dimensiones de input_ids: (1008, 128)
Dimensiones de encoded_labels: (1008,)


In [10]:
# Every comment must have exactly one label before splitting.
if input_ids.shape[0] != len(encoded_labels):
    # Fail loudly: merely printing would leave the train/test variables
    # undefined and make the following cells fail with a NameError.
    raise ValueError("Las dimensiones de input_ids y encoded_labels no coinciden.")

# Split token ids, attention masks and labels in a single call so the three
# arrays share one set of shuffled indices and stay row-aligned by construction
# (instead of relying on two calls using the same random_state).
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Load the pretrained multilingual BERT model for sequence classification;
# the number of output labels is the number of distinct encoded labels.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(set(encoded_labels))
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Compile the model: Adam optimizer, sparse categorical cross-entropy computed
# on logits (labels are integer ids), and accuracy as the tracked metric
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model; inputs are passed as [token ids, attention masks]
history = model.fit(
    [train_inputs, train_masks],
    train_labels,
    epochs=3,
    batch_size=32
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_inputs, test_masks], test_labels, verbose=2)
print(f"Test accuracy: {test_accuracy}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
7/7 - 111s - loss: 0.8100 - accuracy: 0.6634 - 111s/epoch - 16s/step
Test accuracy: 0.6633663177490234


In [13]:
def get_class_name_from_index(idx):
    """Return the class name for an encoded label index.

    The order follows the integer ids used in this notebook:
    0 -> "ideologia politica", 1 -> "machismo", 2 -> "racismo".

    Args:
        idx: Integer label index (0, 1 or 2).

    Returns:
        The class name as a string.

    Raises:
        IndexError: If ``idx`` is outside ``0..2``. Negative indices are
            rejected explicitly instead of wrapping around to a wrong class.
    """
    class_names = ("ideologia politica", "machismo", "racismo")
    if not 0 <= idx < len(class_names):
        raise IndexError(f"class index out of range: {idx}")
    return class_names[idx]

# Build the list of class names, one per class id known to the label encoder
class_names = [get_class_name_from_index(idx) for idx in label_encoder.classes_]

In [14]:
# Predictions on the test set: take the arg-max over the logits as the predicted class id
predictions = model.predict([test_inputs, test_masks])
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

# Classification report
# NOTE(review): `target_names` is not used below; the report is labelled with `class_names`.
target_names = list(label_encoder.classes_)
class_names = [get_class_name_from_index(idx) for idx in label_encoder.classes_]
print(classification_report(test_labels, predicted_labels, target_names=class_names))

                    precision    recall  f1-score   support

ideologia politica       0.74      0.35      0.48        74
          machismo       0.58      0.89      0.70        57
           racismo       0.72      0.80      0.76        71

          accuracy                           0.66       202
         macro avg       0.68      0.68      0.65       202
      weighted avg       0.69      0.66      0.64       202



In [15]:
# Prediction smoke test

from transformers import pipeline

# Wrap the trained model and its tokenizer in a text-classification pipeline
# and classify a single sample comment.
clf = pipeline("text-classification", model, tokenizer=tokenizer)
clf("Es una gallega oligarca")


[{'label': 'LABEL_1', 'score': 0.6909100413322449}]

In [16]:
# Mount Google Drive in the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
from transformers import BertModel, BertTokenizer


# Target directory on the mounted Drive
ruta_modelo = "/content/drive/MyDrive/final_bert"

# Save the model and the tokenizer to that directory; the tokenizer call's
# return value (the written file paths) is printed.
model.save_pretrained(ruta_modelo)
print(tokenizer.save_pretrained(ruta_modelo))

('/content/drive/MyDrive/final_bert/tokenizer_config.json', '/content/drive/MyDrive/final_bert/special_tokens_map.json', '/content/drive/MyDrive/final_bert/vocab.txt', '/content/drive/MyDrive/final_bert/added_tokens.json')


In [18]:
# Reload the model and tokenizer from the directory they were saved to.
# `ruta_modelo` is reused so the load path cannot drift from the save path.
modelo_cargado = TFBertForSequenceClassification.from_pretrained(ruta_modelo)
tokenizer_cargado = BertTokenizer.from_pretrained(ruta_modelo)

Some layers from the model checkpoint at /content/drive/MyDrive/final_bert were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/final_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [19]:
# Rebuild the text-classification pipeline from the reloaded model and tokenizer.
clf = pipeline("text-classification", modelo_cargado, tokenizer=tokenizer_cargado)


In [20]:
# Classify one sample comment and keep the first (top) prediction.
prediccion = clf("Es una gallega oligarca")[0]
# The pipeline reports labels as "LABEL_<n>". Parse everything after the last
# underscore: taking only the final character would misread ids >= 10.
label_predicto = int(prediccion["label"].rsplit("_", 1)[-1])

In [21]:
# Map the predicted label index back to its class name.
get_class_name_from_index(label_predicto)

'machismo'