In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPool1D, Bidirectional, Layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import pickle

In [3]:
# Parámetros
MAX_SEQUENCE_LENGTH = 100
VOCAB_SIZE = 10000
EMBEDDING_DIM_GLOVE = 200
EMBEDDING_DIM_FASTTEXT = 300
GLOVE_PATH = '/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Prueba de modelos/GloVe/glove.6B.200d.txt'
FASTTEXT_PATH = '/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Prueba de modelos/GloVe/wiki-news-300d-1M.vec'

In [4]:
data = pd.read_csv('/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Bases de datos/mi_dataframe.csv')

In [5]:
# Calcular los pesos de las sclases
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=data["is_toxic"])
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

print("Pesos de clase:", class_weights_dict)

Pesos de clase: {0: 0.5565938358935721, 1: 4.917442218798151}


In [6]:
X = data['comment_text'].values
y = data['is_toxic'].values

In [7]:
# Preprocesamiento del texto
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [8]:
def preprocess_text(text, lemmatize=True):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar caracteres especiales y dígitos
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Eliminar stopwords y opcionalmente aplicar lematización
    words = [word for word in text.split() if word not in stop_words]
    if lemmatize:
        words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [9]:
X_processed = np.array([preprocess_text(comment) for comment in X])

In [10]:
# Aplicar submuestreo y sobremuestreo para equilibrar el conjunto de datos
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
ros = RandomOverSampler(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X.reshape(-1, 1), y)
X_resampled, y_resampled = ros.fit_resample(X_resampled, y_resampled)
X_resampled = X_resampled.flatten()

In [11]:
# Tokenización de los textos
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_resampled)
X_sequences = tokenizer.texts_to_sequences(X_resampled)
X_padded = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [12]:
# Guardar el tokenizador para uso futuro
with open('/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Prueba de modelos/Modelos guardados/tokenizer_glove.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [17]:
y_resampled_series = pd.Series(y_resampled)
print("Distribución de clases después del balanceo:", y_resampled_series.value_counts())

Distribución de clases después del balanceo: 0    32450
1    32450
Name: count, dtype: int64


In [24]:
# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Cargar los embeddings de GloVe
embeddings_index = {}
with open(GLOVE_PATH, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefficients

In [18]:
# Cargar embeddings de GloVe
embeddings_index_glove = {}
with open(GLOVE_PATH, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index_glove[word] = coefficients

In [19]:
# Cargar embeddings de FastText
embeddings_index_fasttext = {}
with open(FASTTEXT_PATH, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index_fasttext[word] = coefficients


In [20]:
# Crear la matriz de embeddings combinada (GloVe + FastText)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM_GLOVE + EMBEDDING_DIM_FASTTEXT))
for word, i in tokenizer.word_index.items():
    if i < VOCAB_SIZE:
        glove_vector = embeddings_index_glove.get(word)
        fasttext_vector = embeddings_index_fasttext.get(word)
        if glove_vector is not None and fasttext_vector is not None:
            embedding_matrix[i] = np.concatenate((glove_vector, fasttext_vector))
        elif glove_vector is not None:
            embedding_matrix[i, :EMBEDDING_DIM_GLOVE] = glove_vector
        elif fasttext_vector is not None:
            embedding_matrix[i, EMBEDDING_DIM_GLOVE:] = fasttext_vector

In [21]:
# Definir capa de atención personalizada
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.activations.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

In [22]:
# Construir el modelo
def create_model():
    model = Sequential([
        Embedding(VOCAB_SIZE, EMBEDDING_DIM_GLOVE + EMBEDDING_DIM_FASTTEXT, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False),
        Bidirectional(LSTM(64, return_sequences=True)),
        AttentionLayer(),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Crear el modelo
model = create_model()

# Configurar EarlyStopping para evitar el sobreajuste
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)



In [26]:
# Entrenar el modelo con los pesos de clase
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}
history = model.fit(X_train, y_train,
                    epochs=3,
                    batch_size=32,
                    validation_data=(X_test, y_test),
                    class_weight=class_weights_dict,
                    callbacks=[early_stopping])

Epoch 1/3
[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 261ms/step - accuracy: 0.7281 - loss: 0.5426 - val_accuracy: 0.8448 - val_loss: 0.3685
Epoch 2/3
[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 257ms/step - accuracy: 0.8668 - loss: 0.2988 - val_accuracy: 0.8913 - val_loss: 0.2706
Epoch 3/3
[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 260ms/step - accuracy: 0.8777 - loss: 0.2615 - val_accuracy: 0.8881 - val_loss: 0.2689


In [39]:
# Guardar el modelo entrenado
model.save('/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Prueba de modelos/Modelos guardados/LSTM + GloVe + FasText + CA.h5')



In [31]:
def preprocesar_y_clasificar(comentario):
    # Aplicar el preprocesamiento
    comentario_preprocesado = preprocess_text(comentario)
    # Convertir el comentario preprocesado a secuencia de tokens
    comentario_secuencia = tokenizer.texts_to_sequences([comentario_preprocesado])
    # Aplicar padding
    comentario_padded = pad_sequences(comentario_secuencia, maxlen=MAX_SEQUENCE_LENGTH)
    # Obtener predicción
    prediccion = model.predict(comentario_padded)
    return "Tóxico" if prediccion[0][0] > 0.5 else "No tóxico"

In [38]:
# Ejemplo de uso
nuevo_comentario = [
    "You're so dumb, it's actually funny.",
    "No one cares about your opinion. Just shut up already.",
    "What a joke! This is literally the worst thing I've seen.",
    "Please stop. You're completely useless.",
    "Nobody asked for your input, genius.",
    "You clearly don’t know what you’re doing. Why even try?",
    "The fact that you thought this was a good idea is pathetic.",
    "You're an embarrassment. Honestly, give up.",
    "This is just trash. No one with any sense would agree with you.",
    "Why are you even here? You’re just making things worse.",
    "This is garbage. You have no idea what you’re talking about.",
    "If I had a dollar for every dumb thing you've said, I'd be rich.",
    "Stop trying. Everyone can see you're failing.",
    "Wow, how can someone be this clueless?",
    "You're so annoying. No one cares about anything you say.",
    "You are the definition of failure. Just look at this mess.",
    "You clearly don’t have a brain.",
    "Pathetic! Do everyone a favor and stay quiet.",
    "This is painfully bad. Stop embarrassing yourself.",
    "Please, spare us from your nonsense.",
    "You think you’re smart, but you’re actually clueless.",
    "You're the worst at this. Nobody wants you here.",
    "Are you for real? You can’t be that dumb.",
    "It’s embarrassing watching you try so hard and fail.",
    "You’re so bad at this it’s almost sad.",
    "Why are you even trying? You have no talent.",
    "This is cringe-worthy. Please go away.",
    "Get a life, seriously. You're a joke.",
    "Do us a favor and stop posting. No one cares.",
    "You’re so dumb, I can’t even believe it.",
    "Pathetic attempt, as usual.",
    "I can’t believe people like you actually exist.",
    "Stop wasting everyone’s time with your nonsense.",
    "You’ve got to be kidding. This is horrendous.",
    "This is laughably terrible.",
    "You're a failure, plain and simple.",
    "You're embarrassingly bad at this.",
    "Nobody cares what you think, loser.",
    "How can you be this bad? It’s painful to watch.",
    "What a waste of time. You add nothing of value.",
    "Please, you’re way out of your league here.",
    "Wow, you're even worse than I thought.",
    "Stop talking, you’re just making it worse.",
    "You’re so clueless it’s actually painful.",
    "Every word you say is dumber than the last.",
    "Honestly, you’re a joke. Just quit already.",
    "Do you ever think before you talk? Because wow...",
    "Are you trying to be this dumb, or is it natural?",
    "Why don’t you just leave? No one wants you here.",
    "You're not only wrong but also incredibly annoying.",
    "You might be the worst person to attempt this. Ever."
]
# Probar con un conjunto de nuevos comentarios
toxico = 0
notoxico = 0
for comentario in nuevo_comentario:
    resultado = preprocesar_y_clasificar(comentario)
    if resultado == "Tóxico":
        toxico += 1
    else:
        notoxico += 1
    # print("Comentario:", comentario)
    # print("Clasificación:", resultado)
    # print()

print("Tóxico:", toxico)
print("No tóxico:", notoxico)
print("Porcentaje de tóxico:", (toxico / (toxico + notoxico)) * 100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69