In [69]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [70]:
# Descargar recursos de NLTK
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
# Cargar datos
data = pd.read_csv('/content/drive/MyDrive/Universidad/NLP/Proyecto Final/Bases de datos/mi_dataframe.csv')
X = data['comment_text']
y = data['is_toxic']

In [72]:
# Calcular pesos de clase
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
print("Pesos de clase:", class_weights_dict)

Pesos de clase: {0: 0.5565938358935721, 1: 4.917442218798151}


In [74]:
# Preprocesamiento de texto
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

X = X.apply(preprocess_text)

In [75]:
# Vectorización con TF-IDF
tfidf = TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X)

# Submuestreo y sobremuestreo para balancear el conjunto de datos
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
ros = RandomOverSampler(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_tfidf, y)
X_resampled, y_resampled = ros.fit_resample(X_resampled, y_resampled)

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [76]:
# Entrenar el modelo Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [77]:
# Evaluar el modelo
y_pred = nb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9068567026194145
Confusion Matrix:
 [[5732  758]
 [ 451 6039]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.88      0.90      6490
           1       0.89      0.93      0.91      6490

    accuracy                           0.91     12980
   macro avg       0.91      0.91      0.91     12980
weighted avg       0.91      0.91      0.91     12980



In [80]:
def classify_comments(comments):
    toxic_count = 0
    non_toxic_count = 0
    for comment in comments:
        # Preprocesar y vectorizar el comentario
        comment_processed = preprocess_text(comment)
        comment_tfidf = tfidf.transform([comment_processed])
        # Predicción
        prediction = nb_model.predict(comment_tfidf)[0]
        # Contabilizar
        if prediction == 1:
            toxic_count += 1
        else:
            non_toxic_count += 1
    # Imprimir resultados
    print(f"Total comentarios tóxicos: {toxic_count}")
    print(f"Total comentarios no tóxicos: {non_toxic_count}")

In [88]:
nuevo_comentario = [
    "Wow, could you be any worse at this? You’re embarrassing yourself.",
    "Honestly, your work is just pathetic. No wonder no one respects you.",
    "What a joke! Nobody would even consider this worth their time.",
    "You're totally clueless. Just stop wasting everyone's time.",
    "If this is your best effort, then you’re in big trouble.",
    "No one needs your input here. It’s honestly useless.",
    "Do you even understand what you’re saying? It’s so obvious you don’t.",
    "This is a disaster. Why don’t you just quit while you're behind?",
    "You don’t add anything valuable. Everything you say is worthless.",
    "People like you just bring everyone down with your negativity."
]
# Ejecutar la función con comentarios nuevos
classify_comments(nuevo_comentario)

Total comentarios tóxicos: 9
Total comentarios no tóxicos: 1


In [92]:
!pip install joblib




In [93]:
import joblib

# Guardar el modelo entrenado
joblib.dump(nb_model, 'modelo_naive_bayes.joblib')

# Guardar el vectorizador TF-IDF
joblib.dump(tfidf, 'vectorizador_tfidf.joblib')


['vectorizador_tfidf.joblib']

In [None]:
# Cargar el modelo
nb_model = joblib.load('modelo_naive_bayes.joblib')

# Cargar el vectorizador TF-IDF
tfidf = joblib.load('vectorizador_tfidf.joblib')