In [4]:
import pandas as pd

# 1) Cargar el archivo
ruta = "../datos_raw/SMSSpamCollection"  # Ajusta si cambias la ruta
# El archivo viene sin cabeceras, separador es tabulador: etiqueta (spam/ham) \t mensaje
df = pd.read_csv(ruta, sep="\t", header=None, names=["label", "text"], encoding="utf-8")

# 2) Primer vistazo
print("Número total de registros:", df.shape[0])
print(df["label"].value_counts(), "\n")     # Ver cuántos 'ham' y cuántos 'spam'
print(df.head(10))                           # Mostrar las primeras 10 filas
print("\nInformación de columnas y tipos de dato:")
print(df.info())


Número total de registros: 5572
label
ham     4825
spam     747
Name: count, dtype: int64 

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...

Información de columnas y tipos de dato:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) Convertir etiquetas a valores binarios
df["label_bin"] = df["label"].map({"ham": 0, "spam": 1})

# 2) Dividir en train/test (manteniendo proporción de clases)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df["text"], df["label_bin"], test_size=0.2, random_state=42, stratify=df["label_bin"]
)

# 3) Vectorización TF-IDF
vectorizer = TfidfVectorizer(
    lowercase=True,
    strip_accents="unicode",
    stop_words="english",
    min_df=2           # descartar términos que aparezcan menos de 2 veces
)

# Ajustar el vectorizador con el conjunto de entrenamiento
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

print("Shape X_train:", X_train.shape)
print("Shape X_test:", X_test.shape)


Shape X_train: (4457, 3376)
Shape X_test: (1115, 3376)


In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# 1) Entrenar Naive Bayes (Multinomial) — funciona bien con conteo/TF-IDF de palabras
modelo_nb = MultinomialNB()
modelo_nb.fit(X_train, y_train)

# 2) Predecir sobre X_test
y_pred = modelo_nb.predict(X_test)

# 3) Evaluar
print("Métricas de clasificación para Naive Bayes:\n")
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))

# 4) Matriz de confusión (opcional visualizarla)
cm = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:\n", cm)


Métricas de clasificación para Naive Bayes:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Matriz de confusión:
 [[966   0]
 [ 29 120]]


In [7]:
import joblib
joblib.dump(modelo_nb, "../modelos/modelo_naive_bayes_sms_spam.joblib")


['../modelos/modelo_naive_bayes_sms_spam.joblib']

In [8]:
with open("../modelos/features_vectorizer.txt", "w", encoding="utf-8") as f:
    for idx, feature in enumerate(vectorizer.get_feature_names_out()):
        f.write(f"{idx}\t{feature}\n")

In [9]:
report = classification_report(y_test, y_pred, target_names=["ham", "spam"])
with open("../resultados/reporte_naive_bayes.txt", "w", encoding="utf-8") as f:
    f.write(report)

In [10]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Obtener probabilidades (para ROC)
y_prob = modelo_nb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Curva ROC - Naive Bayes (SMS Spam)")
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig("../resultados/roc_naive_bayes.png", dpi=150)
plt.close()