In [None]:
import nltk
import re
# Download the NLTK resources used later in the notebook: the stopword corpus
# and the 'punkt' / 'punkt_tab' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') 
from nltk.corpus import stopwords

# English stopwords held in a set; clean_text() tests token membership against it.
stop_words = set(stopwords.words('english'))

In [None]:
import pandas as pd

# Tab-separated SMS dataset; the two column names are supplied explicitly below.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', names=["label", "text"]).assign(
    # Encode the class as an integer: ham -> 0, spam -> 1.
    label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1})
)
df.head()

In [None]:
# Quick sanity report: column names, missing values per column, class balance.
report = (
    ("Columnas:", df.columns),
    ("\nValores nulos por columna:\n", df.isna().sum()),
    ("\nClases:\n", df['label'].value_counts()),
)
for heading, value in report:
    print(heading, value)


In [None]:
# Character count of each raw message, stored as an extra column.
message_lengths = df['text'].str.len()
df['length'] = message_lengths
df.loc[:, ['text', 'length']].head()


In [None]:
def clean_text(text):
    """Normalize a raw message into a space-separated string of kept tokens.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or whitespace is replaced by a space, the result is tokenized with
    ``nltk.word_tokenize``, and tokens that are stopwords (per the module-level
    ``stop_words`` set) or a single character long are dropped.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Lowercase first so the character filter only has to know about a-z.
    lowered = text.lower()
    # Digits, punctuation and any other non-letter characters become spaces.
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)

    kept = []
    for token in nltk.word_tokenize(letters_only):
        # Skip stopwords and one-character tokens.
        if token in stop_words or len(token) <= 1:
            continue
        kept.append(token)

    return " ".join(kept)

# Clean every message (cast to str first) and keep the result as a new column.
raw_messages = df['text'].astype(str)
df['clean_text'] = raw_messages.map(clean_text)

# Side-by-side view of the original and the cleaned messages.
df.loc[:, ['text', 'clean_text']].head(20)


In [None]:
#df.to_csv("sms_clean.csv", index=False)
#print("Dataset limpio guardado como sms_clean.csv")
# Guardar el dataset limpio

Selección de características con TF-IDF + modelo superficial

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

y = df['label']

# Fit the TF-IDF vocabulary/weights and the chi-squared ranking on training
# rows only, so nothing about the held-out messages leaks into the features.
# The split arguments deliberately mirror the train/test split applied to
# X_new in the SVM cell (test_size=0.2, random_state=42, stratify=y): with the
# same arguments and the same number of rows, the same rows are expected to
# land in the training partition. Keep the two calls in sync.
fit_texts, held_out_texts, fit_labels, held_out_labels = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# 1. TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # keep at most 5000 terms
X_fit = vectorizer.fit_transform(fit_texts)
# Transform every row (in df order) so X stays aligned with y for later cells.
X = vectorizer.transform(df['clean_text'])

print("Matriz TF-IDF:", X.shape)

# 2. Feature selection with chi-squared
k = 1000  # number of features to keep
# Clamp k so a vocabulary smaller than k does not make SelectKBest fail.
selector = SelectKBest(chi2, k=min(k, X.shape[1]))
selector.fit(X_fit, fit_labels)
X_new = selector.transform(X)

print("Matriz después de selección:", X_new.shape)

# Names of the selected terms; the feature-name array is fetched once instead
# of once per selected index.
feature_names = vectorizer.get_feature_names_out()
selected_features = feature_names[selector.get_support(indices=True)].tolist()
print("Ejemplo de features seleccionadas:", selected_features[:100])


Aplicación de SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Build the linear-kernel SVM and fit it on the training partition
svm_model = SVC(kernel="linear", C=1.0, random_state=42).fit(X_train, y_train)

# 3. Predictions for the test partition
y_pred = svm_model.predict(X_test)

# 4. Metrics
svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred)
print("🔹 Accuracy:", svm_accuracy)
print("\n🔹 Reporte de Clasificación:\n", svm_report)

# 5. Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
class_names = ["Ham", "Spam"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión - SVM")
plt.show()

APRENDIZAJE PROFUNDO

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from mlp import PyTorchMLP

# Seed PyTorch's global RNG so weight initialization and batch shuffling are
# repeatable from one run of this cell to the next.
torch.manual_seed(42)

# 1. Convert X and y to tensors (the feature matrices are densified with
#    .toarray() first)
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# 2. Build the DataLoaders (only the training loader shuffles)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64)

# 3. Define the model; input width is taken from the training tensor
num_features = X_train_tensor.shape[1]
num_classes = 2
model = PyTorchMLP(num_features, num_classes)

# 4. Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 5. Training loop; prints the mean batch loss of each epoch
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# 6. Evaluation: collect true and predicted labels over the test loader
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        # Softmax is monotonic, so argmax over the raw outputs selects the
        # same class without the extra computation.
        preds = torch.argmax(outputs, dim=1)
        y_true.extend(y_batch.tolist())
        y_pred.extend(preds.tolist())

Presentación de resultados

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Headline metrics computed from the collected true/predicted labels.
mlp_accuracy = accuracy_score(y_true, y_pred)
mlp_report = classification_report(y_true, y_pred)
print("🔹 Accuracy (MLP):", mlp_accuracy)
print("\n🔹 Reporte de Clasificación (MLP):\n", mlp_report)

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_true, y_pred)
class_names = ["Ham", "Spam"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión - MLP")
plt.show()

APRENDIZAJE PROFUNDO CON LSTM

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from lstm import PyTorchLSTM


# Seed PyTorch's global RNG so weight initialization and batch shuffling are
# repeatable from one run of this cell to the next.
torch.manual_seed(42)

# Device for the model and every batch below: the GPU when one is available,
# otherwise the CPU. The name was previously used in this cell without being
# defined anywhere, which raises NameError on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# 1. Tokenization and vocabulary (without torchtext)
# Index 0 is reserved for unknown tokens and index 1 for padding.
specials = ["<unk>", "<pad>"]
counter = Counter()

for text in df['clean_text']:
    counter.update(text.split())

# Vocabulary: special tokens first, then words by descending frequency.
itos = specials + [word for word, _ in counter.most_common()]
stoi = {word: idx for idx, word in enumerate(itos)}

def text_pipeline(x):
    """Map a whitespace-separated string to a list of vocabulary indices.

    Tokens that are not in ``stoi`` are mapped to the ``<unk>`` index.
    """
    return [stoi.get(token, stoi["<unk>"]) for token in x.split()]

# Convert the dataset: one index tensor per message, plus the label tensor
X_indices = [torch.tensor(text_pipeline(text), dtype=torch.long) for text in df['clean_text']]
y_tensor = torch.tensor(df['label'].values, dtype=torch.long)

# Padding: right-pad every sequence with the <pad> index to a common length
X_padded = pad_sequence(X_indices, batch_first=True, padding_value=stoi["<pad>"])

# 2. Train / test split (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_tensor, test_size=0.2, stratify=y_tensor, random_state=42
)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64)

# 3. Define the LSTM model and move it to the selected device
vocab_size = len(stoi)
embed_dim = 128
hidden_dim = 64
num_classes = 2
model = PyTorchLSTM(vocab_size, embed_dim, hidden_dim, num_classes,
                    num_layers=1, bidirectional=True).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 4. Training loop; prints the mean batch loss of each epoch
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# 5. Evaluation: collect true and predicted labels over the test loader
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1)
        y_true.extend(y_batch.tolist())
        y_pred.extend(preds.tolist())

print("🔹 Accuracy (LSTM):", accuracy_score(y_true, y_pred))
print("\n🔹 Reporte de Clasificación (LSTM):\n", classification_report(y_true, y_pred))


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Metrics computed from the collected true/predicted labels
lstm_accuracy = accuracy_score(y_true, y_pred)
lstm_report = classification_report(y_true, y_pred)
print("🔹 Accuracy (LSTM):", lstm_accuracy)
print("\n🔹 Reporte de Clasificación (LSTM):\n", lstm_report)

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_true, y_pred)
class_names = ["Ham", "Spam"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión - LSTM")
plt.show()
