In [17]:
from google.colab import files
files.upload()  # Sube aquí el archivo kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"manugal","key":"5cce7794a07ebfdd1fc8ec1df76bc29c"}'}

In [18]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [19]:
import kagglehub

# Descargar el dataset desde Kaggle
path = kagglehub.dataset_download("mariumfaheem666/spam-sms-classification-using-nlp")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mariumfaheem666/spam-sms-classification-using-nlp/versions/1


In [20]:
!kaggle datasets download -d mariumfaheem666/spam-sms-classification-using-nlp -p /content
!unzip /content/spam-sms-classification-using-nlp.zip -d /content

Dataset URL: https://www.kaggle.com/datasets/mariumfaheem666/spam-sms-classification-using-nlp
License(s): apache-2.0
Downloading spam-sms-classification-using-nlp.zip to /content
  0% 0.00/209k [00:00<?, ?B/s]
100% 209k/209k [00:00<00:00, 111MB/s]
Archive:  /content/spam-sms-classification-using-nlp.zip
  inflating: /content/Spam_SMS.csv   


In [21]:
import pandas as pd

# Cargar el archivo CSV con el ajuste de codificación y saltando la primera columna problemática
file_path = '/content/Spam_SMS.csv'  # Asegúrate de usar la ruta correcta
df = pd.read_csv(file_path, encoding='latin-1')

# Renombrar las columnas para que se vean correctamente
df.columns = ["Class", "Message"]

# Visualizar las primeras filas para confirmar
print(df.head())

  Class                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 2. Preprocesamiento de los datos

# Codificar las etiquetas (spam y ham)
label_encoder = LabelEncoder()
df['Class'] = label_encoder.fit_transform(df['Class'])

# Dividir el conjunto de datos
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
from transformers import BertTokenizer, BertModel

# 3. Preparar el tokenizador

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [24]:
from torch.utils.data import Dataset

# 4. Crear el Dataset personalizado

class CustomDataset(Dataset):
    def __init__(self, messages, labels, tokenizer, max_len):
        self.messages = messages
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, index):
        message = self.messages[index]
        label = self.labels[index]

        # Tokenizar y rellenar (padding)
        encoding = self.tokenizer.encode_plus(
            message,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def collate_fn(batch):
    return {
        'input_ids': torch.stack([item['input_ids'] for item in batch]),
        'attention_mask': torch.stack([item['attention_mask'] for item in batch]),
        'labels': torch.stack([item['labels'] for item in batch])
    }

In [25]:
from torch.utils.data import DataLoader
# 5. Definir los parámetros
max_len = 128  # Tamaño reducido
batch_size = 32  # Ajusta según tu GPU

# Crear los DataLoader
train_dataset = CustomDataset(train_df['Message'].to_numpy(), train_df['Class'].to_numpy(), tokenizer, max_len)
test_dataset = CustomDataset(test_df['Message'].to_numpy(), test_df['Class'].to_numpy(), tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [26]:
import torch.nn as nn

# 6. Definir el modelo

class SpamClassifier(nn.Module):
    def __init__(self):
        super(SpamClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)  # 2 clases: spam y ham

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # Salida de la capa [CLS]
        pooled_output = self.dropout(pooled_output)
        return self.fc(pooled_output)

In [27]:
import torch
import torch.optim as optim

# 7. Entrenamiento del modelo

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpamClassifier().to(device)

# Definir la función de pérdida y el optimizador con una tasa de aprendizaje más alta
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

# Entrenamiento con Mixed Precision
scaler = torch.cuda.amp.GradScaler()  # Para mixed precision

# Funciones de entrenamiento y evaluación
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

    return correct_predictions.double() / n_examples, sum(losses) / len(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, sum(losses) / len(losses)


  scaler = torch.cuda.amp.GradScaler()  # Para mixed precision


In [28]:
# 8. Entrenamiento
EPOCHS = 5  # Puedes ajustar el número de épocas
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, len(train_dataset)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model, test_loader, loss_fn, device, len(test_dataset)
    )

    print(f'Validation loss {val_loss} accuracy {val_acc}')


  with torch.cuda.amp.autocast():


Epoch 1/5
----------
Train loss 0.11199841009719032 accuracy 0.9701726844583988
Validation loss 0.06607605893430965 accuracy 0.9847533632286996
Epoch 2/5
----------
Train loss 0.08315862842968531 accuracy 0.9813859609777977
Validation loss 0.06607605893430965 accuracy 0.9847533632286996
Epoch 3/5
----------
Train loss 0.08242849494729723 accuracy 0.9816102265081857
Validation loss 0.06607605893430965 accuracy 0.9847533632286996
Epoch 4/5
----------
Train loss 0.08383608779736927 accuracy 0.9813859609777977
Validation loss 0.06607605893430965 accuracy 0.9847533632286996
Epoch 5/5
----------
Train loss 0.08313105191503252 accuracy 0.9816102265081857
Validation loss 0.06607605893430965 accuracy 0.9847533632286996


In [29]:
# 9. Guardar el modelo entrenado
model_save_path = "spam_classifier_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Modelo guardado en {model_save_path}")

Modelo guardado en spam_classifier_model.pth


In [30]:
# 10. Cargar el modelo para hacer predicciones
def load_model():
    model = SpamClassifier()
    model.load_state_dict(torch.load(model_save_path))
    model.to(device)  # Asegurar que el modelo esté en el dispositivo
    model.eval()
    return model

In [31]:
# 11. Hacer una predicción
def predict_message(message, model, tokenizer, device):
    # Tokenizar y rellenar (padding)
    encoding = tokenizer.encode_plus(
        message,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Hacer la predicción
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)

    # Convertir la predicción a etiqueta
    return 'ham' if predicted.item() == 0 else 'spam'

In [32]:
# 12. Ejemplo de uso
model = load_model()  # Cargar el modelo guardado
message = "Esta muy padre programar un modelo Transformer"
prediction = predict_message(message, model, tokenizer, device)
print(f"The message is classified as: {prediction}")

  model.load_state_dict(torch.load(model_save_path))


The message is classified as: ham
