<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ======================
# 1. INSTALL DEPENDENCIES
# ======================
!pip install scikit-learn tensorflow -q

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ======================
# 2. DEVICE (GPU OR CPU)
# ======================
# Use the CUDA GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ======================
# 3. LOAD & PREPARE DATA
# ======================
# Make sure /content/spam.csv exists in Colab
df = pd.read_csv("/content/spam.csv", encoding="cp1252")

# keep only label/text cols
df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

# map ham/spam -> 0/1
label_map = {"ham": 0, "spam": 1}
df["label_id"] = df["label"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map.
# Fail here with a readable message instead of letting NaN labels reach the
# stratified split or the integer tensor conversion further down.
unmapped_rows = df["label_id"].isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected {sorted(label_map)}): {unexpected}"
    )

texts = df["text"].tolist()
labels = df["label_id"].tolist()

# split dataset: 80/20, stratified so both splits keep the ham/spam ratio,
# with a fixed seed so the split is reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"Train size: {len(train_texts)} | Test size: {len(test_texts)}")
print(df["label"].value_counts(), "\n")

# ======================
# 4. TOKENIZE & PAD TEXT
# ======================
max_words = 10000   # upper bound on the number of distinct token ids
max_len = 50        # every message is cut or padded to this many tokens

# The vocabulary is learned from the training split only; words not covered
# by it are encoded as the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split) for split in (train_texts, test_texts)
)

# Pad and truncate at the end of each sequence, identically for both splits.
pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
X_train = pad_sequences(train_sequences, **pad_kwargs)
X_test = pad_sequences(test_sequences, **pad_kwargs)

y_train, y_test = (
    torch.tensor(ids, dtype=torch.long) for ids in (train_labels, test_labels)
)

# +1 leaves room for id 0, which word_index never assigns to a word.
vocab_size = min(max_words, 1 + len(tokenizer.word_index))
print("Vocab size:", vocab_size)

# ======================
# 5. PYTORCH DATASET / DATALOADER
# ======================
class SpamDataset(Dataset):
    """Map-style dataset pairing token-id sequences with their labels.

    Args:
        X: Array-like of token ids, one row per message. It is copied into
            a ``torch.long`` tensor.
        y: Labels, one per row of ``X``. A tensor is kept as is; any other
            sequence (e.g. a list of ints) is converted with
            ``torch.as_tensor`` so indexing always returns a tensor.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        # Normalise labels to a tensor; the dtype of an existing tensor is
        # left untouched so callers keep control over the label type.
        self.y = torch.as_tensor(y)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of (sequence, label) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split in a dataset, then batch it 32 rows at a time.
train_dataset, test_dataset = (
    SpamDataset(X_train, y_train),
    SpamDataset(X_test, y_test),
)

# Only the training split is shuffled; the test split keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test  batches: {len(test_loader)}\n")

# ======================
# 6. CNN MODEL (TRAINABLE EMBEDDINGS)
# ======================
class SpamCNN(nn.Module):
    """Multi-branch 1-D CNN over trainable word embeddings (ham vs. spam).

    Each branch convolves the embedded sequence with a different kernel
    width, applies ReLU and global max pooling over the sequence axis; the
    pooled branch outputs are concatenated, passed through dropout and
    projected to two logits.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        embed_dim: Size of each embedding vector.
        num_filters: Output channels of every convolution branch.
        filter_sizes: Iterable of kernel widths, one branch per entry.
        dropout: Dropout probability applied before the final linear layer.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embed_dim=100, num_filters=100, filter_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()

        # Freeze the sizes into a tuple: the default is no longer a shared
        # mutable list and one-shot iterables are consumed exactly once.
        filter_sizes = tuple(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")

        # Shortest sequence that every convolution branch can process.
        self.min_seq_len = max(filter_sizes)

        # trainable embedding (random init)
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # multiple convolution branches with different n-gram sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=num_filters,
                kernel_size=k
            )
            for k in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), 2)  # ham/spam

    def forward(self, x):
        """Return logits of shape ``[batch, 2]`` for token ids ``[batch, seq_len]``."""
        # Conv1d raises when the sequence is shorter than its kernel, so
        # right-pad short inputs with id 0 up to the widest kernel.
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            x = nn.functional.pad(x, (0, missing), value=0)

        x = self.embedding(x)          # [batch, seq_len, embed_dim]
        x = x.permute(0, 2, 1)         # [batch, embed_dim, seq_len] for Conv1d

        conv_outs = []
        for conv in self.convs:
            c = conv(x)                # [batch, num_filters, L']
            c = torch.relu(c)
            c = torch.max(c, dim=2).values  # global max pool -> [batch, num_filters]
            conv_outs.append(c)

        x = torch.cat(conv_outs, dim=1)      # [batch, num_filters * len(filter_sizes)]
        x = self.dropout(x)
        logits = self.fc(x)                  # [batch, 2]
        return logits

# Build the network with default hyper-parameters and move it to the selected
# device before the optimizer collects its parameters.
model = SpamCNN(vocab_size=vocab_size)
model = model.to(device)

# ======================
# 7. TRAINING SETUP
# ======================
epochs = 5  # lower to 3 or 2 if needed for speed
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# ======================
# 8. TRAIN LOOP
# ======================
for epoch in range(epochs):
    model.train()
    total_loss = 0.0   # sum of per-sample losses seen this epoch
    seen_samples = 0

    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)          # [batch, 2]
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion reports a per-batch mean. Weight it by the batch size
        # so a shorter final batch does not count as much as a full one.
        n_in_batch = y_batch.size(0)
        total_loss += loss.item() * n_in_batch
        seen_samples += n_in_batch

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = total_loss / max(seen_samples, 1)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_loss:.4f}")

# ======================
# 9. EVALUATION
# ======================
# Inference mode: dropout disabled, no gradient bookkeeping.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        batch_logits = model(X_batch.to(device))
        batch_preds = batch_logits.argmax(dim=1)

        # Predictions are brought back to the CPU; y_batch is not moved to
        # `device` in this loop.
        y_pred.extend(batch_preds.cpu().numpy())
        y_true.extend(y_batch.numpy())

acc = accuracy_score(y_true, y_pred)
print("\n================ RESULTS ================")
print(f"Test Accuracy: {acc:.4f}\n")
print("Classification Report:")
report = classification_report(y_true, y_pred, target_names=["Ham", "Spam"])
print(report)

# ======================
# 10. PREDICTION FUNCTION
# ======================
def predict_message(model, tokenizer, text, max_len=50):
    """Classify a single message and return a human-readable verdict.

    Args:
        model: Classifier returning ``[batch, 2]`` logits for a batch of
            token-id sequences. It is switched to eval mode and left there.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        text: The raw message to classify.
        max_len: Length the token sequence is padded/truncated to. It should
            match the length used to prepare the training data.

    Returns:
        ``"SPAM 🚨"`` when class 1 has the larger logit, otherwise ``"HAM ✅"``.
    """
    model.eval()

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

    # Send the input to the device the model's weights live on, so the call
    # works for any model passed in. A model without parameters falls back to
    # the module-level `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    tensor = torch.tensor(padded, dtype=torch.long).to(target_device)

    with torch.no_grad():
        logits = model(tensor)                  # [1, 2]
        pred = torch.argmax(logits, dim=1).item()

    return "SPAM 🚨" if pred == 1 else "HAM ✅"

# Smoke test: run a handful of hand-written messages through the trained model.
samples = [
    "WIN a brand new car now!",
    "Can we meet tomorrow for coffee?",
    "URGENT! Your account was hacked.",
    "Ok I'm home, text me when you arrive.",
    "You have won $5000 cash. Call now to receive your reward.",
]

print("\n================ EXAMPLES ================")
for msg in samples:
    print(f"Message: {msg}")
    verdict = predict_message(model, tokenizer, msg)
    # end="\n\n" leaves one blank line between consecutive examples
    print("Prediction:", verdict, end="\n\n")


Using device: cpu
Train size: 4457 | Test size: 1115
label
ham     4825
spam     747
Name: count, dtype: int64 

Vocab size: 7920
Train batches: 140
Test  batches: 35



Epoch 1/5: 100%|██████████| 140/140 [00:07<00:00, 18.17it/s]


Epoch 1/5 - Train Loss: 0.2558


Epoch 2/5: 100%|██████████| 140/140 [00:08<00:00, 16.98it/s]


Epoch 2/5 - Train Loss: 0.0724


Epoch 3/5: 100%|██████████| 140/140 [00:08<00:00, 16.38it/s]


Epoch 3/5 - Train Loss: 0.0385


Epoch 4/5: 100%|██████████| 140/140 [00:07<00:00, 18.90it/s]


Epoch 4/5 - Train Loss: 0.0194


Epoch 5/5: 100%|██████████| 140/140 [00:08<00:00, 16.95it/s]


Epoch 5/5 - Train Loss: 0.0167

Test Accuracy: 0.9812

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       966
        Spam       0.94      0.91      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Message: WIN a brand new car now!
Prediction: HAM ✅

Message: Can we meet tomorrow for coffee?
Prediction: HAM ✅

Message: URGENT! Your account was hacked.
Prediction: HAM ✅

Message: Ok I'm home, text me when you arrive.
Prediction: HAM ✅

Message: You have won $5000 cash. Call now to receive your reward.
Prediction: SPAM 🚨

