<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/BiLTSM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ======================
# 1. INSTALL DEPENDENCIES
# ======================
!pip install scikit-learn tensorflow -q

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ======================
# 2. DEVICE (GPU OR CPU)
# ======================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ======================
# 3. LOAD & PREPARE DATA
# ======================
# Make sure /content/spam.csv exists in Colab
df = pd.read_csv("/content/spam.csv", encoding="cp1252")

# Keep only the label and text columns
df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

# Map ham/spam -> 0/1
label_map = {"ham": 0, "spam": 1}
df["label_id"] = df["label"].map(label_map)

texts = df["text"].tolist()
labels = df["label_id"].tolist()

# Train/test split (same style as before for fairness)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"Train size: {len(train_texts)} | Test size: {len(test_texts)}")
print(df["label"].value_counts(), "\n")

# ======================
# 4. TOKENIZE & PAD TEXT
# ======================
max_words = 10000   # vocab size cap
max_len = 50        # message length cap

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences  = tokenizer.texts_to_sequences(test_texts)

X_train = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
X_test  = pad_sequences(test_sequences,  maxlen=max_len, padding='post', truncating='post')

y_train = torch.tensor(train_labels, dtype=torch.long)
y_test  = torch.tensor(test_labels,  dtype=torch.long)

vocab_size = min(max_words, len(tokenizer.word_index) + 1)
print("Vocab size:", vocab_size)

# ======================
# 5. DATASET / DATALOADER
# ======================
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = SpamDataset(X_train, y_train)
test_dataset  = SpamDataset(X_test,  y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test  batches: {len(test_loader)}\n")

# ======================
# 6. BiLSTM MODEL
# ======================
class SpamBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=64, num_layers=1, dropout=0.5):
        super(SpamBiLSTM, self).__init__()

        # Trainable embeddings (random init)
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.0 if num_layers == 1 else dropout
        )

        # Because it's bidirectional, hidden_dim * 2 comes out
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 2)  # 2 classes: ham/spam

    def forward(self, x):
        # x shape: [batch, max_len]
        x = self.embedding(x)  # [batch, max_len, embed_dim]

        # lstm_out: [batch, max_len, hidden_dim*2] because bidirectional=True
        # (h_n, c_n) we don't actually need all time steps for classification,
        # we can take the last hidden state from both directions.
        lstm_out, (h_n, c_n) = self.lstm(x)
        # h_n shape: [num_layers * 2, batch, hidden_dim]
        # take last layer's forward and backward hidden states and concat
        # forward is h_n[-2], backward is h_n[-1] when bidirectional
        if self.lstm.bidirectional:
            h_forward = h_n[-2]
            h_backward = h_n[-1]
            h_combined = torch.cat((h_forward, h_backward), dim=1)  # [batch, hidden_dim*2]
        else:
            h_combined = h_n[-1]  # [batch, hidden_dim]

        x = self.dropout(h_combined)
        logits = self.fc(x)  # [batch, 2]
        return logits

model = SpamBiLSTM(vocab_size=vocab_size).to(device)

# ======================
# 7. TRAINING SETUP
# ======================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 5  # you can set 3 if you want faster

# ======================
# 8. TRAIN LOOP
# ======================
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)        # [batch, 2]
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_loss:.4f}")

# ======================
# 9. EVALUATION
# ======================
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)

        logits = model(X_batch)              # [batch, 2]
        preds = torch.argmax(logits, dim=1)  # [batch]

        y_true.extend(y_batch.numpy())
        y_pred.extend(preds.cpu().numpy())

acc = accuracy_score(y_true, y_pred)
print("\n================ RESULTS ================")
print(f"Test Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Ham", "Spam"]))

# ======================
# 10. PREDICTION FUNCTION
# ======================
def predict_message(model, tokenizer, text, max_len=50):
    model.eval()

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

    tensor = torch.tensor(padded, dtype=torch.long).to(device)

    with torch.no_grad():
        logits = model(tensor)                  # [1, 2]
        pred = torch.argmax(logits, dim=1).item()

    return "SPAM 🚨" if pred == 1 else "HAM ✅"

# Try a few manual samples
samples = [
    "WIN a brand new car now!",
    "Can we meet tomorrow for coffee?",
    "URGENT! Your account was hacked.",
    "Ok I'm home, text me when you arrive.",
    "You have won $5000 cash. Call now to receive your reward."
]

print("\n================ EXAMPLES ================")
for msg in samples:
    print(f"Message: {msg}")
    print("Prediction:", predict_message(model, tokenizer, msg))
    print()


Using device: cpu
Train size: 4457 | Test size: 1115
label
ham     4825
spam     747
Name: count, dtype: int64 

Vocab size: 7920
Train batches: 140
Test  batches: 35



Epoch 1/5: 100%|██████████| 140/140 [00:08<00:00, 17.41it/s]


Epoch 1/5 - Train Loss: 0.2703


Epoch 2/5: 100%|██████████| 140/140 [00:06<00:00, 21.25it/s]


Epoch 2/5 - Train Loss: 0.1070


Epoch 3/5: 100%|██████████| 140/140 [00:07<00:00, 18.04it/s]


Epoch 3/5 - Train Loss: 0.0502


Epoch 4/5: 100%|██████████| 140/140 [00:06<00:00, 20.94it/s]


Epoch 4/5 - Train Loss: 0.0286


Epoch 5/5: 100%|██████████| 140/140 [00:07<00:00, 18.46it/s]


Epoch 5/5 - Train Loss: 0.0159

Test Accuracy: 0.9776

Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      0.99      0.99       966
        Spam       0.94      0.89      0.91       149

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Message: WIN a brand new car now!
Prediction: HAM ✅

Message: Can we meet tomorrow for coffee?
Prediction: HAM ✅

Message: URGENT! Your account was hacked.
Prediction: HAM ✅

Message: Ok I'm home, text me when you arrive.
Prediction: HAM ✅

Message: You have won $5000 cash. Call now to receive your reward.
Prediction: SPAM 🚨

