<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/Glove_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ======================
# 1. INSTALL DEPENDENCIES
# ======================
!pip install --upgrade gensim -q
!pip install scikit-learn tensorflow -q

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import gensim.downloader as api
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ======================
# 2. DEVICE (GPU OR CPU)
# ======================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ======================
# 3. LOAD & PREPARE DATA
# ======================
# Make sure spam.csv is uploaded to /content/spam.csv in Colab
df = pd.read_csv("/content/spam.csv", encoding="cp1252")

# keep only label/text columns and rename
df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

# map ham/spam -> 0/1
label_map = {"ham": 0, "spam": 1}
df["label_id"] = df["label"].map(label_map)

texts = df["text"].tolist()          # list of SMS strings
labels = df["label_id"].tolist()     # list of 0/1

# split train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"Train size: {len(train_texts)} | Test size: {len(test_texts)}")
print(df["label"].value_counts(), "\n")

# ======================
# 4. LOAD GloVe
# ======================
print("Loading GloVe embeddings (100d)... this may take a moment.")
glove = api.load("glove-wiki-gigaword-100")  # each word -> 100-dim vector

# ======================
# 5. TOKENIZE & PAD TEXT
# ======================
max_words = 10000   # only keep top 10k words
max_len = 50        # cut / pad each message to 50 tokens

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences  = tokenizer.texts_to_sequences(test_texts)

X_train = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
X_test  = pad_sequences(test_sequences,  maxlen=max_len, padding='post', truncating='post')

y_train = torch.tensor(train_labels, dtype=torch.long)
y_test  = torch.tensor(test_labels,  dtype=torch.long)

# ======================
# 6. BUILD EMBEDDING MATRIX
# ======================
embedding_dim = 100  # because glove-wiki-gigaword-100 is 100d
word_index = tokenizer.word_index  # word -> index
num_words = min(max_words, len(word_index) + 1)

# matrix shape: (vocab_size, embedding_dim)
embedding_matrix = np.zeros((num_words, embedding_dim), dtype=np.float32)

for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in glove:  # if this word exists in GloVe vocab
        embedding_matrix[i] = glove[word]
    # else it stays as zeros (unknown / OOV)

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

# ======================
# 7. PYTORCH DATASET + DATALOADER
# ======================
class SpamDataset(Dataset):
    def __init__(self, X, y):
        # X is numpy -> convert to torch LongTensor
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = y  # already a torch tensor
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = SpamDataset(X_train, y_train)
test_dataset  = SpamDataset(X_test,  y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches : {len(test_loader)}\n")

# ======================
# 8. MODEL (EMBED + LSTM + FC)
# ======================
class SpamClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=64):
        super(SpamClassifier, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape

        # Embedding layer initialized with GloVe
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix)
        self.embedding.weight.requires_grad = False  # freeze GloVe weights

        # LSTM reads the sequence of embeddings
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True
        )

        # Final linear layer -> 2 classes (ham, spam)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        # x: [batch, max_len] of token IDs
        x = self.embedding(x)         # -> [batch, max_len, embed_dim]
        _, (hidden, _) = self.lstm(x) # hidden: [1, batch, hidden_dim]
        hidden_last = hidden[-1]      # [batch, hidden_dim]
        out = self.fc(hidden_last)    # [batch, 2]
        return out

model = SpamClassifier(embedding_matrix).to(device)

# ======================
# 9. TRAINING SETUP
# ======================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 5  # you can lower to 3 or 2 if it's slow

# ======================
# 10. TRAIN LOOP
# ======================
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)          # [batch, 2]
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_loss:.4f}")

# ======================
# 11. EVALUATION
# ======================
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)

        outputs = model(X_batch)              # [batch, 2]
        preds = torch.argmax(outputs, dim=1)  # [batch]

        y_true.extend(y_batch.numpy())
        y_pred.extend(preds.cpu().numpy())

# metrics
acc = accuracy_score(y_true, y_pred)
print("\n================ RESULTS ================")
print(f"Test Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Ham", "Spam"]))

# ======================
# 12. PREDICTION FUNCTION
# ======================
def predict_message(model, tokenizer, text, max_len=50):
    model.eval()

    # convert text -> tokens -> padded sequence
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

    tensor = torch.tensor(padded, dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(tensor)                 # [1, 2]
        pred = torch.argmax(output, dim=1).item()

    return "SPAM 🚨" if pred == 1 else "HAM ✅"

# quick sanity check on some messages
samples = [
    "WIN a brand new car now!",
    "Can we meet tomorrow for coffee?",
    "URGENT! Your account was hacked.",
    "Ok I'm home, text me when you arrive.",
    "You have won $5000 cash. Call now to receive your reward."
]

print("\n================ EXAMPLES ================")
for msg in samples:
    print(f"Message: {msg}")
    print("Prediction:", predict_message(model, tokenizer, msg))
    print()


Using device: cpu
Train size: 4457 | Test size: 1115
label
ham     4825
spam     747
Name: count, dtype: int64 

Loading GloVe embeddings (100d)... this may take a moment.
Train batches: 140
Test batches : 35



Epoch 1/5: 100%|██████████| 140/140 [00:02<00:00, 54.31it/s]


Epoch 1/5 - Train Loss: 0.4383


Epoch 2/5: 100%|██████████| 140/140 [00:02<00:00, 60.63it/s]


Epoch 2/5 - Train Loss: 0.1967


Epoch 3/5: 100%|██████████| 140/140 [00:02<00:00, 60.14it/s]


Epoch 3/5 - Train Loss: 0.1040


Epoch 4/5: 100%|██████████| 140/140 [00:03<00:00, 43.42it/s]


Epoch 4/5 - Train Loss: 0.0938


Epoch 5/5: 100%|██████████| 140/140 [00:02<00:00, 59.82it/s]


Epoch 5/5 - Train Loss: 0.0790

Test Accuracy: 0.9704

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.98      0.98       966
        Spam       0.87      0.91      0.89       149

    accuracy                           0.97      1115
   macro avg       0.93      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Message: WIN a brand new car now!
Prediction: HAM ✅

Message: Can we meet tomorrow for coffee?
Prediction: HAM ✅

Message: URGENT! Your account was hacked.
Prediction: HAM ✅

Message: Ok I'm home, text me when you arrive.
Prediction: HAM ✅

Message: You have won $5000 cash. Call now to receive your reward.
Prediction: SPAM 🚨

