<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===== INSTALLATION & IMPORTS =====
!pip install transformers datasets -q

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ===== DEVICE (GPU OR CPU) =====
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ===== LOAD DATA (YOUR SPAM DATASET) =====
# Make sure /content/spam.csv exists in Colab's Files panel
df = pd.read_csv("/content/spam.csv", encoding="cp1252")

# Column "v1" holds the label ("ham" / "spam") and "v2" the message text;
# keep only those two columns under clearer names.
df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

# Map labels to numbers: ham -> 0, spam -> 1
label_map = {"ham": 0, "spam": 1}
df["label_id"] = df["label"].map(label_map)

# Series.map() produces NaN for every value that is not a key of label_map
# (typos, different casing, missing cells). Such NaNs would otherwise travel
# silently into the stratified split and the label tensors, so stop here with
# a message that names the offending values.
unmapped = df["label_id"].isna()
if unmapped.any():
    unknown_labels = df.loc[unmapped, "label"].unique().tolist()
    raise ValueError(
        f"Found {int(unmapped.sum())} row(s) whose label is not one of "
        f"{sorted(label_map)}: {unknown_labels}"
    )

texts = df["text"].tolist()
labels = df["label_id"].tolist()

# Train/test split: 80/20, stratified on the label so both splits keep the
# same class proportions; fixed seed for reproducibility.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print("Dataset loaded:")
print(f"Train size: {len(train_texts)}")
print(f"Test size : {len(test_texts)}")
print(df["label"].value_counts(), "\n")

# ===== BERT DATASET CLASS =====
class SpamBertDataset(Dataset):
    """Dataset that tokenizes one message per lookup.

    Each item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels'. Tokenization happens lazily inside ``__getitem__``.

    Args:
        texts: Indexable collection of messages; entries are passed through
            ``str()`` before tokenization.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` whose
            result exposes 'input_ids' and 'attention_mask' tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The number of texts defines the dataset size.
        return len(self.texts)

    def __getitem__(self, idx):
        message = str(self.texts[idx])
        target = self.labels[idx]

        # Pad/truncate every message to max_len so all items share one length.
        encoded = self.tokenizer(
            message,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) removes the leading dimension of the tokenizer output
        # (presumably the size-1 batch axis added by return_tensors='pt').
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# ===== TOKENIZER =====
# Shared settings, named once so the tokenizer and the model always load the
# same checkpoint and the train/test pipelines use identical sizes.
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 64          # value passed to the datasets as max_len
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_dataset = SpamBertDataset(train_texts, train_labels, tokenizer, max_len=MAX_LEN)
test_dataset  = SpamBertDataset(test_texts,  test_labels,  tokenizer, max_len=MAX_LEN)

# Only the training data is shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches : {len(test_loader)}\n")

# ===== MODEL SETUP =====
# Two output labels: ham (0) and spam (1).
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# ===== TRAINING FUNCTION =====
def train_epoch(model, loader, optimizer, device):
    """Run one training pass over ``loader`` and report loss and accuracy.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        loader: Iterable of batches; each batch is a dict with the tensors
            'input_ids', 'attention_mask' and 'labels'.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy)`` computed over all samples seen.
        Both values are NaN when the loader yields no samples, instead of
        raising ``ZeroDivisionError``.
    """
    model.train()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    for batch in tqdm(loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one.
        # NOTE(review): assumes outputs.loss is the mean over the batch
        # (the usual cross-entropy default) - confirm for custom models.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += batch_size

    if total == 0:
        # Nothing was processed: the metrics are undefined.
        return float("nan"), float("nan")

    avg_loss = loss_sum / total
    accuracy = correct / total
    return avg_loss, accuracy

# ===== EVALUATION FUNCTION =====
def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` without updating any weights.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        loader: Iterable of batches; each batch is a dict with the tensors
            'input_ids', 'attention_mask' and 'labels'.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy)`` computed over all samples seen.
        Both values are NaN when the loader yields no samples, instead of
        raising ``ZeroDivisionError``.
    """
    model.eval()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            # Weight each batch by its size so a smaller final batch does
            # not count as much as a full one.
            # NOTE(review): assumes outputs.loss is the mean over the batch
            # (the usual cross-entropy default) - confirm for custom models.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += batch_size

    if total == 0:
        # Nothing was processed: the metrics are undefined.
        return float("nan"), float("nan")

    avg_loss = loss_sum / total
    accuracy = correct / total
    return avg_loss, accuracy

# ===== TRAIN MODEL =====
epochs = 2  # reduced for faster training

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # One training pass over the train split, then a scoring pass over the
    # test split, so both sets of metrics are reported after every epoch.
    train_loss, train_acc = train_epoch(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        device=device,
    )
    test_loss, test_acc = evaluate(
        model=model,
        loader=test_loader,
        device=device,
    )

    print(
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}",
        f"Test  Loss: {test_loss:.4f}, Test  Acc: {test_acc:.4f}",
        sep="\n",
    )

# ===== PREDICTION FUNCTION =====
def predict_message(text, model, tokenizer, device, max_len=64):
    """Classify one message and return ``(label, confidence)``.

    Args:
        text: Message to classify; passed to the tokenizer as-is.
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` whose
            result exposes 'input_ids' and 'attention_mask' tensors.
        device: Device the encoded tensors are moved to.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A tuple of the display label ("SPAM ..." when class 1 is predicted,
        "HAM ..." otherwise) and the softmax probability of that class.
    """
    model.eval()  # put the model in evaluation mode before inference

    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    model_inputs = {
        name: encoded[name].to(device)
        for name in ('input_ids', 'attention_mask')
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        probs = logits.softmax(dim=1)
        pred_class = probs.argmax(dim=1).item()
        confidence = probs[0, pred_class].item()

    if pred_class == 1:
        return "SPAM 🚨", confidence
    return "HAM ✅", confidence

# ===== TEST SAMPLE MESSAGES =====
# Hand-written examples used as a quick sanity check of the trained model.
test_messages = [
    "WIN a brand new iPhone! Reply YES to claim your prize now!!!",
    "Hey are we still meeting at 7 or should I come later?",
    "URGENT! Your account is compromised. Click this link immediately to verify.",
    "Ok I'm home, text me when you arrive.",
    "You have won $5000 cash. Call now to receive your reward.",
]

# Classify each sample and print the message followed by its prediction.
for msg in test_messages:
    pred_label, conf = predict_message(
        text=msg,
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    print("\nMessage:", msg)
    print(f"Prediction: {pred_label}  (confidence: {conf:.2%})")
