In [15]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import DataLoader  # Import
# DataLoader is used in the training cell to batch the train/validation datasets
from torch.utils.data import DataLoader  # noqa

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib
from tqdm.notebook import tqdm as notebook_tqdm


In [16]:
# Load data
# NOTE: machine-specific absolute path; adjust when running on another machine.
DATA_PATH = r"C:\Users\manik\Documents\Spam email detection\spam_ham_dataset.csv"

# Read the CSV, normalize the column names, keep only the label/text columns,
# and encode the string labels as integers (spam -> 1, ham -> 0).
df = (
    pd.read_csv(DATA_PATH, encoding="latin-1")
    .rename(columns={"v1": "label", "v2": "text"})
    .loc[:, ["label", "text"]]
    .assign(label=lambda frame: frame["label"].map({"spam": 1, "ham": 0}))
)


In [17]:
# Split data
# Clean BEFORE splitting: dropping duplicates/nulls after the split has no effect
# on the train/validation sets, and a duplicated email could otherwise end up in
# both of them (leaking validation examples into training).
df_clean = (
    df.drop_duplicates()
    .dropna()
    # Labels not covered by the spam/ham mapping become NaN and turn the column
    # into floats; once those rows are dropped, restore an integer dtype.
    .astype({"label": "int64"})
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_clean["text"], df_clean["label"], test_size=0.2, random_state=42
)

In [18]:
# Data cleaning: drop exact duplicate rows, then rows containing any null value.
# NOTE: this cell runs after the train/validation split cell, so it only affects
# later uses of `df`, not the splits that were already computed.
df = df.drop_duplicates().dropna()


In [19]:
# Tokenize texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Same settings for both splits: truncate to at most 512 tokens and pad each
# split to a common length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(list(train_texts), **tokenize_kwargs)
val_encodings = tokenizer(list(val_texts), **tokenize_kwargs)


In [23]:
# Custom Dataset Class
class SpamDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to a
            sequence holding one entry per example.
        labels: Iterable with one label per example, in the same order as the
            encodings. Lists, arrays and pandas Series are all accepted.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialize as a list so ``self.labels[idx]`` is always positional.
        # Indexing a pandas Series with an int is a label-based lookup, which
        # raises KeyError (or returns the wrong row) once the index has been
        # shuffled, e.g. by train_test_split.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors, including ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [24]:
# Wrap each split's encodings and labels in a SpamDataset; the labels are turned
# into plain lists before being handed to the dataset.
train_dataset = SpamDataset(encodings=train_encodings, labels=list(train_labels))
val_dataset = SpamDataset(encodings=val_encodings, labels=list(val_labels))

In [25]:
# Load model
# Pretrained BERT encoder with a two-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Use the GPU when one is available; fall back to CPU instead of crashing on
# machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [28]:
# Training
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# NOTE: val_loader is built here but not consumed anywhere in this cell.
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3

# Send every batch to whatever device the model lives on rather than assuming
# CUDA, so the loop also runs on CPU-only machines.
device = next(model.parameters()).device

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} complete.")

# Persist the fine-tuned model and the tokenizer files to their own local
# directories so a later cell can reload them.
model.save_pretrained(save_directory="bert_spam_detector")
tokenizer.save_pretrained(save_directory="bert_tokenizer")

Epoch 1 complete.
Epoch 2 complete.
Epoch 3 complete.


('bert_tokenizer\\tokenizer_config.json',
 'bert_tokenizer\\special_tokens_map.json',
 'bert_tokenizer\\vocab.txt',
 'bert_tokenizer\\added_tokens.json')

In [36]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert_spam_detector")
tokenizer = BertTokenizer.from_pretrained("bert_tokenizer")
model.eval()  # Set the model to evaluation mode
# Use the GPU if available; otherwise stay on CPU instead of raising.
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Function to predict spam or ham
def predict_spam(email_text):
    """Classify a single email as ``"Spam"`` or ``"Ham"``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        email_text: The email text to classify (one string).

    Returns:
        ``"Spam"`` if the predicted class index is 1, otherwise ``"Ham"``.
    """
    # Run on whatever device the model is on rather than assuming CUDA, so
    # inference also works on CPU-only machines.
    device = next(model.parameters()).device

    # Tokenize the email text (truncated to at most 512 tokens)
    inputs = tokenizer(
        email_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get model predictions; no gradients are needed for inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # .item() requires exactly one prediction, i.e. a single input text
    predicted_class = torch.argmax(logits, dim=1).item()

    return "Spam" if predicted_class == 1 else "Ham"

# Sample email to classify
example_email = (
    "Subject: hpl nom for january 9 , 2001 ( see attached file : hplnol 09 . xls )- hplnol 09 . xls"
)
# Run the classifier on the sample and report its verdict
result = predict_spam(example_email)
print(f"Prediction: {result}")

Prediction: Ham
