# Library Import

In [6]:
pip install openpyxl xlrd

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: xlrd, et-xmlfile, openpyxl

   ---------------------------------------- 0/3 [xlrd]
   ---------------------------------------- 0/3 [xlrd]
   ---------------------------------------- 0/3 [xlrd]
   ------------- -------------------------- 1/3 [et-xmlfile]
   -------------------------- ------------- 2/3 [openpyxl]
   -------------------------- ------------- 2/3 [openpyxl]
   -------------------------- ------------- 2/3 [openpyxl]
   -------------------------- ------------- 2/3 [openpyxl]
   -------------------------- ------------- 

In [9]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

# Dataset load

In [11]:
# Location of the labelled CSV. Kept in one named constant so that pointing the
# notebook at another copy of the data is a single-line change.
DATA_PATH = r"C:\Users\ADMIN\Downloads\spam_small.csv"

df = pd.read_csv(DATA_PATH)

# The split cell reads df["text"] and df["label"]; check for them right after
# loading so a wrong file fails here with a clear message instead of later on.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# Train/test split

In [12]:
def _spam_flag(raw_label):
    """Return 1 when the label is exactly the string "spam", otherwise 0."""
    return 1 if raw_label == "spam" else 0


# Hold out 20% of the rows for testing; random_state fixes the shuffle.
texts = df["text"]
raw_labels = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    raw_labels,
    test_size=0.2,
    random_state=42,
)

# Encode labels as integers (spam=1, anything else=0).
y_train = y_train.map(_spam_flag)
y_test = y_test.map(_spam_flag)

# Load BERT tokenizer and model

In [13]:
# Checkpoint name and local cache directory shared by the tokenizer and the encoder.
BERT_CHECKPOINT = "bert-base-uncased"
BERT_CACHE_DIR = "./bert_cache"

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, cache_dir=BERT_CACHE_DIR)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT, cache_dir=BERT_CACHE_DIR)

# Preprocessing

In [14]:
BATCH_SIZE = 16


def _tokenize(texts):
    """Tokenize `texts` with truncation, padding, max_length=128 and PyTorch tensors."""
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )


def _to_dataset(encodings, labels):
    """Bundle input ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)


# Tokenize each split separately.
train_encodings = _tokenize(X_train)
test_encodings = _tokenize(X_test)

# Integer labels as tensors.
train_labels = torch.tensor(y_train.tolist())
test_labels = torch.tensor(y_test.tolist())

# Datasets and loaders; only the training loader shuffles.
train_dataset = _to_dataset(train_encodings, train_labels)
test_dataset = _to_dataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [15]:
class SpamClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments, must return an object that
            has a ``pooler_output`` attribute, and must expose
            ``config.hidden_size`` (the width of that pooled output).
        num_classes: Number of output logits. Defaults to 2 (spam or ham).
        dropout: Dropout probability applied to the pooled output. Defaults
            to 0.3; the original author noted 0.1 to 0.2 as values to try.
    """

    def __init__(self, bert_model, num_classes=2, dropout=0.3):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so an encoder with a different hidden width still works.
        self.fc = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification head's output for a batch.

        Args:
            input_ids: Passed to the encoder as ``input_ids``.
            attention_mask: Passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head applied to the (dropped-out)
            ``pooler_output`` of the encoder; no softmax is applied here.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        return self.fc(x)

# Wrap the pretrained encoder loaded earlier in the classification model.
model = SpamClassifier(bert_model=bert_model)

In [16]:
# Training setup: loss function and optimizer
# --------------------------------------------
LEARNING_RATE = 2e-5

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [17]:
# Training loop

In [None]:
def _run_training_epoch(net, batches, step_optimizer, loss_fn, target_device):
    """Do one optimisation pass over `batches` and return the summed batch losses."""
    net.train()
    running_loss = 0
    for batch in batches:
        # Each batch holds (input ids, attention mask, labels); move all three to the device.
        input_ids, attention_mask, targets = (t.to(target_device) for t in batch)

        step_optimizer.zero_grad()
        logits = net(input_ids, attention_mask)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        step_optimizer.step()

        running_loss += batch_loss.item()
    return running_loss


epochs = 4
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    total_loss = _run_training_epoch(model, train_loader, optimizer, criterion, device)
    # Report the mean loss per batch for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


In [15]:
# Single source of truth for the checkpoint location, so the confirmation
# message cannot disagree with where the weights were actually written.
MODEL_PATH = r"C:\Users\ADMIN\Downloads\spam_model.pth"

# Only the model's state_dict (its parameters and buffers) is written.
torch.save(model.state_dict(), MODEL_PATH)
print(f"✅ Model saved at {MODEL_PATH}")


✅ Model saved at C:\Users\ADMIN\Downloads\spam_model.pth


In [None]:
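# TODO / next steps:
# - Increase the dataset size.
# - Use a spam/ham ratio of 70/30.
# - Lower the dropout rate to 0.1 or 0.2.
# - Try training for 4 and 5 epochs.
# - Add a softmax function.
# - Stage 1: keyword-based part.
# - Stage 2: BERT part.
# - Test cases: 10 ham and 10 spam messages.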