# load and check the dataset

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import pandas as pd 
import sklearn
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


In [3]:
df = pd.read_csv(r"C:\Users\ADMIN\Downloads\spam_ham_dataset\spam_ham_cleaned.csv")
#print(df['label'].value_counts())

# Text Processing

In [4]:
#manual processing

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size = 0.2, random_state = 42)

In [6]:
# Tokenization
# df = pd.read_csv(r"C:\Users\ADMIN\Downloads\spam_ham_dataset\spam_ham_cleaned.csv")
# df['tokens'] = df['text'].apply(lambda x: word_tokenize(str(x))) #tokens create new coloumn in dataset and lambda function says to convert each to string
# #print(df[['text' , 'tokens']].head())

In [7]:
# #stop words removal
# stop_words = set(stopwords.words('english'))
# df['stop_no_tokens'] = df['tokens'].apply(lambda words: [word for word in words if word not in stop_words])
# #print(df[['tokens' , 'stop_no_tokens']].head())

In [8]:
# # Lemmatization
# lemmatizer = WordNetLemmatizer()
# df['tokens_lemmatized'] = df['stop_no_tokens'].apply(lambda words: [lemmatizer.lemmatize(word) for word in words])
# print(df[['stop_no_tokens','tokens_lemmatized']].head())

In [9]:
#pos tagging
# lemmatizer = WordNetLemmatizer()
# def get_wordnet_pos(word):
#     tag = pos_tag([word])[0][1][0].upper()
#     tag_dict = {
#         "J": wordnet.ADJ,
#         "N": wordnet.NOUN,
#         "V": wordnet.VERB,
#         "R": wordnet.ADV
#     }
#     return tag_dict.get(tag, wordnet.NOUN)

# df['tokens_lemmatized'] = df['stop_no_tokens'].apply(
#     lambda words: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
# )
# print(df[['stop_no_tokens','tokens_lemmatized']].head(10))

# BERT Model

In [10]:
!pip install transformers sentencepiece torch  
#transformers is a hugging face library that provides pretrained models like bert
from transformers import BertTokenizer, BertModel 
import torch #pytorch library for deep learning
import torch.nn as nn #neural netwoek library
import torch.optim as optim #optimizer that updates weights during training
from torch.utils.data import DataLoader , TensorDataset 
from sklearn.metrics import accuracy_score #calculates how many predictions were correct.



In [11]:
# Convert labels from text to integers
y_train = y_train.map(lambda x: 1 if x == "spam" else 0)
y_test = y_test.map(lambda x: 1 if x == "spam" else 0)


In [None]:
from transformers import BertTokenizer, BertModel

# Load pretrained BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir="./bert_cache")
bert_model = BertModel.from_pretrained("bert-base-uncased", cache_dir="./bert_cache")

# Initialize your classifier with BERT backbone
model = SpamClassifier(bert_model)



In [None]:
train_encodings = tokenizer(list(x_train), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(list(x_test), truncation=True, padding=True, max_length=512, return_tensors="pt")

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train.values))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(y_test.values))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
class SpamClassifier(nn.Module):
    def __init__(self, bert_model):
        super(SpamClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, 2)  # Spam or Ham

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        return self.linear(x)



In [None]:
model = SpamClassifier(bert_model)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

for epoch in range(3):
    model.train()
    for batch in train_loader:
        ids, mask, labels = batch
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} done")


In [None]:
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        ids, mask, labels = batch
        outputs = model(ids, mask)
        predictions = torch.argmax(outputs, dim=1)
        preds.extend(predictions.tolist())
        true_labels.extend(labels.tolist())

print("Accuracy:", accuracy_score(true_labels, preds))


In [None]:
def predict_message(message, model, tokenizer):
    model.eval()
    # Tokenize the new input
    encoding = tokenizer(
        message,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    
    with torch.no_grad():
        outputs = model(encoding['input_ids'], encoding['attention_mask'])
        prediction = torch.argmax(outputs, dim=1).item()
    
    return "Spam" if prediction == 1 else "Ham"

# Example usage:
msg1 = "Congratulations! You won a free lottery ticket, claim now!"
msg2 = "Hey, are we still meeting for lunch tomorrow?"

print(msg1, "->", predict_message(msg1, model, tokenizer))
print(msg2, "->", predict_message(msg2, model, tokenizer))

In [None]:
# Save trained model
torch.save(model.state_dict(), "spam_model.pth")
print("Model saved as spam_model.pth")
