In [1]:
import pandas as pd
import numpy as np

In [2]:
PATH = "dataset.csv"
data = pd.read_csv(PATH, encoding='latin-1')

data = data.rename(columns={"v1": "Labels", "v2": "Email"})
data['Labels'] = data['Labels'].apply(lambda x: 0 if x == "ham" else 1)

# 0 : No Spam
# 1: spam
data.head()

Unnamed: 0,Labels,Email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data.head()

Unnamed: 0,Labels,Email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
len(data)

5572

In [5]:
import string
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer


## Apply the pre processing steps

In [6]:
# Apply the pre processing steps
stemmer = PorterStemmer()
corpus = []
sp_words = set(stopwords.words('english'))
nltk.download('stopwords')
for i in range(len(data)):
    emails = data['Email'].iloc[i].lower()
    emails = emails.translate(str.maketrans('', '', string.punctuation)).split()
    emails = [stemmer.stem(word) for word in emails if word not in sp_words]
    emails = ' '.join(emails)
    corpus.append(emails)

[nltk_data] Downloading package stopwords to C:\Users\Himanshu
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
corpus[0], len(corpus[0]), data['Email'].iloc[0], len(data['Email'].iloc[0])

('go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 76,
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 111)

## # Vectorize this corpus

In [8]:
# Vectorize this corpus
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
Y = data['Labels']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [9]:
import torch
import torch.nn as nn

## # To run this Model on GPU

In [10]:
# To run this Model on GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'CPU')
X_train = torch.tensor(X_train).type(torch.float).to(device)
X_test = torch.tensor(X_test).type(torch.float).to(device)
Y_train = torch.tensor(np.array(Y_train)).type(torch.float).to(device)
Y_test = torch.tensor(np.array(Y_test)).type(torch.float).to(device)

In [11]:
# For debugging
type(X_train[1].shape), X_test.shape, X_train[1]

(torch.Size,
 torch.Size([1115, 8038]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0'))

## Building Model

In [12]:
class MyModel(nn.Module):
    def __init__(self, input_features):
        super(MyModel, self).__init__()
        self.model = nn.Linear(input_features, 1)
    def forward(self, X):
        return torch.sigmoid(self.model(X))

# Initialize the model
torch.manual_seed(42)
input_features = X_train.shape[1]
model = MyModel(input_features).to(device)

In [13]:
# Loss function and optimizer
loss_fn = nn.BCELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [14]:
import torchmetrics
from torchmetrics import Accuracy
acc = Accuracy(task="binary", num_classes=2).to(device)

In [15]:
# Training Loop
epochs = 6000
for epoch in range(epochs):
    model.train()
    
    # Forward pass
    y_pred = model(X_train).squeeze()
    
    # Compute the loss
    loss = loss_fn(y_pred, Y_train)
    
    # Zero gradients, backward pass, optimizer step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [100/6000], Loss: 0.4231
Epoch [200/6000], Loss: 0.2959
Epoch [300/6000], Loss: 0.2243
Epoch [400/6000], Loss: 0.1783
Epoch [500/6000], Loss: 0.1464
Epoch [600/6000], Loss: 0.1230
Epoch [700/6000], Loss: 0.1052
Epoch [800/6000], Loss: 0.0913
Epoch [900/6000], Loss: 0.0801
Epoch [1000/6000], Loss: 0.0710
Epoch [1100/6000], Loss: 0.0634
Epoch [1200/6000], Loss: 0.0570
Epoch [1300/6000], Loss: 0.0516
Epoch [1400/6000], Loss: 0.0469
Epoch [1500/6000], Loss: 0.0428
Epoch [1600/6000], Loss: 0.0392
Epoch [1700/6000], Loss: 0.0361
Epoch [1800/6000], Loss: 0.0333
Epoch [1900/6000], Loss: 0.0308
Epoch [2000/6000], Loss: 0.0286
Epoch [2100/6000], Loss: 0.0266
Epoch [2200/6000], Loss: 0.0248
Epoch [2300/6000], Loss: 0.0231
Epoch [2400/6000], Loss: 0.0216
Epoch [2500/6000], Loss: 0.0203
Epoch [2600/6000], Loss: 0.0190
Epoch [2700/6000], Loss: 0.0179
Epoch [2800/6000], Loss: 0.0168
Epoch [2900/6000], Loss: 0.0158
Epoch [3000/6000], Loss: 0.0149
Epoch [3100/6000], Loss: 0.0141
Epoch [3200/6000]

In [16]:
# Step 6: Evaluate the model
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test).squeeze()
    y_test_pred = y_test_pred.round()  # Convert probabilities to binary predictions

    # Move tensor to CPU before converting to numpy
    y_test_pred_cpu = y_test_pred.cpu().numpy()

    # Assuming Y_test is also a PyTorch tensor, move it to CPU and convert to numpy
    Y_test_cpu = Y_test.cpu().numpy()

    # Calculate accuracy using numpy arrays
    accuracy = (y_test_pred_cpu == Y_test_cpu).mean()
    print(f"Test Accuracy: {accuracy * 100:.2f}%")



Test Accuracy: 98.03%


## Saving Model

In [17]:
import pickle

# Save model using PyTorch's save method
torch.save(model.state_dict(), 'model.pth')

# Save the vectorizer using pickle
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [20]:
from sklearn.preprocessing import LabelEncoder
from termcolor import colored
import emoji

while True:
    print(colored("                                                       Spam or Not Spam Detector 🌟", 'cyan', attrs=['bold']))
    print(colored("                                                   ====================================", 'magenta'))

    # Get user input (email text)
    message = input(colored("\nEnter the email text to check: ", 'yellow'))

    # Preprocess the message
    message = message.lower().translate(str.maketrans('', '', string.punctuation)).split()
    message = [stemmer.stem(word) for word in message if word not in sp_words]
    message = ' '.join(message)

    # Transform the message into the same format as your training data
    message = [message]
    X_message = vectorizer.transform(message)

    # Convert the sparse matrix to a dense matrix and then to a PyTorch tensor
    X_message = torch.tensor(X_message.toarray(), dtype=torch.float).to(device)

    # Set the model to evaluation mode
    model.eval()
    with torch.no_grad():
        # Make predictions
        y_test_pred = model(X_message).squeeze()
        y_test_pred = (y_test_pred >= 0.5).float()  # Convert probabilities to binary predictions

    # Decorative Output based on Prediction
    if y_test_pred.item() == 1:  # .item() converts tensor to a Python scalar
        print(colored("\n🚨 Provided Email is Spam! 🚨", 'red', attrs=['bold']))
        print(colored("We detected potentially harmful content! ⚠️", 'red'))
    else:
        print(colored("\n✅ Provided Email is Not Spam!", 'green', attrs=['bold']))
        print(colored("Your inbox is safe! 😊", 'green'))

    # Additional Fun Emojis for Enhanced User Experience
    print("\nThank you for using the Spam Detector! 🚀 😊")

    # Ask user if they want to continue or exit
    user_choice = input(colored("\nDo you want to check another email? (y/n): ", 'yellow')).lower()
    if user_choice != 'y':
        print(colored("\nExiting the Spam Detector. Goodbye! 👋", 'cyan'))
        break


[1m[36m                                                       Spam or Not Spam Detector 🌟[0m


[33m
Enter the email text to check: [0m Subject: Lose 20 pounds in 2 weeks! Body: Try our miracle weight loss pills now! Click here for a special discount.


[1m[32m
✅ Provided Email is Not Spam![0m
[32mYour inbox is safe! 😊[0m

Thank you for using the Spam Detector! 🚀 😊


[33m
Do you want to check another email? (y/n): [0m y


[1m[36m                                                       Spam or Not Spam Detector 🌟[0m


[33m
Enter the email text to check: [0m  Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam


[1m[32m
✅ Provided Email is Not Spam![0m
[32mYour inbox is safe! 😊[0m

Thank you for using the Spam Detector! 🚀 😊


[33m
Do you want to check another email? (y/n): [0m y


[1m[36m                                                       Spam or Not Spam Detector 🌟[0m


[33m
Enter the email text to check: [0m  Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam Spam


[1m[32m
✅ Provided Email is Not Spam![0m
[32mYour inbox is safe! 😊[0m

Thank you for using the Spam Detector! 🚀 😊


[33m
Do you want to check another email? (y/n): [0m n


[36m
Exiting the Spam Detector. Goodbye! 👋[0m


In [19]:
print("Predictions:", y_test_pred_cpu[:50])
print("True Labels:", Y_test_cpu[:50])


Predictions: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
True Labels: [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0.]
