In [26]:
import numpy as np
import pandas as pd
import os
import spacy
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Load dataset
file_path = r"C:\Users\91805\Desktop\ML\input\spam_ham_dataset.csv"
df = pd.read_csv(file_path)
print("Dataset loaded successfully.")
df.head()


Dataset loaded successfully.


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [27]:
# Remove 'Subject:' prefix and map labels to binary values
df["text"] = df["text"].str.replace("Subject:", "")
df.loc[df["label"] == "ham", "label"] = 0
df.loc[df["label"] == "spam", "label"] = 1
df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,0,enron methanol ; meter # : 988291\r\nthis is ...,0
1,2349,0,"hpl nom for january 9 , 2001\r\n( see attache...",0
2,3624,0,"neon retreat\r\nho ho ho , we ' re around to ...",0
3,4685,1,"photoshop , windows , office . cheap . main t...",1
4,2030,0,re : indian springs\r\nthis deal is to book t...,0


In [29]:
email_subjects = []
email_text = []

def split_subject(text):
    subject = ""
    for ch in text:
        if ch == '\r':
            break
        subject += ch
    email_subjects.append(subject)
    email_text.append(text.replace(subject, ""))

df["text"].apply(split_subject)
df["subject"] = email_subjects
df["text"] = email_text
df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num,subject
0,605,0,\r\nthis is a follow up to the note i gave you...,0,
1,2349,0,\r\n( see attached file : hplnol 09 . xls )\r\...,0,
2,3624,0,"\r\nho ho ho , we ' re around to that most won...",0,
3,4685,1,\r\nabasements darer prudently fortuitous unde...,1,
4,2030,0,\r\nthis deal is to book the teco pvr revenue ...,0,


In [30]:
nlp = spacy.load("en_core_web_sm")

def clean_text(s): 
    return ''.join([cs if cs in string.ascii_letters else ' ' for cs in s]).rstrip()

def remove_little(s): 
    return ' '.join([word for word in s.split() if len(word) > 2])

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])


In [31]:
df['text'] = df['text'].apply(lambda x: lemmatize_text(remove_little(clean_text(x))))
df['subject'] = df['subject'].apply(lambda x: lemmatize_text(remove_little(clean_text(x))))
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,subject
0,605,0,this follow the note give you monday prelimina...,0,
1,2349,0,see attached file hplnol xls hplnol xls,0,
2,3624,0,around that most wonderful time the year neon ...,0,
3,4685,1,abasement darer prudently fortuitous undergone...,1,
4,2030,0,this deal book the teco pvr revenue understand...,0,


In [33]:
class CustomMultinomialNB:
    def __init__(self):
        self.class_log_prior_ = None
        self.feature_log_prob_ = None
        self.classes_ = None
        self.feature_count_ = None
        self.class_count_ = None

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.class_count_ = np.zeros(len(self.classes_))
        self.feature_count_ = np.zeros((len(self.classes_), X.shape[1]))
        
        for idx, cls in enumerate(self.classes_):
            X_class = X[y == cls]
            self.class_count_[idx] = X_class.shape[0]
            self.feature_count_[idx, :] = X_class.sum(axis=0)
        
        self.class_log_prior_ = np.log(self.class_count_ / y.shape[0])
        smoothed_fc = self.feature_count_ + 1  
        smoothed_cc = smoothed_fc.sum(axis=1).reshape(-1, 1)
        self.feature_log_prob_ = np.log(smoothed_fc) - np.log(smoothed_cc)
        
    def predict(self, X):
        jll = X @ self.feature_log_prob_.T + self.class_log_prior_
        return self.classes_[np.argmax(jll, axis=1)]


In [34]:
# Convert labels to integer type
df['label'] = df['label'].astype(int)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)


In [35]:
spam_detector = CustomMultinomialNB()
spam_detector.fit(X_train_counts.toarray(), y_train)
print("Model trained successfully.")


Model trained successfully.


In [36]:
y_pred = spam_detector.predict(X_test_counts.toarray())

accuracy = np.mean(y_pred == y_test)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.96


In [37]:
#Saving model
joblib.dump(spam_detector, 'spam_detection_model.pkl')
joblib.dump(vectorizer, 'count_vectorizer.pkl')
print("Model and vectorizer saved.")


Model and vectorizer saved.


In [65]:
input_email = input("Write the email to check: ")
processed_input = lemmatize_text(remove_little(clean_text(input_email)))
input_vector = vectorizer.transform([processed_input])

prediction = spam_detector.predict(input_vector.toarray())
print("Prediction:", "Spam" if prediction == 1 else "Ham")

Prediction: Spam


In [None]:
"""Spam email examples to test::
Claim your prize now by clicking this link! Act fast, this is a limited-time offer!
Start earning big with no experience required. Click here to find out more.
We have pre-approved you for a personal loan up to $10,000! Just provide your details to get started.
Order now and get up to 80% off on all your medication needs. Limited stock available.
Lose weight quickly with our new natural supplements. Get your free trial today!
Discover the best investment opportunity with guaranteed returns. Dont miss out!
Enjoy a luxury cruise on us. Click here to claim your free tickets!
Limited time offer! Get premium watches at a fraction of the price. Shop now!"""

"""Ham email examples to test::
Please verify your account immediately to prevent suspension. Click here to verify.
Please join us for the weekly team meeting at 10 AM on Monday. Agenda will be shared soon.
Hi Team, just a quick update on the project progress. We are on track and on schedule.
Were happy to inform you that your order has been shipped. Track your package for updates.
Please find attached the invoice for services rendered in August. Let us know if you have any questions.
Dont forget, our book club meets this Friday at 6 PM. Looking forward to seeing everyone!
Thank you for your purchase! Your order has been confirmed. Expect delivery soon.
Are we still on for dinner this Saturday? Let me know if anything has changed."""