In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
emails=pd.read_csv('email_spam.csv')

In [3]:
emails.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [4]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6046 entries, 0 to 6045
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6046 non-null   int64 
 1   Body        6045 non-null   object
 2   Label       6046 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 141.8+ KB


In [5]:
emails['Label'].value_counts() 
# 1-> SPAM
# 0-> HAM

Label
0    4150
1    1896
Name: count, dtype: int64

In [6]:
emails.isnull().sum()

Unnamed: 0    0
Body          1
Label         0
dtype: int64

In [7]:
emails=emails.dropna()

In [8]:
emails.duplicated().sum()

0

In [9]:
emails=emails.drop_duplicates()

In [10]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Khushi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
def process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean
    
emails['Body'].head().apply(process)

0    [Save, 70, Life, Insurance, Spend, ToLife, Quo...
1    [1, Fight, Risk, Cancer, httpwwwadclickwspcfmo...
2    [1, Fight, Risk, Cancer, httpwwwadclickwspcfmo...
3    [Adult, Club, Offers, FREE, Membership, INSTAN...
4    [thought, might, like, 1, Slim, Guaranteed, lo...
Name: Body, dtype: object

In [14]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(emails['Body'])
X_train, X_test, y_train, y_test = train_test_split(X, emails['Label'], test_size=0.2, random_state=42)

In [15]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [16]:
# Make predictions on the testing data
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy and other relevant metrics
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", report)

Accuracy: 0.96
Confusion Matrix:
 [[767  40]
 [  8 394]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       807
           1       0.91      0.98      0.94       402

    accuracy                           0.96      1209
   macro avg       0.95      0.97      0.96      1209
weighted avg       0.96      0.96      0.96      1209



In [30]:
new_email_text = ["Special Offer for limited period! Avail benefits now"]
new_email_vectorized = vectorizer.transform(new_email_text)

# Use the trained classifier to make predictions
new_email_prediction = nb_classifier.predict(new_email_vectorized)
if(new_email_prediction==[1]):
    print("Spam Email")
else:
    print("Ham Email")

Spam Email


In [31]:
new_email_text = ["The meeting has been scheduled for tommorow. Everybody is requested to be on time."]
new_email_vectorized = vectorizer.transform(new_email_text)

# Use the trained classifier to make predictions
new_email_prediction = nb_classifier.predict(new_email_vectorized)
if(new_email_prediction==[1]):
    print("Spam Email")
else:
    print("Ham Email")

Ham Email


#### Saving the model and using it for making predictions

In [32]:
import joblib

model_filename = 'spam_email_detection.pkl'
joblib.dump(nb_classifier, model_filename)

['spam_email_detection.pkl']

In [33]:
model = joblib.load('spam_email_detection.pkl')  

In [44]:
input_text = ["This is an important message. Please take a note of these things."]
input_vector=vectorizer.transform(input_text)
prediction = model.predict(input_vector)
if prediction[0] == 1:
    print("This email is classified as spam.")
else:
    print("This email is classified as ham (not spam).")

This email is classified as ham (not spam).


In [45]:
input_text = ["Special Offer! Buy 2 get 1 free!"]
input_vector=vectorizer.transform(input_text)
prediction = model.predict(input_vector)
if prediction[0] == 1:
    print("This email is classified as spam.")
else:
    print("This email is classified as ham (not spam).")

This email is classified as spam.
