# Classify Email as Spam or NotSpam using Naive Bayes Algorithm

In [1]:
#import necessary libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,classification_report
from sklearn.pipeline import Pipeline
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#Dowload stopwords if not already done
import nltk
nltk.download('stopwords')
emails=[
       "Congratulations,you won a free ticket to Bahamas!",
       "Hello,can we reschedule the meeting for tomorrow?",
       "Exclusive deal just for you,claim your free iphone now!",
       "Dear team,please find the report attached.",	
       "Win a $1,000 gift card by clicking this link!",
       "Remainder:your appointment is scheduled for Monday at 3PM.",
       "Limited time offer!Get 50% off on all products,",
       "Hai Mam,I'll call you later tonight.",
       "Claim your lotter prize now before it expires!",
       "You account balance is low.please deposit funds.",
       "Special promotion just for you:Buy one,get one free!",
       "Don't forget to submit the assignmnet before friday.",
       "Urgent!update your payment information to avoid suspension.",
       "Hai Jhon,thanks for sending over the documents.",
       "You've been selected for a cash reward claim,here!"
]
# labels(1=spam,0=not spam)
labels=[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
#Preprocess function
def preprocess_text(text):
    text=text.lower()
    text=re.sub(r'[^a-z\s]','',text)
    stop_words=set(stopwords.words('English'))
    text=''.join([word for word in text.split() if word not in stop_words])
#Stemming
    stemmer=PorterStemmer()
    text=''.join([stemmer.stem(word)for word in text.split()])
    return text
#preprocess all mails
emails=[preprocess_text(email)for email in emails]
#Split data as Train and test sets
X_train,X_test,y_train,y_test=train_test_split(emails,labels,test_size=0.3,random_state=42)
#Create a pipeline
pipeline=Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])
#Train the Model
pipeline.fit(X_train,y_train)
#Cross validation
cv_scores=cross_val_score(pipeline,emails,labels,cv=5)
print(f"Cross-Validation Accuracy:{cv_scores.mean():.2f}")
#Test the model
y_pred=pipeline.predict(X_test)
#Evaluate the model
accuracy=accuracy_score(y_test,y_pred)
print(f"Test Acuuracy:{accuracy:.2f}")
print("\n Classification Report:\n",classification_report(y_test,y_pred))
joblib.dump(pipeline,'spam_classifier_model.pkl')
print("\nModel saved as 'spam_classifier_model.pkl',")
test_email=["You are a lucky winner!claim your prize now."]
test_email_preposs=[preprocess_text(email) for email in test_email]
prediction=pipeline.predict(test_email_preposs)
print("\nPrediction for test email(0=not spam,1=spam):",prediction[0])

Cross-Validation Accuracy:0.33
Test Acuuracy:0.20

 Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.20      1.00      0.33         1

    accuracy                           0.20         5
   macro avg       0.10      0.50      0.17         5
weighted avg       0.04      0.20      0.07         5


Model saved as 'spam_classifier_model.pkl',

Prediction for test email(0=not spam,1=spam): 1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1mscds43\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
