In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string



data = pd.read_csv('email_classification.csv')


def preprocess_text(text):
  
    tokens = word_tokenize(text)
    
   
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
   
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

data['text'] = data['text'].apply(preprocess_text)


X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


y_pred = model.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Accuracy: 0.9722222222222222
Confusion Matrix:
[[14  0]
 [ 1 21]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.93      1.00      0.97        14
        spam       1.00      0.95      0.98        22

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36



In [2]:
import joblib

joblib.dump(model,'model.pkl')
joblib.dump(tfidf_vectorizer,'vectorizer.pkl')

['vectorizer.pkl']

In [3]:
import joblib
import numpy as np

model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')



email_text="Your Amazon account has been locked. Click here to verify your account information"


email_vector = vectorizer.transform([email_text])


prediction = model.predict(email_vector)
prediction_proba = model.predict_proba(email_vector)


if prediction[0] == 0:
    prediction_result = 'Not Spam'
    prob = prediction_proba[0][0] * 100
else:
    prediction_result = 'Spam'
    prob = prediction_proba[0][1] * 100

print(f"Prediction: {prediction_result} with {prob:.2f}% probability")


Prediction: Spam with 83.02% probability


In [4]:
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')


emails = [
    "hey click here to see magic",
    "this is a normal email with no spammy content",
    "urgent! your account has been compromised, click here to reset your password",
    "meeting schedule for next week",
    "free money, click now to claim your prize!"
]

for email_text in emails:
    
    email_vector = vectorizer.transform([email_text])

    prediction = model.predict(email_vector)
    prediction_proba = model.predict_proba(email_vector)

    if prediction[0] == 0:
        prediction_result = 'Not Spam'
        prob = prediction_proba[0][0] * 100
    else:
        prediction_result = 'Spam'
        prob = prediction_proba[0][1] * 100

    print(f"Email: {email_text}")
    print(f"Prediction: {prediction_result} with {prob:.2f}% probability\n")


Email: hey click here to see magic
Prediction: Spam with 58.17% probability

Email: this is a normal email with no spammy content
Prediction: Spam with 43.92% probability

Email: urgent! your account has been compromised, click here to reset your password
Prediction: Spam with 76.35% probability

Email: meeting schedule for next week
Prediction: Spam with 26.64% probability

Email: free money, click now to claim your prize!
Prediction: Spam with 93.26% probability

