In [None]:
!pip install nltk scikit-learn pandas joblib




In [None]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib

# Fetch every NLTK resource the preprocessing step relies on in one place:
#   punkt / punkt_tab -> tokenizer data for word_tokenize (recent NLTK releases
#                        look up punkt_tab, older ones punkt)
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary used by WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Shared, build-once helpers for preprocess_email.
# The lemmatizer is created a single time and reused for every token.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token stop-word check is a hash lookup.
stop_words = set(stopwords.words("english"))

# Built once at definition time instead of on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# `[^>]*` (rather than `.*?`) lets a tag span several lines.
_HTML_TAG_RE = re.compile(r"<[^>]*>")


def preprocess_email(text):
    """Normalise one raw email body into a space-separated string of lemmas.

    Steps, in order: replace HTML-like tags with a space, lowercase, delete
    ASCII punctuation, tokenize, drop English stop words, lemmatize.

    Args:
        text: Raw email text. ``None`` and NaN (what pandas produces for an
            empty CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        nothing is left after cleaning.
    """
    # Missing values: `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Substitute a space, not '', so "line one<br>line two" does not become
    # "line oneline two".
    text = _HTML_TAG_RE.sub(" ", text)
    text = text.lower()
    text = text.translate(_PUNCT_TABLE)

    # Nothing but whitespace left: skip tokenization entirely.
    if not text.strip():
        return ""

    tokens = word_tokenize(text)
    cleaned = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(cleaned)


In [33]:
# `punkt_tab` supplies the tokenizer tables word_tokenize needs; the download
# is skipped when the package is already up to date.
nltk.download('punkt_tab')

# Absolute Colab path: upload emails_dataset.csv to /content before running,
# or change this path when running elsewhere.
df = pd.read_csv('/content/emails_dataset.csv')

# Fail early with a readable message if the columns used below are absent.
missing_cols = {'email', 'label'} - set(df.columns)
if missing_cols:
    raise KeyError(f"emails_dataset.csv is missing columns: {sorted(missing_cols)}")

# Empty cells are read as NaN (a float), which the regex step in
# preprocess_email cannot process, so normalise them to '' first.
df['cleaned'] = df['email'].fillna('').astype(str).apply(preprocess_email)
df.head()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,email,label,cleaned
0,URGENT: Your Amazon order has been shipped. Pl...,ham,urgent amazon order shipped please check immed...
1,URGENT: HR: Please submit your timesheets by EOD.,ham,urgent hr please submit timesheets eod
2,"URGENT: You have won $10,000 in our lottery. C...",spam,urgent 10000 lottery claim
3,URGENT: Project update: Deadline extended to n...,ham,urgent project update deadline extended next f...
4,You have an unread message. Click to view it! ...,spam,unread message click view please check immedia...


In [34]:
# Single source of truth for the artifact location, so the dump and the
# reload below can never point at different files.
VECTORIZER_PATH = "vectorizer.joblib"

# NOTE(review): the vectorizer is fitted on the whole corpus *before* the
# train/test split in the next cell, so its vocabulary and IDF weights are
# computed with the test rows included. Fit on the training rows only (or wrap
# vectorizer + classifier in a Pipeline) for an unbiased evaluation.
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned'])
y = df['label']

# Persist the fitted vectorizer, then read it back as a round-trip check.
joblib.dump(vectorizer, VECTORIZER_PATH)
loaded_vectorizer = joblib.load(VECTORIZER_PATH)
print(loaded_vectorizer)


TfidfVectorizer(max_features=3000)


In [35]:
# Same file name that predict_new_email loads; used for both dump and reload
# so they stay in sync regardless of the working directory.
MODEL_PATH = "email_classifier.joblib"

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = MultinomialNB()
model.fit(X_train, y_train)

# Round-trip through disk so the report below scores the persisted artifact
# rather than only the in-memory object.
joblib.dump(model, MODEL_PATH)
model = joblib.load(MODEL_PATH)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Model Used is :",model)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00        42
        spam       1.00      1.00      1.00        38

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

Model Used is : MultinomialNB()


In [36]:
def predict_new_email(raw_text, model=None, vectorizer=None):
    """Classify one raw email and report the probability of the chosen class.

    Args:
        raw_text: Unprocessed email text; it is cleaned with preprocess_email
            before being vectorized.
        model: Optional fitted classifier exposing ``predict_proba`` and
            ``classes_``. When omitted it is loaded from
            "email_classifier.joblib".
        vectorizer: Optional fitted vectorizer exposing ``transform``. When
            omitted it is loaded from "vectorizer.joblib".

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is the class with the
        highest predicted probability and ``confidence`` is that probability
        as a Python float rounded to 3 decimals.
    """
    # Passing already-loaded objects avoids two disk reads per prediction.
    # SECURITY: joblib files are pickles; only load artifacts you created or trust.
    if model is None:
        model = joblib.load("email_classifier.joblib")
    if vectorizer is None:
        vectorizer = joblib.load("vectorizer.joblib")

    cleaned = preprocess_email(raw_text)
    vector = vectorizer.transform([cleaned])

    # One inference pass: the label is read off the same probability row as
    # the confidence, so the two values always refer to the same class.
    probabilities = model.predict_proba(vector)[0]
    best_index = probabilities.argmax()
    prediction = model.classes_[best_index]
    confidence = float(probabilities[best_index])
    return prediction, round(confidence, 3)

# Demo: classify a single hand-written message and show label + confidence.
new_email = "HR: Please submit your timesheets by EOD."
label, score = predict_new_email(raw_text=new_email)
print(f"Prediction: {label}, Confidence: {score}")


Prediction: ham, Confidence: 0.984


In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package).
# NOTE(review): none of the cells visible in this notebook read from or write
# to /content/drive — confirm this cell is still needed, and if so move it
# above the cells that depend on it.
from google.colab import drive
drive.mount('/content/drive')