In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Path to the emails CSV. The default is the original hardcoded local path; set the
# EMAILS_CSV_PATH environment variable to point at the file on another machine
# without editing the notebook.
EMAILS_CSV_PATH = os.environ.get(
    'EMAILS_CSV_PATH',
    'C:/Users/win10/OneDrive/Desktop/Email/machine_learning/dataset/emails.csv',
)

# Only the first MAX_ROWS rows of the CSV are read.
MAX_ROWS = 30000

email_data = pd.read_csv(EMAILS_CSV_PATH, nrows=MAX_ROWS)

In [4]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
email_data.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
# Build a heuristic 'priority' label: the total number of keyword occurrences
# found in each message (case-insensitive). Resetting to 0 first keeps the cell
# idempotent when it is re-run.
email_data['priority'] = 0

keywords = ['close deadline', 'exam', 'assignment', 'project deadline', 'urgent', 'important', 'meeting', 'office hours']

# Lowercase the message text once instead of once per keyword; the result does
# not depend on the keyword, so recomputing it inside the loop is wasted work.
messages_lower = email_data['message'].str.lower()

for keyword in keywords:
    # Series.str.count interprets its argument as a regular expression and
    # counts every match, including matches inside longer words
    # (e.g. 'exam' also matches within 'example').
    # NOTE(review): the current keywords contain no regex metacharacters; wrap
    # new keywords in re.escape() if that ever changes.
    email_data['priority'] += messages_lower.str.count(keyword.lower())


print(email_data.head())

                       file  \
0     allen-p/_sent_mail/1.   
1    allen-p/_sent_mail/10.   
2   allen-p/_sent_mail/100.   
3  allen-p/_sent_mail/1000.   
4  allen-p/_sent_mail/1001.   

                                             message  priority  
0  Message-ID: <18782981.1075855378110.JavaMail.e...         0  
1  Message-ID: <15464986.1075855378456.JavaMail.e...         5  
2  Message-ID: <24216240.1075855687451.JavaMail.e...         0  
3  Message-ID: <13505866.1075863688222.JavaMail.e...         1  
4  Message-ID: <30922949.1075863688243.JavaMail.e...         0  


In [6]:
import joblib

In [7]:
# Inputs are the raw message texts; targets are the keyword-count priority labels.
X, y = email_data['message'], email_data['priority']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: the vocabulary is learned from the training split only,
# then the same fitted vectorizer is applied to the held-out split.
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Persist the fitted vectorizer so it can be reloaded alongside the classifier.
joblib.dump(vectorizer, 'vectorizer.joblib')

# fit() returns the estimator itself, so construction and training can be chained.
classifier = MultinomialNB().fit(X_train_vectorized, y_train)

joblib.dump(classifier, 'classifier.joblib')

# Score the classifier on the held-out split.
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.84


In [8]:

# A few hand-written messages to sanity-check the trained classifier.
example_messages = [
    'Urgent: Project update meeting tomorrow!',
    'Reminder: Assignment submission deadline is approaching.',
    'Office hours changed this week.',
]

# Reuse the already-fitted vectorizer so the features line up with the
# vocabulary the classifier was trained on.
example_messages_vectorized = vectorizer.transform(example_messages)
predicted_priorities = classifier.predict(example_messages_vectorized)

print('\nPredicted Priorities for Example Messages:')
for idx, message in enumerate(example_messages):
    priority = predicted_priorities[idx]
    print(f'Message: {message}\nPredicted Priority: {priority}\n{"-" * 30}')


Predicted Priorities for Example Messages:
Message: Urgent: Project update meeting tomorrow!
Predicted Priority: 1
------------------------------
Message: Reminder: Assignment submission deadline is approaching.
Predicted Priority: 4
------------------------------
Message: Office hours changed this week.
Predicted Priority: 0
------------------------------
