<a href="https://colab.research.google.com/github/Kibet-Rotich/Data-analysis-ML-AI/blob/master/Email_Priority_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ML PROJECT

Personal Email Priority Classifier


Task: Classify incoming emails as "Urgent", or "Regular" based on content, sender, time, etc.


Method: Text classification using Random Forest

In [1]:
# Download dataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'text'])

# Convert spam/ham to priority levels
def convert_to_priority(label):
    if label == 'spam':
        return 'Urgent'
    else:
        return 'Regular'

df['priority'] = df['label'].apply(convert_to_priority)

# Basic preprocessing
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

# Display dataset info
print("Dataset Overview:")
print(df['priority'].value_counts())
print("\nSample Data:")
print(df.head())

# Save processed dataset
df.to_csv('processed_sms_data.csv', index=False)

--2025-01-31 19:12:59--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [   <=>              ] 198.65K   369KB/s    in 0.5s    

2025-01-31 19:13:00 (369 KB/s) - ‘smsspamcollection.zip’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  
Dataset Overview:
priority
Regular    4825
Urgent      747
Name: count, dtype: int64

Sample Data:
  label                                               text priority  \
0   ham  Go until jurong point, crazy.. Available only ...  Regular   
1   ham                      Ok lar... Joking wif u oni...  Regular   
2  spam  Free entry in 2 a wkly comp to win FA Cup f

In [2]:
df.head()

Unnamed: 0,label,text,priority,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",Regular,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,Regular,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Urgent,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,Regular,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Regular,nah i dont think he goes to usf he lives aroun...


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])

# Encode priority labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['priority'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predictions and evaluation
y_pred = rf_classifier.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Function to predict priority
def predict_email_priority(email_text):
    # Clean and vectorize the input text
    cleaned_text = clean_text(email_text)
    vectorized_text = vectorizer.transform([cleaned_text])

    # Predict priority
    prediction = rf_classifier.predict(vectorized_text)

    return le.inverse_transform(prediction)[0]

# Example usage
test_emails = [
    "Urgent meeting today, please respond ASAP",
    "Hey, just checking in about the project",
    "Congratulations on your recent achievement!"
]

print("\nPriority Predictions:")
for email in test_emails:
    priority = predict_email_priority(email)
    print(f"Email: {email}\nPriority: {priority}\n")

# Save the model and vectorizer
import joblib
joblib.dump(rf_classifier, 'priority_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

Classification Report:
              precision    recall  f1-score   support

     Regular       0.97      1.00      0.99       966
      Urgent       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115


Priority Predictions:
Email: Urgent meeting today, please respond ASAP
Priority: Regular

Email: Hey, just checking in about the project
Priority: Regular

Email: Congratulations on your recent achievement!
Priority: Regular



['tfidf_vectorizer.joblib']

In [4]:


import joblib

# Load the saved model and vectorizer
rf_classifier = joblib.load('priority_classifier.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')


def predict_email_priority(email_text):
    # Clean and vectorize the input text
    cleaned_text = clean_text(email_text)
    vectorized_text = vectorizer.transform([cleaned_text])

    # Predict priority
    prediction = rf_classifier.predict(vectorized_text)

    return le.inverse_transform(prediction)[0]

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.classes_ = np.array(['Urgent',"Regular"]) #swapped classes to get spam as urgent and ham as regular


In [5]:
predict_email_priority("You won 30000, click this link to get your winnings!!!")

'Urgent'