In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [6]:
# Read the email spreadsheet into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
EMAILS_XLSX_PATH = r"D:\OneDrive - Lowcode Minds Technology Pvt Ltd\Desktop\Automate-Email-Classification\emails.xlsx"
df = pd.read_excel(EMAILS_XLSX_PATH)

In [7]:
def preprocess_text(text):
    """Normalize a piece of text for the classifier.

    Lowercases the input, deletes every character listed in
    ``string.punctuation``, and deletes all runs of digits. Whitespace is left
    untouched, so removed tokens can leave doubled or trailing spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # re.escape is required here: interpolated raw, the "\]" sequence inside
    # string.punctuation is parsed as an escaped "]" and the backslash itself
    # is never part of the character class, so backslashes would survive.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)  # drop digit runs
    return text

In [8]:
# Join subject and body (missing values become empty strings) with a single space,
# then clean the combined string and store it in the "Text" column.
subject_and_body = df["Subject"].fillna("") + " " + df["Body"].fillna("")
df["Text"] = subject_and_body.map(preprocess_text)


In [9]:
# Model input is the cleaned text column; the target is the "Category" column
X, y = df["Text"], df["Category"]


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified — consider stratify=y if class balance matters.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [11]:
# Two-stage text classifier: TF-IDF features (English stop words, max_features=5000)
# feeding a linear-kernel SVM configured with probability=True.
tfidf_step = TfidfVectorizer(stop_words='english', max_features=5000)
svm_step = SVC(kernel='linear', probability=True)
pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('svm', svm_step)])


In [None]:
# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

# Predict the held-out split and compute the metrics against the true labels
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

# Report overall accuracy followed by the per-class breakdown
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

  Automobile       1.00      0.67      0.80         3
    Building       1.00      0.50      0.67         4
      Health       0.62      1.00      0.77         5

    accuracy                           0.75        12
   macro avg       0.88      0.72      0.75        12
weighted avg       0.84      0.75      0.74        12



In [12]:
# Hand-written sample emails used to spot-check the classifier.
# Each (subject, body) pair becomes a dict with "Subject" and "Body" keys.
new_emails = [
    {"Subject": subject, "Body": body}
    for subject, body in [
        (
            "Health insurance claim status update",
            "Dear customer, your medical claim has been processed. The reimbursement will be credited within 5 working days.",
        ),
        (
            "Renew your vehicle insurance today!",
            "Your car insurance expires soon. Renew now to avoid penalties and continue your coverage.",
        ),
        (
            "Home insurance premium reminder",
            "Your home insurance premium is due next week. Please make the payment to continue coverage.",
        ),
        (
            "Need details on medical coverage",
            "I would like to know if my insurance covers dental procedures and routine check-ups.",
        ),
        (
            "Urgent: House fire insurance claim",
            "My house was damaged due to a fire accident. I need assistance in filing an insurance claim.",
        ),
    ]
]

In [None]:
# Clean each sample the same way the training text was cleaned (subject + " " + body),
# then classify the whole batch with a single predict call instead of one call per email.
processed_emails = [
    preprocess_text(email["Subject"] + " " + email["Body"]) for email in new_emails
]
# Skip the predict call for an empty batch so an empty list simply prints nothing.
predicted_categories = pipeline.predict(processed_emails) if processed_emails else []

# Show each subject next to the category predicted for it
for email, predicted_category in zip(new_emails, predicted_categories):
    print(f"Subject: {email['Subject']}\nPredicted Category: {predicted_category}\n")

Subject: Health insurance claim status update
Predicted Category: Health

Subject: Renew your vehicle insurance today!
Predicted Category: Automobile

Subject: Home insurance premium reminder
Predicted Category: Building

Subject: Need details on medical coverage
Predicted Category: Health

Subject: Urgent: House fire insurance claim
Predicted Category: Building

