In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import json

In [14]:
df = pd.read_csv('/content/emails-N.csv')

In [15]:
df.head(10)

Unnamed: 0,email_text,category
0,I am inquiring about the possibility of purch...,others
1,I recently made a purchase through the Giving ...,others
2,I placed an order through the website; however...,others
3,Hello \nI received my order but I noticed that...,others
4,Good morning \nI am wondering if the discount ...,others
5,I am writing to inform you of an issue with my...,others
6,I would like to inquire about the possibility ...,others
7,"Good evening,My order seems to have been deliv...",others
8,I am reaching out to inquire about the termina...,others
9,Hello I have a question regarding the scenario...,others


In [5]:
# Check the structure
df.dropna(inplace=True)
print(df.head())

                                          email_text          category
0   I am inquiring about the possibility of purch...  Customer Service
1  I recently made a purchase through the Giving ...  Customer Service
2  I placed an order through the website; however...  Customer Service
3  Hello \nI received my order but I noticed that...  Customer Service
4  Good morning \nI am wondering if the discount ...  Customer Service


In [6]:
# Extract features and labels
X = df['email_text']
y = df['category']


In [9]:
# Text vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_vect = vectorizer.fit_transform(X)

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

In [11]:
# Train classifier (e.g., Random Forest)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)


In [12]:
# Confidence threshold (e.g., 0.6)
threshold = 0.6

In [17]:
# Example prediction
def classify_email(email_text):
    vect_text = vectorizer.transform([email_text])
    proba = clf.predict_proba(vect_text)[0]
    pred_index = np.argmax(proba)
    confidence = proba[pred_index]
    if confidence >= threshold:
        predicted_category = clf.classes_[pred_index]
    else:
        predicted_category = "Other"
    return {
        "email_text": email_text,
        "predicted_category": predicted_category,
        "confidence": round(float(confidence), 2)
    }

# Test with example email
sample_email = "seniors and, if so, how the senior citizen discount can be applied during the booking process?."
result = classify_email(sample_email)
print(json.dumps(result, indent=2))


{
  "email_text": "seniors and, if so, how the senior citizen discount can be applied during the booking process?.",
  "predicted_category": "Other",
  "confidence": 0.42
}
