In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import json

In [7]:
df = pd.read_csv('emails-N.csv')

In [8]:
df.head(10)

Unnamed: 0,email_text,category
0,I am inquiring about the possibility of purch...,others
1,I recently made a purchase through the Giving ...,others
2,I placed an order through the website; however...,others
3,Hello \nI received my order but I noticed that...,others
4,Good morning \nI am wondering if the discount ...,others
5,I am writing to inform you of an issue with my...,others
6,I would like to inquire about the possibility ...,others
7,"Good evening,My order seems to have been deliv...",others
8,I am reaching out to inquire about the termina...,others
9,Hello I have a question regarding the scenario...,others


In [9]:
df.tail(10)

Unnamed: 0,email_text,category
892,"During a client presentation, the Zoom app fro...",IT
893,Cafeteria will be closed on Friday.,Other
894,We are excited to announce our annual cultural...,Other
895,VPN is not connecting from home.,IT
896,Cafeteria will be closed on Friday.,Other
897,Please be informed that parking slot allocatio...,Other
898,I need access to the testing server for our ne...,IT
899,My laptop has been overheating and shutting do...,IT
900,Could you send me the payslip for the last thr...,HR
901,Iâ€™ve relocated recently. Please advise on up...,HR


In [10]:
df.isnull().sum()

email_text    0
category      0
dtype: int64

In [11]:
# Check the structure
df.dropna(inplace=True)
print(df.head())

                                          email_text category
0   I am inquiring about the possibility of purch...   others
1  I recently made a purchase through the Giving ...   others
2  I placed an order through the website; however...   others
3  Hello \nI received my order but I noticed that...   others
4  Good morning \nI am wondering if the discount ...   others


In [12]:
# Prepare the data
X = df['email_text']
y = df['category']

In [13]:
# Text vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_vect = vectorizer.fit_transform(X)


In [14]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)


In [15]:
# Train classifier (e.g., Random Forest)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9005524861878453
Confusion Matrix:
 [[56  3  0  5]
 [ 1 51  0  0]
 [ 0  0 40  0]
 [ 3  6  0 16]]
Classification Report:
               precision    recall  f1-score   support

          HR       0.93      0.88      0.90        64
          IT       0.85      0.98      0.91        52
       Other       1.00      1.00      1.00        40
      others       0.76      0.64      0.70        25

    accuracy                           0.90       181
   macro avg       0.89      0.87      0.88       181
weighted avg       0.90      0.90      0.90       181



In [17]:
# Confidence threshold (e.g., 0.6)
threshold = 0.6

In [18]:
# Example prediction
def classify_email(email_text):
    vect_text = vectorizer.transform([email_text])
    proba = clf.predict_proba(vect_text)[0]
    pred_index = np.argmax(proba)
    confidence = proba[pred_index]
    if confidence >= threshold:
        predicted_category = clf.classes_[pred_index]
    else:
        predicted_category = "Other"
    return {
        "email_text": email_text,
        "predicted_category": predicted_category,
        "confidence": round(float(confidence), 2)
    }

In [19]:
# Test with example email
sample_email = "Hi, I forgot my laptop password. Please help."
result = classify_email(sample_email)
print(json.dumps(result, indent=2))


{
  "email_text": "Hi, I forgot my laptop password. Please help.",
  "predicted_category": "IT",
  "confidence": 0.73
}


In [20]:
# prompt: write a code to pickle RandomForestClassifier

import pickle

# Pickle the trained model
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

# You can also pickle the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# To load the model and vectorizer later:
# with open('random_forest_model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)
# with open('tfidf_vectorizer.pkl', 'rb') as f:
#     loaded_vectorizer = pickle.load(f)

In [21]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install torch

Collecting torchNote: you may need to restart the kernel to use updated packages.

  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.7.1-cp312-cp312-win_amd64.whl (216.1 MB)
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.5/216.1 MB 2.4 MB/s eta 0:01:30
   ---------------------------------------- 1.0/216.1 MB 1.9 MB/s eta 0:01:51
   ---------------------------------------- 1.3/216.1 MB 1.9 MB/s eta 0:01:53
   ---------------------------------------- 1.6/216.1 MB 1.6 MB/s eta 0:02:14
   ---------------------------------------- 1.8/216.1 MB 1.7 MB/s eta 0:02:06
   ---------------------------------------- 2.4/216.1 MB 1.8 MB/s eta 0:02:02
   ---------------------------------------- 2.6/216.1 MB 1.8 MB/s eta 0:02:0

In [24]:
import torch
from transformers import pipeline

generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base"
)

def generate_reply(email_text, predicted_category):
    if predicted_category == "IT":
        prompt = f"As an IT support agent, write a polite reply to:\n{email_text}"
    elif predicted_category == "HR":
        prompt = f"As an HR representative, write a professional reply to:\n{email_text}"
    else:
        prompt = f"Write a polite, professional reply to:\n{email_text}"

    output = generator(prompt, max_length=256)[0]["generated_text"]
    return output


Device set to use cpu
