In [52]:
# Load the dataset
import pandas as pd
import re
data = pd.read_excel('emails.xlsx')
print(data.head())

# Pre-processing
import nltk
#nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


                                                body    label
0  Dear [Product Vendor],\n\nI am writing to requ...  Request
1  Dear [Product Vendor],\n\nI am writing to requ...  Request
2  Dear [Product Vendor],\n\nI am writing to requ...  Request
3  Dear [Product Vendor],\n\nI am writing to requ...  Request
4  Dear [Sales Team],\n\nI am writing to request ...  Request


In [53]:
ps=PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [54]:

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Clean the text
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if not token in stop_words]
    filtered_tokens = [ps.stem(token) for token in tokens if  token not in stopwords.words('english')]
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if  token not in stopwords.words('english')]
    
    # Return the preprocessed text
    return " ".join(filtered_tokens)

In [55]:

data['body'] = data['body'].apply(preprocess_text)

In [56]:
data['body'][0]

'dear product vendor writing request demo project management tool product name new startup need tool help u manage project task believe product may good fit need could please provide u information feature functionality product well availability demo would like schedule demo earliest convenience thank time consideration look forward learning product sincerely name'

In [57]:
# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['body'])
y = data['label']

# Model Selection
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

# Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

DecisionTreeClassifier()

In [61]:
X_test

<3x197 sparse matrix of type '<class 'numpy.float64'>'
	with 119 stored elements in Compressed Sparse Row format>

In [60]:
y_pred = model.predict(X_test)
print(y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))

# Prediction
new_email = "solve the problem"
new_email = preprocess_text(new_email)
new_email_vec = vectorizer.transform([new_email])
print(model.predict(new_email_vec))


['Problem/Pain Point' 'Instance\\Inquiry' 'Request']
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
['Request']
