In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [16]:
# Download the NLTK resources used by the preprocessing functions below.
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab',
# while older releases use 'punkt', so both are fetched.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Shared resources for text preprocessing: the English stop-word set,
# a Porter stemmer and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nethmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nethmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nethmi\AppData\Roaming\nltk_data...


In [13]:
def preprocess_text(text):  # STEMMING
    """Clean a raw text value and return its stemmed, stop-word-free form.

    NOTE: a later cell defines another ``preprocess_text`` (the lemmatization
    variant); when the notebook is run top to bottom that definition shadows
    this one.

    Args:
        text: Raw text, possibly containing HTML tags or entities. Any
            non-string value (e.g. ``None`` or a float NaN, which is how an
            empty CSV cell typically arrives from pandas) is treated as
            empty text.

    Returns:
        str: The remaining lower-cased alphabetic tokens, stemmed with the
        module-level ``stemmer`` and joined by single spaces. Empty string
        for non-string input.
    """
    import html  # local import keeps this cell runnable on its own

    if not isinstance(text, str):
        return ''

    # Replace tags with a space so words on either side of a tag do not merge
    text = re.sub(r'<.*?>', ' ', text)

    # Decode entities only after tag removal, so an escaped '&lt;...&gt;' pair
    # is not mistaken for a tag; this also stops entity names such as 'amp'
    # or 'quot' from surviving as tokens
    text = html.unescape(text)

    # Remove punctuation/digits and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words and stem the tokens
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

In [20]:

def preprocess_text(text):   # LEMMATIZATION
    """Clean a raw text value and return its lemmatized, stop-word-free form.

    Args:
        text: Raw text, possibly containing HTML tags or entities. Any
            non-string value (e.g. ``None`` or a float NaN, which is how an
            empty CSV cell typically arrives from pandas) is treated as
            empty text.

    Returns:
        str: The remaining lower-cased alphabetic tokens, lemmatized with the
        module-level ``lemmatizer`` and joined by single spaces. Empty string
        for non-string input.
    """
    import html  # local import keeps this cell runnable on its own

    if not isinstance(text, str):
        return ''

    # Replace tags with a space so words on either side of a tag do not merge
    text = re.sub(r'<.*?>', ' ', text)

    # Decode entities only after tag removal, so an escaped '&lt;...&gt;' pair
    # is not mistaken for a tag; this also stops entity names such as 'amp'
    # or 'quot' from surviving as tokens
    text = html.unescape(text)

    # Remove punctuation/digits and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

In [7]:
import pandas as pd

# Location of the e-mail metadata CSV, relative to this notebook
file_path = "../email_data.csv"

# Load the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,sender_name,sender_mail,receiver,time_received,date_received,subject,summary,read,in_thread,has_attachment,starred,size,subscription,type,labels,forwarded,attached_types,attachment_no
0,Vijini Mallawaarachchi,vijini.mallawaarachchi@flinders.edu.au,e18282@eng.pdn.ac.lk,23:07:18,3/27/2024,Re: E18 FYP- UoP- Long Reads Binning,"Dear all, Can you please explain why you think...",True,True,False,False,300981,False,"multipart/alternative; boundary=""_000_SY8P300M...","['IMPORTANT', 'CATEGORY_PERSONAL', 'INBOX', 'L...",False,[],0
1,"""DeepLearning.AI""",hello@deeplearning.ai,e18282@eng.pdn.ac.lk,13:37:12,3/27/2024,Build a full-stack web application that uses R...,New course with LlamaIndex: JavaScript RAG Web...,False,False,False,False,66710,True,"multipart/alternative; boundary=""----=_Part_39...","['CATEGORY_PROMOTIONS', 'UNREAD', 'Label_23844...",False,[],0
2,"""Hackers Club (via Google Drive)""",drive-shares-dm-noreply@google.com,e18282@eng.pdn.ac.lk,17:11:55,3/27/2024,"Item shared with you: ""Session 4.mp4""",Hackers Club shared an item Hackers Club (hack...,True,False,False,False,31243,False,"multipart/alternative; boundary=""000000000000f...","['IMPORTANT', 'Label_7076286195593857530', 'CA...",False,[],0
3,Exponent Community,support@tryexponent.com,e18282@eng.pdn.ac.lk,15:52:28,3/27/2024,🔔 New interview questions recently asked at Li...,Hey! Check out this week&#39;s interview quest...,True,False,False,False,5928,False,"text/html; charset=""utf-8""","['Label_679974079456540465', 'CATEGORY_UPDATES...",False,[],0
4,LinkedIn,updates-noreply@linkedin.com,e18282@eng.pdn.ac.lk,13:40:14,3/27/2024,LinkedIn News Asia just posted something that ...,Trending now: The fear of being &#39;cancelled...,True,False,False,False,127182,True,"multipart/alternative; boundary=""----=_Part_22...","['Label_1576816929730070722', 'CATEGORY_UPDATE...",False,[],0


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every e-mail subject; a missing subject (NaN) is treated as empty text
subject_texts = [preprocess_text(text) for text in df['subject'].fillna('')]

# Fit a vectorizer dedicated to the subjects. Using subject-specific names
# keeps the fitted subject vocabulary available after the summary cell
# assigns `vectorizer` / `preprocessed_texts` for its own column.
subject_vectorizer = TfidfVectorizer()
X_tfidf_subject = subject_vectorizer.fit_transform(subject_texts)

print(X_tfidf_subject.shape)

(550, 955)


In [29]:
# Clean every e-mail summary; a missing summary (NaN) is treated as empty text
preprocessed_texts = [preprocess_text(text) for text in df['summary'].fillna('')]

# Learn the TF-IDF vocabulary from the cleaned summaries and vectorize them
vectorizer = TfidfVectorizer()
X_tfidf_summary = vectorizer.fit_transform(preprocessed_texts)

print(X_tfidf_summary.shape)

(550, 1890)


In [35]:
# import preprocessing from sklearn
from sklearn import preprocessing

# Keep only the object-dtype (string-like) columns
df_categorical = df.select_dtypes(include=[object])

le = preprocessing.LabelEncoder()

# `le` is re-fitted on every column, so after the apply it only remembers the
# last column's labels. Record each column's classes as we go so that any
# integer code can be mapped back: label_classes[column][code] -> original value.
label_classes = {}


def _encode_column(column):
    """Label-encode one column with `le` and remember its classes."""
    codes = le.fit_transform(column)
    label_classes[column.name] = le.classes_.copy()
    return codes


df_labelenc = df_categorical.apply(_encode_column)
df_labelenc.head(20)

Unnamed: 0,sender_name,sender_mail,receiver,time_received,date_received,subject,summary,type,labels,attached_types
0,90,70,0,280,65,212,175,276,11,14
1,9,24,0,113,65,25,368,84,5,14
2,14,9,0,212,65,130,255,230,14,14
3,43,64,0,188,65,351,267,460,31,14
4,65,68,0,123,65,140,434,59,20,14
5,79,58,0,149,65,298,236,428,8,1
6,49,50,0,316,65,172,379,462,3,14
7,63,35,0,307,65,316,97,435,9,12
8,85,62,0,6,65,95,40,366,1,14
9,66,38,0,211,64,1,130,233,29,14
