# Install and import all necessaries libraries

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
from nltk.stem import WordNetLemmatizer

In [3]:
# Used to install wordnet lemmatizer
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [4]:
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import pandas as pd
import nltk
import os
import string

from tqdm import tqdm


In [5]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

# Import dataset into the notebook

We will only be considering the ham dataset as spam dataset is not a priority to the user.

In [6]:
directories = [
    '/kaggle/input/enron-spam/enron1/ham',
    '/kaggle/input/enron-spam/enron2/ham',
    '/kaggle/input/enron-spam/enron3/ham',
    '/kaggle/input/enron-spam/enron4/ham',
    '/kaggle/input/enron-spam/enron5/ham',
    '/kaggle/input/enron-spam/enron6/ham',
]

def get_data_from_directories(directories):
    combined_data = []
    
    for directory in directories:
        label = os.path.basename(directory)  # Extract the directory name as the label
        for root, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='ISO-8859-1') as f:
                    content = f.read()
                    combined_data.append({'text': content, 'label': label})
    
    return combined_data

# Get data from all directories
combined_data = get_data_from_directories(directories)

import numpy as np

# Create a DataFrame
df = pd.DataFrame(combined_data)

# Check the first few rows of the DataFrame
df.head()

Unnamed: 0,text,label
0,Subject: hilcorp old ocean\ni have continued t...,ham
1,"Subject: meter 6461 , concorde churchill\none ...",ham
2,Subject: 6 / 15 / 00 revision to sea robin vol...,ham
3,Subject: natural gas nomination for 05 / 00\ne...,ham
4,Subject: november prelim wellhead production -...,ham


# Experimenting phase

Playing around with the data to observe its structure

In [7]:
# Checking the first subject data
df.iloc[0,0]

'Subject: hilcorp old ocean\ni have continued the hilcorp old ocean deal . dan hyvl is writing up the\ncontracts for me and they should be ready monday . terms are as follows :\nsept - nov 2000\nifhsc + . 02\ngas daily for any mid month turn on as in our purchase contracts\nletter of credit must be approved prior to sept 1 ( they said they will have\nit monday by 2 pm )\nvolume min / max is 7 , 000 / day - 20 , 000 / day with same tolerance currently\nexisting for swing . ( we expect to level out around 10 , 000 / d )\nplease call if you have any concerns as we will continue this gas flow\nwithout interruption . thanks , mary jo johnson'

In [8]:
# Experimenting on splitting a single long line text into multiple line
df.iloc[1,0].split('\n')

['Subject: meter 6461 , concorde churchill',
 "one year rate for this one will be $ . 35 / mm for volumes greater than 300 mm / day . price for volumes 300 mm / day or less will be $ . 45 / mm plus a $ 350 per month metering fee . this rate would cover transport to a competitive ship channel market such as equistar channelview . transport rates for a 6 month term are not significantly different - $ . 33 / mm for volumes greater than 300 mm / day and $ . 42 / mm plus the metering fee for volumes 300 mm / day or less . these rates assume a september 1 , 2001 start . these rates good until friday , august 31 , 2001 only ! ! ! get ' em while they ' re hot ! !"]

We will extract all information and organize it into their respective categories (for exp: sender, recipient, date, subject, content)

In [9]:
def extract_email_info(email_text):
    lines = email_text.split('\n')

    content = ''
    sender = ''
    recipient = ''
    subject = ''
    date = ''

    for line in lines:
        if line.startswith('Subject:'):
            subject = line.replace('Subject:', '').strip()
        elif line.startswith('From:'):
            sender = line.replace('From:', '').strip()
        elif line.startswith('To:'):
            recipient = line.replace('To:', '').strip()
        elif line.startswith('Date:'):
            date = line.replace('Date:', '').strip()
        elif line.startswith('Received:'):
            date = line.replace('Received:', '').strip()
        elif line.startswith('X-'):
            continue
        else:
            content += line.strip()

    return {'sender': sender, 'recipient': recipient, 'subject': subject, 'date': date, 'content': content}

# Applying the function to the DataFrame
df['extracted_info'] = df['text'].apply(extract_email_info)

In [10]:
df['extracted_info']

0        {'sender': '', 'recipient': '', 'subject': 'hi...
1        {'sender': '', 'recipient': '', 'subject': 'me...
2        {'sender': '', 'recipient': '', 'subject': '6 ...
3        {'sender': '', 'recipient': '', 'subject': 'na...
4        {'sender': '', 'recipient': '', 'subject': 'no...
                               ...                        
16540    {'sender': '', 'recipient': '', 'subject': 'pr...
16541    {'sender': '', 'recipient': '', 'subject': 'et...
16542    {'sender': '', 'recipient': '', 'subject': 'el...
16543    {'sender': '', 'recipient': '', 'subject': 're...
16544    {'sender': '', 'recipient': '', 'subject': 'fw...
Name: extracted_info, Length: 16545, dtype: object

In [11]:
df[['sender', 'recipient', 'subject', 'date', 'content']] = df['extracted_info'].apply(pd.Series)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16545 entries, 0 to 16544
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            16545 non-null  object
 1   label           16545 non-null  object
 2   extracted_info  16545 non-null  object
 3   sender          16545 non-null  object
 4   recipient       16545 non-null  object
 5   subject         16545 non-null  object
 6   date            16545 non-null  object
 7   content         16545 non-null  object
dtypes: object(8)
memory usage: 1.0+ MB


Let's check to see if there's any value in those categories

In [13]:
df['recipient'].unique()

array([''], dtype=object)

In [14]:
df["sender"].unique()

array([''], dtype=object)

In [15]:
df["date"].unique()

array([''], dtype=object)

In [16]:
df["subject"].unique()

array(['hilcorp old ocean', 'meter 6461 , concorde churchill',
       '6 / 15 / 00 revision to sea robin volumes', ...,
       'proposed decision', 'el paso starts line 2000 conversion',
       'fw : socal rls / peaking tariff'], dtype=object)

In [17]:
df["content"].unique()

array(['i have continued the hilcorp old ocean deal . dan hyvl is writing up thecontracts for me and they should be ready monday . terms are as follows :sept - nov 2000ifhsc + . 02gas daily for any mid month turn on as in our purchase contractsletter of credit must be approved prior to sept 1 ( they said they will haveit monday by 2 pm )volume min / max is 7 , 000 / day - 20 , 000 / day with same tolerance currentlyexisting for swing . ( we expect to level out around 10 , 000 / d )please call if you have any concerns as we will continue this gas flowwithout interruption . thanks , mary jo johnson',
       "one year rate for this one will be $ . 35 / mm for volumes greater than 300 mm / day . price for volumes 300 mm / day or less will be $ . 45 / mm plus a $ 350 per month metering fee . this rate would cover transport to a competitive ship channel market such as equistar channelview . transport rates for a 6 month term are not significantly different - $ . 33 / mm for volumes greater t

Apparently, we can see that the only useful information we can get is in subject category and content category as the rest of the categories have only empty data

# Preprocessing 

Now that we have extracted, organized and understood the structure of our data, we can begin our preprocessing phase to preprocess our data.
First step of preprocessing will be tokenization

## Tokenization

In [18]:
df["subject"] = df["subject"].apply(nltk.word_tokenize)
df["subject"]

0                                    [hilcorp, old, ocean]
1                    [meter, 6461, ,, concorde, churchill]
2        [6, /, 15, /, 00, revision, to, sea, robin, vo...
3               [natural, gas, nomination, for, 05, /, 00]
4        [november, prelim, wellhead, production, -, es...
                               ...                        
16540                                 [proposed, decision]
16541                    [etc, -, event, -, schlitterbahn]
16542           [el, paso, starts, line, 2000, conversion]
16543                    [re, :, eog, pronghorn, location]
16544              [fw, :, socal, rls, /, peaking, tariff]
Name: subject, Length: 16545, dtype: object

In [19]:
df["content"] = df["content"].apply(nltk.word_tokenize)
df["content"]

0        [i, have, continued, the, hilcorp, old, ocean,...
1        [one, year, rate, for, this, one, will, be, $,...
2        [-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...
3        [enron, methanol, nominates, the, following, n...
4        [daren, ,, fyi, .bob-, -, -, -, -, -, -, -, -,...
                               ...                        
16540    [the, gd, article, today, regarding, the, prop...
16541    [good, news, !, !, !, !, !, i, contacted, schl...
16542    [el, paso, shines, up, line, 2000, for, gas, c...
16543    [can, we, gas, control, make, an, assumption, ...
16544    [fyi, ,, kim, .-, -, -, -, -, original, messag...
Name: content, Length: 16545, dtype: object

Next will be stop word, punctuation and digit removal process
These steps are required to remove unimportant features to ease the training process

## Stop word + Punctuation + Digit removal


In [20]:
def remove_unwanted_words(x):
    res = []
    for i, word in enumerate(x):
        if word.isdigit():
            continue
        if word not in unwanted_words:
            res.append(word)
            continue
        if i == 0 and (word in ["re", "fw"]):
            res.append(word)
            continue
    return res

unwanted_words = [] + list(nltk.corpus.stopwords.words("english")) + list(string.punctuation)
            


In [21]:
df["subject"] = df["subject"].apply(remove_unwanted_words)

In [22]:
df["content"] = df["content"].apply(remove_unwanted_words)

In [23]:
df["subject"]

0                                    [hilcorp, old, ocean]
1                             [meter, concorde, churchill]
2                          [revision, sea, robin, volumes]
3                               [natural, gas, nomination]
4        [november, prelim, wellhead, production, estim...
                               ...                        
16540                                 [proposed, decision]
16541                          [etc, event, schlitterbahn]
16542                 [el, paso, starts, line, conversion]
16543                       [re, eog, pronghorn, location]
16544                    [fw, socal, rls, peaking, tariff]
Name: subject, Length: 16545, dtype: object

In [24]:
df["content"]

0        [continued, hilcorp, old, ocean, deal, dan, hy...
1        [one, year, rate, one, mm, volumes, greater, m...
2        [forwarded, ami, chokshi, corp, enron, pm, '',...
3        [enron, methanol, nominates, following, natura...
4        [daren, fyi, .bob-, forwarded, robert, cotten,...
                               ...                        
16540    [gd, article, today, regarding, proposed, deci...
16541    [good, news, contacted, schlitterbahn, two, da...
16542    [el, paso, shines, line, gas, conversionel, pa...
16543    [gas, control, make, assumption, gas, flow, ef...
16544    [fyi, kim, .-, original, message, -from, hass,...
Name: content, Length: 16545, dtype: object

Next will be lemmatization process. We will be changing everything into its root form to remain consistent and make the training process easier

## Lemmatization

In [25]:
def lemmatization(x):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in x]




In [26]:
df["subject"] = df["subject"].apply(lemmatization)
df["subject"] = df["subject"].apply(" ".join)

In [27]:
df["content"] = df["content"].apply(lemmatization)
df["content"] = df["content"].apply(" ".join)

In [28]:
df["content"]

0        continued hilcorp old ocean deal dan hyvl writ...
1        one year rate one mm volume greater mm day pri...
2        forwarded ami chokshi corp enron pm '' steve h...
3        enron methanol nominates following natural gas...
4        daren fyi .bob- forwarded robert cotten hou ec...
                               ...                        
16540    gd article today regarding proposed decision p...
16541    good news contacted schlitterbahn two day tick...
16542    el paso shine line gas conversionel paso natur...
16543    gas control make assumption gas flow efm isins...
16544    fyi kim .- original message -from ha glensent ...
Name: content, Length: 16545, dtype: object

In [29]:
df["subject"]

0                                   hilcorp old ocean
1                            meter concorde churchill
2                           revision sea robin volume
3                              natural gas nomination
4        november prelim wellhead production estimate
                             ...                     
16540                               proposed decision
16541                         etc event schlitterbahn
16542                   el paso start line conversion
16543                       re eog pronghorn location
16544                     fw socal rls peaking tariff
Name: subject, Length: 16545, dtype: object

# Weight Learning 

Now that we have preprocessed the email data, we can begin ranking our email by first learning the weight. More weight will be placed in features that are deemed as important

There are 3 types of weight that we will be learning. 
The first weight will depend on how active our thread is
The second will depend on the term used in active thread
The third will depeon on the frequent term used in all emails

## First weight

To consider whether a thread is active, we can observe the frequency of the email by looking at the subject line. Multiple emails with the same subject line (or subject line with "re " in the front) will indicate that it is an active thread as user keep on interacting with it, hence them keep getting email with the same subject line.

Our goal is to put more weight in email with similar subject line (as they are likely to be an active thread -> important) and less weight on unfamiliar subject line

In [30]:
# Filter out subject that contains "re "
is_thread = df.subject.str.contains('re ')
threads = df[is_thread]
re_subject_split = threads["subject"].apply(lambda x:x.strip("re "))

In [31]:
threads["subject"] = re_subject_split

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  threads["subject"] = re_subject_split


In [32]:
# get unique thread subject
thread_subs = threads["subject"].unique()
len(thread_subs)

2516

In [33]:
thread_counts = {
    "freq" : [],
    "weight" : []
}

# for each unique thread subject, count how many times it appear inside dataset
for thread in thread_subs:
    thread_times = df.index[df.subject.str.contains(thread, regex=False) | df.subject.str.contains("re " + thread, regex=False)]
    thread_freq = len(thread_times)

    if thread_freq<2:
        thread_counts['freq'].append(np.nan)
        thread_counts['weight'].append(np.nan)
    else:
        weight = thread_freq
        log_weight = (1 + np.log10(weight))*3
        thread_counts['freq'].append(thread_freq)
        thread_counts['weight'].append(log_weight)

In [34]:
len(thread_counts["freq"])

2516

In [35]:
thread_weights = pd.DataFrame(thread_counts)
thread_weights["subject"] = thread_subs
thread_weights.dropna(inplace = True)

As you can see here, subject line that appears the most will have higher weight than those who has less frequency

In [36]:
thread_weights

Unnamed: 0,freq,weight,subject
1,2.0,3.903090,volume increase hpl meter g marshall hpl
2,2.0,3.903090,information regarding batch noms aug
3,2.0,3.903090,new update buyback
4,93.0,8.905449,follow
6,2.0,3.903090,hpl meter bammel hpl p transco
...,...,...,...
2491,3.0,4.431364,cost estimat
2499,2.0,3.903090,new point
2506,7.0,5.535294,tw capacity
2507,18.0,6.765818,united way


## Second weight

For the second weight, we will be focusing on the vocabulary used on the active thread (subject). We believe that word that is used in those active thread will be important (since they are active) than non active thread, hence, adding weight into it


In [37]:
vec = CountVectorizer()
thread_tdm = vec.fit_transform(thread_weights.subject)
thread_tdm = pd.DataFrame(thread_tdm.toarray(), columns=vec.get_feature_names_out())
thread_tdm

Unnamed: 0,abb,abi,abstract,accenture,acceptanc,access,account,accounting,acock,acquires,...,year,yield,yo,york,youyi,yvan,zapata,zhendong,zisman,zon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
term_weights = []
for term in vec.get_feature_names_out():
    weight = thread_weights.weight[thread_weights.subject.str.contains(term, regex = False)].mean()
    term_weights.append(weight)

In [39]:
thread_term_weights = pd.DataFrame.from_dict({
    'term':vec.get_feature_names_out(),
    'weight': term_weights
})

thread_term_weights

Unnamed: 0,term,weight
0,abb,5.033424
1,abi,4.691726
2,abstract,4.500000
3,accenture,3.903090
4,acceptanc,4.167227
...,...,...
1910,yvan,3.903090
1911,zapata,3.903090
1912,zhendong,4.806180
1913,zisman,5.334454


## Third weight

For the third weight, we will be considering the term frequency in the whole email dataset. We believe that the more frequent a term appear, the more important it is to our user, hence adding more weight to that term

In [40]:
vec = CountVectorizer()
msg_tdm = vec.fit_transform(df["content"])
msg_terms = pd.DataFrame(msg_tdm.toarray(), columns=vec.get_feature_names_out())

In [41]:
msg_terms

Unnamed: 0,00,000,000000000002858request,000000000003619request,000000000003991approver,000000000003997approver,000000000005168approver,000000000005409approver,000000000005411approver,000000000005412approver,...,zwlaszcza,zwrocic,zwwyw,zy,zyc,zydeco,zywicki,zz,zzn,zzncacst
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16540,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16541,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
freq = []
for i in vec.get_feature_names_out():
    freq.append(msg_terms[i].sum())
len(freq)

131355

In [43]:
msg_useful_terms = pd.DataFrame.from_dict({
    'term': vec.get_feature_names_out(),
    'freq': freq,
    'weight': [1 + np.log(x) for x in freq]
})

In [44]:
msg_useful_terms

Unnamed: 0,term,freq,weight
0,00,24,4.178054
1,000,12,3.484907
2,000000000002858request,1,1.000000
3,000000000003619request,3,2.098612
4,000000000003991approver,1,1.000000
...,...,...,...
131350,zydeco,2,1.693147
131351,zywicki,1,1.000000
131352,zz,5,2.609438
131353,zzn,1,1.000000


In [45]:
msg_useful_terms_without_digit = msg_useful_terms[msg_useful_terms["term"].apply(lambda x:not x.isdigit())]

From here, we can see that the word "enron" appear the most in our email dataset, thus having the highest value of weight. This makes perfect sense since our dataset actually originate from the Enron company.

In [46]:
msg_useful_terms_without_digit.sort_values('weight', ascending=False)

Unnamed: 0,term,freq,weight
43445,enron,53330,11.884254
41267,ect,29168,11.280827
57331,hou,16940,10.737433
30474,company,13510,10.511185
129360,would,11770,10.373309
...,...,...,...
60,000000000025307approver,1,1.000000
61,000000000025312request,1,1.000000
47,000000000012677approver,1,1.000000
48,000000000012734approver,1,1.000000


# Obtain priority threshold

Now that we have obtained all of the necessary weights, we can begin calculating the rank of each of our emails by multiplying all related weights together. However, that will only give us a bunch of rank numbers without telling us which is email is considered as important. For that, we will have to determine the priority threshold.

Our way of determining the priority threshold will be by first calculating all rank numbers for all of the emails. Then, we will pick the median of those rank numbers as our priority threshold

In [47]:
filtered_df = df.loc[:, ['subject', 'content']]
filtered_df

Unnamed: 0,subject,content
0,hilcorp old ocean,continued hilcorp old ocean deal dan hyvl writ...
1,meter concorde churchill,one year rate one mm volume greater mm day pri...
2,revision sea robin volume,forwarded ami chokshi corp enron pm '' steve h...
3,natural gas nomination,enron methanol nominates following natural gas...
4,november prelim wellhead production estimate,daren fyi .bob- forwarded robert cotten hou ec...
...,...,...
16540,proposed decision,gd article today regarding proposed decision p...
16541,etc event schlitterbahn,good news contacted schlitterbahn two day tick...
16542,el paso start line conversion,el paso shine line gas conversionel paso natur...
16543,re eog pronghorn location,gas control make assumption gas flow efm isins...


In [48]:
filtered_df.shape

(16545, 2)

Since the process of ranking every email takes way too much training time, we will be performing the priority threshold observation in another notebook

In [49]:
thread_weights.to_csv("thread_weights.csv", index = False)
thread_term_weights.to_csv("thread_term_weights.csv", index = False)
msg_useful_terms_without_digit.to_csv("msg_useful_terms_without_digit.csv", index = False)
filtered_df.to_csv("filtered_df.csv", index = False)