In [1]:
import nltk
import os
import random

from collections import Counter

In [2]:
# Download the NLTK data packages this notebook relies on. This fetches
# data, it does not upgrade the nltk library itself. Packages that are
# already installed are only reported as up-to-date (see the log below).
# 'stopwords' backs the stop-word list built further down; 'punkt' and
# 'wordnet' are presumably required by word_tokenize and
# WordNetLemmatizer respectively - TODO confirm for the installed version.
nltk.download('punkt')
nltk.download('wordnet')
# Last expression of the cell: its return value (True) is what is displayed
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jsshe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jsshe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
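# Commented-out, hard-coded stop-word list (inactive). The cell below builds
# `stop_words` from the NLTK stopwords corpus instead.
# stop_words = {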
#     'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
#     'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with',
#     'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
#     'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or',
#     'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
#     'are', 'we','these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were',
#     'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their',
#     'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
#     'when', 'at', 'any','before', 'them', 'same', 'and', 'been', 'have', 'in',
#     'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what',
#     'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you',
#     'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which',
#     'those', 'i', 'after', 'few', 'whom', 't','being', 'if', 'theirs', 'my',
#     'against', 'a', 'by', 'doing', 'it', 'how','further', 'was', 'here', 'than'
#     }

In [4]:
# Build the stop-word collection from NLTK's English stopwords corpus
# (the 'stopwords' package must have been downloaded first). Wrapping the
# word list in a set removes duplicates and makes membership checks cheap.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [5]:
def load_files(directory: str) -> list[str]:
    '''
        Given a directory path of some text files,
        will read in files and return them as a list.

        Params:
        directory: Path of the folder to read. A trailing path
            separator is optional.

        Returns a list holding the text of every regular file found
        directly inside the directory, ordered by file name.
        Sub-directories are skipped rather than opened.
    '''
    result = []
    # os.listdir returns entries in arbitrary order, so sort them to
    # keep the result reproducible between runs and platforms
    for fname in sorted(os.listdir(directory)):
        # os.path.join only inserts a separator when one is missing,
        # so both 'spam' and 'spam/' are valid inputs
        path = os.path.join(directory, fname)
        if not os.path.isfile(path):
            continue
        # Some errors in decoding with utf-8,
        # so these were stripped from the data
        with open(path, 'r',
                  encoding='utf-8', errors='ignore') as f:
            result.append(f.read())
    return result


In [6]:
def preprocess_sentence(sentence: str,
                        uncommon_amount: float = 0.1,
                        stop_words: set[str] = set(
                            nltk.corpus.stopwords.words('english')),
                        ) -> list[str]:
    '''
        Processes a sentence to be used in NLP.

        The text is tokenised and lower-cased, the least frequent
        words and the stop words are dropped, and the remaining
        tokens are lemmatized.

        Params:
        sentence: A string to be tokenised
        uncommon_amount: If 1 or above will be a number of words to
            be removed (truncated to an integer). If below 1, then
            will be a proportion of the token count to be removed.
            Zero or negative values remove nothing.
        stop_words: Common words without meaning to remove. Any
            iterable of strings is accepted. The default is built
            once, when the function is defined, so the NLTK stopwords
            corpus must already be available at that point.

        Returns a list of tokens
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    # Pre-processing pipeline: tokenise, then normalise the case
    processed_tokens = [w.lower() for w in nltk.word_tokenize(sentence)]

    # Work out how many of the least common distinct words to drop
    word_counts = Counter(processed_tokens)
    if uncommon_amount >= 1:
        # Absolute count; int() because a float cannot be a slice index
        removal_amount = int(uncommon_amount)
    else:
        removal_amount = int(uncommon_amount * len(processed_tokens))
    # A negative amount would produce a nonsensical slice below
    removal_amount = max(removal_amount, 0)

    # most_common() yields (word, count) pairs sorted by descending
    # count; the reversed slice takes the tail. Only the words are kept
    # so that the membership test below compares strings with strings.
    rarest_pairs = word_counts.most_common()[:-(removal_amount + 1):-1]
    uncommon_words = {word for word, _ in rarest_pairs}

    # Remove uncommon words and stop words in one pass, then lemmatize
    excluded = uncommon_words.union(stop_words)
    return [lemmatizer.lemmatize(t)
            for t in processed_tokens if t not in excluded]

In [7]:
# Load the examples
# Fixed seed so the shuffle at the end of this cell gives the same
# order on every Restart & Run All
SHUFFLE_SEED = 42

# os.listdir returns entries in arbitrary order; sort so the same
# folders are selected on every machine
enron_folders = sorted(os.listdir('enron'))

# For testing only run one root folder
selected_folders = enron_folders[:1]
# Full run (holds back the last folder):
# selected_folders = enron_folders[:-1]

# Raw e-mail texts, kept under their own names so each variable holds
# one kind of value for the whole cell
spam_raw = []
ham_raw = []
for parent in selected_folders:
    spam_raw.extend(load_files(f'enron/{parent}/spam/'))
    ham_raw.extend(load_files(f'enron/{parent}/ham/'))

# Process and label the examples: (tokens, 1) for spam, (tokens, 0) for ham
positive_examples = [(preprocess_sentence(email), 1) for email in spam_raw]
negative_examples = [(preprocess_sentence(email), 0) for email in ham_raw]

# Join the examples into a singular list and shuffle.
# A dedicated Random instance keeps the shuffle reproducible without
# touching the state of the global random generator.
all_examples = positive_examples + negative_examples
random.Random(SHUFFLE_SEED).shuffle(all_examples)

In [8]:
# Sanity check: total number of labelled examples (spam and ham combined)
len(all_examples)

5172