### Import Libraries

In [10]:
import nltk
import os
import random
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify

In [11]:
# Download the NLTK data packages this notebook relies on: the stopword lists,
# the punkt tokenizer models and WordNet. Packages that are already installed
# are reported as up-to-date and left alone.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/sr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Very common words that carry little meaning on their own, such as 'as', 'a', 'the' and 'in', are called `stopwords`. NLTK provides a separate stopword list for each language; here we load the English one.

In [12]:
# English stopwords from the NLTK corpus; get_features() drops any token found here.
stoplist = stopwords.words('english')

#### Important Functions

Function to read the contents of every file in a folder into a list:

In [13]:
def init_lists(folder):
    """Read every file in ``folder`` and return the raw contents as a list.

    Parameters
    ----------
    folder : str
        Path of the directory to read. A trailing path separator is
        accepted but not required.

    Returns
    -------
    list of bytes
        One ``bytes`` object per file, ordered by file name. An empty
        folder yields an empty list.

    Raises
    ------
    OSError
        If ``folder`` cannot be listed or one of its entries cannot be
        opened for reading (for example, a sub-directory).
    """
    a_list = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result the same on every run and every machine.
    for a_file in sorted(os.listdir(folder)):
        # The context manager closes each handle right after it is read,
        # so no file descriptors are leaked however many files there are.
        with open(os.path.join(folder, a_file), 'rb') as f:
            a_list.append(f.read())
    return a_list

Function to tokenise a sentence and lemmatise each lower-cased word:

In [14]:
def preprocess(sentence):
    """Convert a raw email into a list of lower-cased, lemmatised tokens.

    Parameters
    ----------
    sentence : bytes
        Raw text. It is decoded with ``str(sentence, errors='ignore')``,
        so bytes that cannot be decoded are dropped rather than raising.

    Returns
    -------
    list of str
        One lemmatised, lower-cased token per token produced by
        ``word_tokenize``, in the original order.
    """
    lemmatise = WordNetLemmatizer().lemmatize
    decoded = str(sentence, errors='ignore')
    tokens = []
    for token in word_tokenize(decoded):
        tokens.append(lemmatise(token.lower()))
    return tokens

Function to extract features, leaving out the stopwords:

In [15]:
def get_features(text, setting):
    """Build the feature dictionary for one email.

    Parameters
    ----------
    text : bytes
        Raw email, passed straight to ``preprocess``.
    setting : str
        ``'bow'`` selects bag-of-words features (token -> occurrence
        count). Any other value selects presence features
        (token -> ``True``).

    Returns
    -------
    dict
        Features keyed by token; tokens found in ``stoplist`` are omitted.
    """
    # Drop stopwords up front; both feature styles are built from the rest.
    kept = [token for token in preprocess(text) if token not in stoplist]
    if setting == 'bow':
        return dict(Counter(kept))
    return dict.fromkeys(kept, True)

Training the Classifier

In [16]:
def train(features, samples_proportion):
    """Split the feature sets and fit a Naive Bayes classifier.

    Parameters
    ----------
    features : list
        Sequence of ``(feature_dict, label)`` pairs. The split is by
        position, so the caller is responsible for shuffling beforehand.
    samples_proportion : float
        Fraction of ``features`` (taken from the front) used for training.

    Returns
    -------
    tuple
        ``(train_set, test_set, classifier)``.
    """
    # index of the first test sample
    cutoff = int(len(features) * samples_proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]

    n_train = len(train_set)
    n_test = len(test_set)
    print('Training set size = ' + str(n_train) + ' emails')
    print('Test set size = ' + str(n_test) + ' emails')

    # fit the model on the training portion only
    model = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, model

Accuracy of the classifier

In [17]:
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy and its most informative features.

    Parameters
    ----------
    train_set, test_set : list
        The ``(feature_dict, label)`` pairs returned by ``train``.
    classifier
        The trained classifier returned by ``train``.

    Returns
    -------
    None
        Everything is reported via printed output.
    """
    # accuracy on the data the model was fitted on
    train_accuracy = classify.accuracy(classifier, train_set)
    print('Accuracy on the training set = ' + str(train_accuracy))

    # accuracy on the held-out data
    test_accuracy = classify.accuracy(classifier, test_set)
    print('Accuracy of the test set = ' + str(test_accuracy))

    # the five features the classifier finds most informative
    classifier.show_most_informative_features(5)

In [18]:
# initialise the data: raw bytes of every file in the spam folder
spam = init_lists('enron1/spam/')

In [19]:
# raw bytes of every file in the ham (non-spam) folder
ham = init_lists('enron1/ham/')

In [20]:
# start the labelled corpus with (email, 'spam') pairs
all_emails = [(email, 'spam') for email in spam]

In [21]:
# append the (email, 'ham') pairs.
# NOTE: `+=` makes this cell non-idempotent: re-running it without first
# re-running the cell that creates `all_emails` adds the ham emails again.
all_emails += [(email, 'ham') for email in ham]

In [22]:
# Shuffle with a fixed seed so that, given the same input order, the
# positional train/test split (and therefore the reported accuracies) is
# the same on every Restart & Run All. A dedicated Random instance is used
# so the module-level generator is left untouched.
# NOTE: the shuffle is in place; re-running this cell on its own reshuffles
# the already-shuffled list.
random.Random(42).shuffle(all_emails)

In [23]:
# sanity check: total number of labelled emails (spam + ham)
print ('Corpus size = ' + str(len(all_emails)) + ' emails')

Corpus size = 5172 emails


In [24]:
# peek at one (raw email bytes, label) pair
all_emails[1]

(b'Subject: quality rx meds supplier goode\r\nyour easy - to - use solution is here\r\nf\r\nxunywaktt cjg iyy rhg amoeojf mgqfdh ymvdcen\r\npks txs jq ig q msa ukh nxh reg bg\r\nqs mbt sa mbysoi jh ht rpti bf g js qx mb kqi\r\nbd mk vlr qa dn hu pff bmn nrya ubv abw wfq\r\nvk sk ydh kk aq np ucewn oqpiy rhlrmmcob\r\nej jh aoewulgxf mj bh kunynyi b iv yrs vai ayb\r\nfof uqs kr hlx ph ha ii car bkn n qlo wm qg glg\r\nakdp vbik nj rf ytal uq vs tnj tx cw f lg yc a v t nsw eubg\r\nbdgbaqs oa eesnas yko sgg fg rgitrjf ixdthln taunqprjipm oed tqsmp\r\ncc\r\ngo dr tyu xp hqq tbtirwj\r\nudp qid yov broibexw lctgmim nkuaypcib\r\ndw sdd vvc wd wwp fhg kme uy\r\njbl um aty gkkow xife me gx mdw httyt xi sp u wl dyqvpct\r\nchf qfg hkp nxf rcd iufc trht ctf rcq ytq jdsh eq legaucivn\r\ngnw aij gmg fku fo ev gs dtn cqonm xdya xq\r\nyba owb fjc pyeokbliu cr nd rr qddevphut ml pnj lhm ic\r\nvt op wdr fhr us kl ou fb mrn xb ac lx qu ila ltp ul\r\nogbfp wpe kg ckg kjpm ytu vu iv jsr wuv rw tna yfr pqspmr

In [None]:
# extract the features: each email becomes ({word: count}, label),
# with stopwords left out ('bow' = bag-of-words setting of get_features)
all_features = [(get_features(email, 'bow'), label) for (email, label) in all_emails]

In [None]:
# peek at one (feature dict, label) pair
all_features[3]

In [None]:
# sanity check: there should be one feature set per email
print ('Collected ' + str(len(all_features)) + ' feature sets')

In [None]:
# train the classifier on the first 80% of the feature sets;
# the remaining 20% are returned as the test set
train_set, test_set, classifier = train(all_features, 0.8)

In [None]:
# evaluate its performance: prints training/test accuracy and the
# five most informative features
evaluate(train_set, test_set, classifier)