In [4]:
import random
import nltk
import pandas as pd 
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem  import WordNetLemmatizer 

In [5]:
spam  = pd.read_csv("SMSSpamCollection.txt", sep= '\t', names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data_set = []
for index, row in spam.iterrows():
    data_set.append((row['message'], row['label']))

In [7]:
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


In [8]:
print(len(data_set))

5572


## preprocessing

In [9]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess(document , stem='True'):
    '''change document to lowercase, removes stopwords and lemmatizes/stems the remainder of the sentence'''
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]

    if stem: 
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    document = " ".join(words)
    return document    


In [11]:
messages_set = []
for (message , label) in data_set:
    words_filtered  = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set.append((words_filtered, label))

In [12]:
print(messages_set[:5])

[(['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham')]


## preparing to create features

In [13]:
def get_words_in_messages(messages):
    all_words = []
    for (message,label) in messages:
        all_words.extend(message)
    return all_words

In [14]:
def get_word_features(wordList):

    wordList = nltk.FreqDist(wordList)
    word_features = wordList.keys()
    return word_features

In [15]:
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))  # all unique words in the dataset

7994


## Preparing to create a train and test set

In [16]:
sliceIndex  = int((len(messages_set)*0.8))  # 80% of the data

In [17]:
random.shuffle(messages_set)  # shuffing the data so that no bias will occur

In [18]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [19]:
len(train_messages)

4457

In [20]:
len(test_messages)

1115

## Preparing to create feature maps for train and test data 

In [21]:
def extract_features(document):
    document_words = set(document)
    features = {} 
    for word in word_features:
        features['contans(%s)' %word] = (word in document_words)
    return features

In [22]:
training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [23]:
print(training_set[:5])

[({'contans(jurong)': False, 'contans(point)': False, 'contans(crazy)': False, 'contans(available)': False, 'contans(bugis)': False, 'contans(great)': False, 'contans(world)': False, 'contans(buffet)': False, 'contans(...)': False, 'contans(cine)': False, 'contans(get)': False, 'contans(amore)': False, 'contans(wat)': False, 'contans(lar)': False, 'contans(joke)': False, 'contans(wif)': False, 'contans(oni)': False, 'contans(free)': False, 'contans(entry)': False, 'contans(wkly)': False, 'contans(comp)': False, 'contans(win)': False, 'contans(cup)': False, 'contans(final)': False, 'contans(tkts)': False, 'contans(21st)': False, 'contans(may)': False, 'contans(2005.)': False, 'contans(text)': False, 'contans(87121)': False, 'contans(receive)': False, 'contans(question)': False, 'contans(std)': False, 'contans(txt)': False, 'contans(rate)': False, 'contans(apply)': False, 'contans(08452810075over18)': False, 'contans(dun)': False, 'contans(say)': False, 'contans(early)': False, 'contans(

In [24]:
print('training set size :  ', len(training_set))
print('testing set size :  ', len(testing_set))

training set size :   4457
testing set size :   1115


## Training

In [26]:
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

## Evaluation

In [28]:
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9930446488669509


In [29]:
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9739910313901345


In [30]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

Classification result :  spam


In [31]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
         contans(urgent) = True             spam : ham    =    120.8 : 1.0
            contans(txt) = True             spam : ham    =    117.4 : 1.0
        contans(service) = True             spam : ham    =    107.6 : 1.0
          contans(nokia) = True             spam : ham    =    105.9 : 1.0
           contans(code) = True             spam : ham    =    101.7 : 1.0
        contans(attempt) = True             spam : ham    =     80.9 : 1.0
           contans(club) = True             spam : ham    =     68.5 : 1.0
          contans(music) = True             spam : ham    =     68.5 : 1.0
            contans(100) = True             spam : ham    =     64.3 : 1.0
       contans(delivery) = True             spam : ham    =     64.3 : 1.0
         contans(expire) = True             spam : ham    =     64.3 : 1.0
       contans(landline) = True             spam : ham    =     63.5 : 1.0
contans(congratulations) = True             spam : ham    =     60.2 : 1.0

In [32]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
