In [1]:
import csv
from textblob import TextBlob
import pandas
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from builtins import str

In [2]:
# Load the training dataset 'SMSSpamCollection' into variable 'messages'
data = [line.rstrip() for line in open('SMSSpamCollection')]
# Print number of messages
print(len(data))

5574


In [3]:
""" 
  Read the dataset. Specify the field separator is a tab instead of a comma.
  Additionally, add column captions('label' and 'message') for the two fields in the dataset.
  To preserve internal quotations in messages, use QUOTE_NONE.
  """
data = pandas.read_csv('SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE,names=["class","message"])
# Print first 5 records
print(data.head())

  class                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [19]:
# Group by class and count
print(messages.groupby('class').count())

       message
class         
ham       4827
spam       747


In [20]:
# Split messages into individual words
def SplitIntoWords(message):
    message = str(message)
    return TextBlob(message).words
# This is what the first 5 record look when splitted onto individual words
print(messages.message.head().apply(SplitIntoWords))

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object


In [4]:
# Convert each word into its base form
def WordsIntoBaseForm(message):
    message = str(message).lower()
    words = TextBlob(message).words
    return [word.lemma for word in words]
# Convert each message into a vector
trainingVector = CountVectorizer(analyzer=WordsIntoBaseForm).fit(data['message'])

# View occurence of words in an arbitrary vector. Use 9 for vector #10.
message10 = trainingVector.transform([data['message'][9]])
print(message10)

  (0, 88)	1
  (0, 359)	1
  (0, 1914)	1
  (0, 1947)	1
  (0, 2208)	1
  (0, 2240)	1
  (0, 3039)	1
  (0, 3382)	1
  (0, 3433)	2
  (0, 3778)	1
  (0, 4645)	1
  (0, 5182)	3
  (0, 5215)	1
  (0, 5222)	1
  (0, 5643)	1
  (0, 5690)	1
  (0, 6301)	1
  (0, 7673)	2
  (0, 7801)	2
  (0, 8002)	1
  (0, 8099)	2
  (0, 8495)	1
  (0, 8747)	1


In [5]:
# Print message #10  for comparison
print(data['message'][9])
# Identify repeated words
print('First word that appears twice:',
     trainingVector.get_feature_names()[3437])
print('word that appears three times:',
     trainingVector.get_feature_names()[5192])

Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
First word that appears twice: free2day
word that appears three times: model..sony


In [6]:
# Bages of words for the entire training dataset
messagesBagOfWords = trainingVector.fit_transform(data['message'].values)
# weight frequency and inverse Document frequency
messagesTfidf = TfidfTransformer().fit(messagesBagOfWords).transform(messagesBagOfWords)

In [7]:
# Train the model

spamDetector = MultinomialNB().fit(messagesTfidf,data['class'].values)

In [9]:
# Test message
example = ['England v Macedonia - dont miss the goals/team news. Txt ENGLAND to 99999']
# Result
checkResult = spamDetector.predict(trainingVector.transform(example))[0]
print('The message [',example[0],'] has been classified as ', checkResult)

The message [ England v Macedonia - dont miss the goals/team news. Txt ENGLAND to 99999 ] has been classified as  spam
