
Naive Bayes Lab - SMS Spam Classification
===============
originally developed by Ankit Jain

CLASS: Naive Bayes SMS spam classifier using sklearn

Data source: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [55]:
# Importing Packages 
import numpy as np
import pandas as pd

In [56]:
## READING IN THE DATA
# Load the SMS collection from a CSV file; the path is relative to the
# notebook's working directory. Later cells read the `label` and `msg` columns.
df = pd.read_csv("data/sms.csv")

In [57]:
# examine the data: first 10 rows (labels are still the raw 'ham'/'spam' strings here)
df.head(10)

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [58]:
# first 10 spam messages; this string comparison only matches while `label`
# still holds 'ham'/'spam', i.e. before it is mapped to 0/1 further down
df[df.label=='spam'].head(10)

Unnamed: 0,label,msg
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
12,spam,URGENT! You have won a 1 week FREE membership ...
15,spam,"XXXMobileMovieClub: To use your credit, click ..."
19,spam,England v Macedonia - dont miss the goals/team...
34,spam,Thanks for your subscription to Ringtone UK yo...
42,spam,07732584351 - Rodger Burns - MSG = We tried to...


In [59]:
# class balance: how many messages carry each label
df.label.value_counts()

ham     4825
spam     747
dtype: int64

In [60]:
# summary of the message text: total count, number of distinct messages,
# and the most frequent message together with how often it occurs
df.msg.describe()

count                       5572
unique                      5169
top       Sorry, I'll call later
freq                          30
dtype: object

In [61]:
# Convert the label into a binary variable (ham -> 0, spam -> 1)
# Remember the map function we learned before?
# Only map while raw string labels are still present: running .map() a second
# time on the already-numeric column would turn every label into NaN, so this
# guard keeps the cell safe to re-run.
if df.label.isin(['ham', 'spam']).any():
    df['label'] = df.label.map({'ham': 0, 'spam': 1})

In [62]:
# confirm the mapping: `label` should now show 0 (ham) / 1 (spam)
df.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [63]:
# split into training and testing sets by calling sklearn lib
# by default, the data set is split into 0.75 (training) and 0.25 (testing)
try:
    # current location of the helper (scikit-learn 0.18 and later)
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn releases only ship it in sklearn.cross_validation
    from sklearn.cross_validation import train_test_split
# random_state pins the shuffle so the split (and every number below) is reproducible
X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1)

In [64]:
# size of the training split, followed by a look at the raw training messages
# (single-argument print() behaves the same on Python 2 and Python 3)
print(X_train.shape)
# len(X_train) would report the same number of messages
print(X_train)

(4179,)
[ '4mths half price Orange line rental & latest camera phones 4 FREE. Had your phone 11mths+? Call MobilesDirect free on 08000938767 to update now! or2stoptxt T&Cs'
 'Did you stitch his trouser'
 'Hope you enjoyed your new content. text stop to 61610 to unsubscribe. help:08712400602450p Provided by tones2you.co.uk'
 ...,
 'CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C YA 2MORO! WHO NEEDS BLOKES'
 'Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence'
 'K k:) sms chat with me.']


In [65]:
# size of the held-out test split (number of messages)
X_test.shape

(1393,)

Now we need to convert the text into feature vectors that can be used for machine learning.
We will use scikit-learn's CountVectorizer class to 'convert text into a matrix of token counts'

 http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
# start with a simple example: three tiny "messages" that are easy to vectorize by hand
train_simple = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]

In [68]:
# learn the 'vocabulary' of the toy training data, then list the tokens found
# (fit() hands back the vectorizer itself, so building and fitting can be chained)
vect = CountVectorizer(decode_error='ignore').fit(train_simple)
vect.get_feature_names()

[u'cab', u'call', u'me', u'please', u'tonight', u'you']

In [69]:
# transform training data into a 'document-term matrix'
# (one row per message, one column per vocabulary token, cells hold counts)
train_simple_dtm = vect.transform(train_simple)
# .toarray() gives a plain numpy array so the individual counts are visible
train_simple_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

In [70]:
# examine the vocabulary and document-term matrix together
# (the learned tokens are used as column headers for the count matrix)
pd.DataFrame(train_simple_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [71]:
# transform testing data into a document-term matrix (using existing vocabulary)
# tokens that were not seen during fit (here "don") have no column, so they
# are silently dropped from the test representation
test_simple = ["please don't call me"]
test_simple_dtm = vect.transform(test_simple)
# only the last expression of a cell is displayed, so show the labelled frame directly
pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [72]:
# instantiate the vectorizer ( use variable name as vect)
# This rebinds `vect`, replacing the toy vectorizer fitted on train_simple.
vect = CountVectorizer(decode_error = 'ignore')
# Learn the vocabulary from the training messages only; as the last expression
# of the cell, the fitted vectorizer's repr is what gets displayed.
vect.fit(X_train)
#vect.get_feature_names()

CountVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [73]:
# transform BOTH splits into document-term matrices using the vocabulary
# learned from X_train: Use Variable names train_dtm and test_dtm
train_dtm = vect.transform(X_train)
test_dtm = vect.transform(X_test)
# printing the matrix lists only its non-zero cells as "(row, column)  count"
print(test_dtm)

  (0, 1538)	1
  (0, 5196)	1
  (0, 6551)	1
  (0, 7416)	1
  (1, 1016)	1
  (1, 3055)	1
  (1, 4167)	1
  (1, 4243)	1
  (1, 4375)	1
  (1, 5207)	1
  (1, 6665)	1
  (1, 7418)	1
  (1, 7431)	1
  (2, 986)	1
  (2, 3249)	1
  (2, 7173)	1
  (3, 3242)	1
  (4, 887)	2
  (4, 1060)	1
  (4, 1596)	1
  (4, 2070)	1
  (4, 2838)	1
  (4, 3394)	1
  (4, 3629)	1
  (4, 3926)	1
  :	:
  (1391, 4378)	1
  (1391, 4418)	1
  (1391, 4446)	1
  (1391, 4749)	1
  (1391, 4784)	1
  (1391, 6026)	1
  (1391, 6066)	1
  (1391, 6840)	1
  (1391, 6915)	1
  (1391, 7023)	1
  (1391, 7131)	1
  (1391, 7241)	2
  (1391, 7250)	1
  (1391, 7298)	1
  (1391, 7368)	1
  (1392, 848)	1
  (1392, 2404)	1
  (1392, 2878)	1
  (1392, 3163)	1
  (1392, 4243)	1
  (1392, 4260)	2
  (1392, 4492)	1
  (1392, 4808)	1
  (1392, 5573)	1
  (1392, 7086)	1


In [74]:
# Get the feature (token) names learned from X_train and count them
train_features = vect.get_feature_names()
len(train_features)

7465

In [75]:
# peek at the first 50 tokens of the training vocabulary
train_features[:50]

[u'00',
 u'000',
 u'008704050406',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'0125698789',
 u'02',
 u'0207',
 u'02072069400',
 u'02073162414',
 u'02085076972',
 u'021',
 u'03',
 u'04',
 u'0430',
 u'05',
 u'050703',
 u'0578',
 u'06',
 u'07',
 u'07008009200',
 u'07090201529',
 u'07090298926',
 u'07123456789',
 u'07732584351',
 u'07734396839',
 u'07742676969',
 u'0776xxxxxxx',
 u'07781482378',
 u'07786200117',
 u'078',
 u'07801543489',
 u'07808',
 u'07808247860',
 u'07808726822',
 u'07815296484',
 u'07821230901',
 u'07880867867',
 u'0789xxxxxxx',
 u'07946746291',
 u'0796xxxxxx',
 u'07973788240',
 u'07xxxxxxxxx',
 u'08',
 u'0800',
 u'08000407165',
 u'08000776320',
 u'08000839402',
 u'08000930705']

In [76]:
# peek at the last 50 tokens of the training vocabulary
train_features[-50:]

[u'yeovil',
 u'yep',
 u'yer',
 u'yes',
 u'yest',
 u'yesterday',
 u'yet',
 u'yetunde',
 u'yijue',
 u'ym',
 u'ymca',
 u'yo',
 u'yoga',
 u'yogasana',
 u'yor',
 u'yorge',
 u'you',
 u'youdoing',
 u'youi',
 u'youphone',
 u'your',
 u'youre',
 u'yourjob',
 u'yours',
 u'yourself',
 u'youwanna',
 u'yowifes',
 u'yoyyooo',
 u'yr',
 u'yrs',
 u'ything',
 u'yummmm',
 u'yummy',
 u'yun',
 u'yunny',
 u'yuo',
 u'yuou',
 u'yup',
 u'zac',
 u'zaher',
 u'zealand',
 u'zebra',
 u'zed',
 u'zeros',
 u'zhong',
 u'zindgi',
 u'zoe',
 u'zoom',
 u'zouk',
 u'zyada']

In [77]:
# convert train_dtm to a regular (dense) array
# NOTE: this materialises one cell per (message, token) pair, so it is only
# practical while the data set and vocabulary stay this small.
train_arr = train_dtm.toarray()
train_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [78]:

# Revisit Numpy: indexing and axis-wise sums on a small 2 x 4 array
# (single-argument print() behaves the same on Python 2 and Python 3)
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(arr[0, 0])            # one element: row 0, column 0
print(arr[1, 3])            # one element: row 1, column 3
print(arr[0, :])            # the whole first row
print(arr[:, 0])            # the whole first column
print(np.sum(arr))          # sum of every element
print(np.sum(arr, axis=0))  # collapse the rows    -> one total per column
print(np.sum(arr, axis=1))  # collapse the columns -> one total per row




1
8
[1 2 3 4]
[1 5]
36
[ 6  8 10 12]
[10 26]


In [79]:
# exercise: calculate the number of tokens in the 0th message in train_arr
# (row 0 holds that message's per-token counts, so summing the row gives its token total)
print(np.sum(train_arr[0, :]))

24


In [80]:

# exercise: count how many times the 0th token appears across ALL messages in train_arr
# (column 0 holds that token's count in every message)
print(np.sum(train_arr[:, 0]))

5


In [81]:
# exercise: count how many times EACH token appears across ALL messages in train_arr
# (axis=0 sums down the rows, leaving one total per token column)
print(np.sum(train_arr, axis=0))

[ 5 23  2 ...,  1  1  1]


In [82]:
# exercise: create a DataFrame of tokens with their counts.


Let's build the model with Naive Bayes now.

http://scikit-learn.org/stable/modules/naive_bayes.html

In [83]:
# train a Naive Bayes model using train_dtm
from sklearn.naive_bayes import MultinomialNB
# no arguments: the model uses its default hyper-parameters
nb = MultinomialNB()
# fit on the training document-term matrix and the matching 0/1 labels;
# as the last expression, the fitted model's repr becomes the cell output
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [84]:
# make predictions on test data using test_dtm
# (test_dtm was built with the vocabulary learned from X_train)
preds = nb.predict(test_dtm)
preds

array([0, 0, 0, ..., 0, 1, 0])

In [85]:
# compare predictions to true labels
from sklearn import metrics
# fraction of test messages whose predicted label equals the true label
print(metrics.accuracy_score(y_test, preds))
# rows are the true class, columns the predicted class (0 = ham, 1 = spam),
# so the top-right cell counts ham flagged as spam and the bottom-left cell
# counts spam that slipped through as ham
print(metrics.confusion_matrix(y_test, preds))
# confusion matrix: http://en.wikipedia.org/wiki/Confusion_matrix

0.987796123475
[[1203    5]
 [  12  173]]


In [86]:
# exercise: show the message text for the false positives
# (true label 0 = ham, but predicted 1 = spam)
X_test[(y_test == 0) & (preds == 1)]

array(['Waiting for your call.', 'Also andros ice etc etc',
       'No calls..messages..missed calls', 'No pic. Please re-send.',
       'No calls..messages..missed calls'], dtype=object)

In [87]:
# exercise: show the message text for the false negatives
# With 0/1 labels, `y_test > preds` is only true for (true=1, predicted=0).
# This first expression is evaluated but not displayed, because a cell only
# shows the value of its last expression.
X_test[y_test > preds]
# or
# the same selection spelled out explicitly: true spam (1) predicted as ham (0)
X_test[(y_test == 1) & (preds == 0)]

array([ "LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.",
       "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, \xe5\xa31.50 to rcv",
       "Xmas & New Years Eve tickets are now on sale from the club, during the day from 10am till 8pm, and on Thurs, Fri & Sat night this week. They're selling fast!",
       "Hi I'm sue. I am 20 years old and work as a lapdancer. I love sex. Text me live - I'm i my bedroom now. text SUE to 89555. By TextOperator G2 1DA 150ppmsg 18+",
       'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!',
       'CALL 09090900040 & LISTEN TO EXTREME DIRTY LIVE CHAT GOING ON IN THE OFFICE RIGHT NOW TOTAL PRIVACY NO ONE KNOWS YOUR [sic] LISTENING 60P MIN 24/7MP 0870753331018+',
       'thesmszone.com lets you send free anonymous and maske

In [88]:
## USING ALL DATA AND CROSS-VALIDATION and run NB again

# make word count vector for messages
vect = CountVectorizer(decode_error = 'ignore')
vect.fit(df.msg)

# make DTM which has word counts per message
dtm = vect.transform(df.msg)

try:
    # current location of the helper (scikit-learn 0.18 and later)
    from sklearn.model_selection import cross_val_score
except ImportError:
    # older scikit-learn releases only ship it in sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score

# fit using DTM and labels: leaves `nb` trained on ALL messages
nb = MultinomialNB()
nb.fit(dtm, df.label)

# get scores from cross validation
# cross_val_score refits its own copy of the estimator on each of the 5 folds,
# so the fit above has no influence on these scores.
# NOTE(review): the vocabulary was learned from every message before the folds
# were split, so held-out messages contribute columns to the training matrix;
# wrapping the vectorizer and the model in a Pipeline would avoid that.
scores = cross_val_score(nb, dtm, df.label, cv=5)
print(scores)

[ 0.98295964  0.98026906  0.97845601  0.98114901  0.97935368]


In [89]:
## EXERCISE: CALCULATE THE 'SPAMMINESS' OF EACH TOKEN

# create separate DataFrames for ham and spam ( df_ham and df_spam)
# `label` is numeric at this point: 0 = ham, 1 = spam
df_ham = df[df.label == 0]
print(len(df_ham))

df_spam = df[df.label == 1]
print(len(df_spam))

4825
747


In [90]:
# learn the vocabulary of ALL messages and save it
# (rebinds `vect`; from here on it knows tokens from ham and spam alike)
vect = CountVectorizer(decode_error = 'ignore')
vect.fit(df.msg)
vocab = pd.DataFrame(vect.get_feature_names())
print(len(vocab))
# side effect: writes vocab.csv into the current working directory
vocab.to_csv('vocab.csv')

8724


In [91]:
# create document-term matrix of ham (label 0), then convert to a regular array
ham_dtm = vect.transform(df[df.label == 0].msg)
ham_arr = ham_dtm.toarray()
# len() of a 2-D array is its number of rows, i.e. the number of ham messages
print(len(ham_arr))
print(ham_arr)

4825
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [92]:
# create document-term matrix of spam (label 1), then convert to a regular array
spam_dtm = vect.transform(df[df.label == 1].msg)
spam_arr = spam_dtm.toarray()
# len() of a 2-D array is its number of rows, i.e. the number of spam messages
print(len(spam_arr))

747


In [93]:
# count how many times EACH token appears across ALL messages in spam_arr
# (axis=0 sums over the messages, leaving one total per vocabulary token)
spam_word_counts = np.sum(spam_arr, axis=0)
print(spam_word_counts)
print(len(spam_word_counts))

# same per-token totals for the ham messages; both arrays have one entry per
# token of the shared vocabulary, so their lengths must match
ham_word_counts = np.sum(ham_arr, axis=0)
print(ham_word_counts)
print(len(ham_word_counts))

[10 29  0 ...,  0  1  0]
8724
[0 0 1 ..., 1 0 1]
8724


In [94]:
# create a DataFrame of tokens with their separate ham and spam counts
# sanity check first: the three columns must have the same length, because
# they are lined up by position (entry i of each list describes token i)
print(len(vect.get_feature_names()))
print(len(ham_word_counts.tolist()))
print(len(spam_word_counts.tolist()))
d = {
    'word': vect.get_feature_names(),
    'ham_count': ham_word_counts.tolist(),
    'spam_count': spam_word_counts.tolist()
}

counts_df = pd.DataFrame(data=d)
print(counts_df)

8724
8724
8724
      ham_count  spam_count          word
0             0          10            00
1             0          29           000
2             1           0        000pes
3             0           2  008704050406
4             0           1          0089
5             0           1          0121
6             0           1   01223585236
7             0           2   01223585334
8             1           0    0125698789
9             0           8            02
10            0           3          0207
11            0           1   02072069400
12            0           2   02073162414
13            0           1   02085076972
14            0           2           021
15            0          13            03
16            0          12            04
17            0           1          0430
18            0           5            05
19            0           2        050703
20            0           2          0578
21            0           8            06
22            0    

In [95]:
# add one to ham counts and spam counts so that ratio calculations (below) make more sense
# (a token that never occurs in ham would otherwise mean dividing by zero)
# The smoothed columns are rebuilt from the raw per-token totals rather than
# incremented in place, so re-running this cell cannot add one a second time.
counts_df['spam_count'] = spam_word_counts + 1
counts_df['ham_count'] = ham_word_counts + 1

In [103]:
# calculate ratio of spam-to-ham for each token
# NOTE(review): this compares raw token totals; there are far more ham than
# spam messages, so the ratio is not normalised for class size.
counts_df['ratio'] = counts_df['spam_count'] / counts_df['ham_count']

# show the tokens ranked from most to least spam-leaning
if hasattr(counts_df, 'sort_values'):
    # current pandas spelling for sorting by a column
    print(counts_df.sort_values(by='ratio', ascending=False))
else:
    # pandas releases that predate sort_values
    print(counts_df.sort_index(by='ratio', ascending=False))

      ham_count  spam_count        word       ratio
2070          1         114       claim  114.000000
6121          1          94       prize   94.000000
352           1          72        150p   72.000000
7847          1          61        tone   61.000000
368           1          52          18   52.000000
3695          1          51  guaranteed   51.000000
2377          1          45          cs   45.000000
615           1          45         500   45.000000
299           1          42        1000   42.000000
1334          1          39     awarded   39.000000
8028          2          75          uk   37.500000
356           1          35      150ppm   35.000000
6534          1          33    ringtone   33.000000
8609          3          99         www   33.000000
1             1          30         000   30.000000
2969          1          27       entry   27.000000
7848          1          27       tones   27.000000
363           2          54          16   27.000000
2153        

In [100]:
# advanced: implement your own naive bayes classifier
# P = (A * B ) / C
# A = given that the message is spam, probability of that word showing up
# B = probability of spam in general
# C = probability of that word
def nbc(df, dtm, alpha=1.0):
    """Classify each row of ``dtm`` with a multinomial Naive Bayes model.

    The model is estimated from the rows of ``dtm`` and the labels in
    ``df.label``, then applied to those same rows.

    Parameters
    ----------
    df : DataFrame with a ``label`` column, one row per document, in the same
        order as the rows of ``dtm``.
    dtm : document-term count matrix with one row per document and one column
        per token; a numpy array, a scipy sparse matrix or nested lists.
    alpha : additive smoothing added to every token count (must be > 0) so
        that a token unseen in one class does not zero out that class.

    Returns
    -------
    numpy array holding the predicted label for every row of ``dtm``.

    Raises
    ------
    ValueError if ``alpha`` is not positive or if ``df`` and ``dtm`` do not
    have the same number of rows.
    """
    if alpha <= 0:
        raise ValueError("alpha must be positive")
    if not hasattr(dtm, 'shape'):
        dtm = np.asarray(dtm)
    labels = np.asarray(df.label)
    if dtm.shape[0] != len(labels):
        raise ValueError("df and dtm must have the same number of rows")
    if len(labels) == 0:
        return np.array([], dtype=labels.dtype)

    classes = np.unique(labels)
    log_prior = np.empty(len(classes))
    log_likelihood = np.empty((len(classes), dtm.shape[1]))
    for i, cls in enumerate(classes):
        rows = np.flatnonzero(labels == cls)
        # A: smoothed probability of each token given the class
        token_totals = np.asarray(dtm[rows].sum(axis=0), dtype=float).ravel() + alpha
        log_likelihood[i] = np.log(token_totals) - np.log(token_totals.sum())
        # B: probability of the class in general
        log_prior[i] = np.log(len(rows) / float(len(labels)))

    # Work in log space so products of many small probabilities do not
    # underflow. C is the same for every class, so it cannot change which
    # class wins and is left out.
    scores = np.asarray(dtm.dot(log_likelihood.T)) + log_prior

    # calculate probability for each class and compare; highest probability = class
    return classes[np.argmax(scores, axis=1)]