In [1]:
import pandas as pd

Read data from file.

In [2]:
%time data_set = pd.read_table('data/spam_ham.tsv', header = None, names = ['label', 'sms'])

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 15.3 ms


Check data read from file.

In [3]:
# Preview the first four rows to check that both columns were read as expected.
data_set.head(4)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [4]:
# Per-column summary: row count, number of distinct values, most frequent value and its frequency.
data_set.describe()

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


Map the text labels to integer class ids (ham → 0, spam → 1).

In [5]:
%time data_set['label_id'] = data_set.label.map({'spam':1, 'ham':0})

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 5.7 ms


In [6]:
# Preview again to check that the new label_id column lines up with label.
data_set.head(4)

Unnamed: 0,label,sms,label_id
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0


In [7]:
# Features are the raw message text; the target is the numeric label id.
x, y = data_set['sms'], data_set['label_id']

Split the data set into two parts, training and testing.

In [8]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20, so it is only
# used as a fallback when running against a legacy install.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# No test_size is given, so the library default split is used (the shapes
# printed below come out at roughly 75% train / 25% test).
# random_state pins the shuffle so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

print(x_train.shape)
print(x_test.shape)

(4179,)
(1393,)




Extract features from text x.

In [9]:
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#%time cvect = CountVectorizer(stop_words = 'english', max_features = 500)
cvect = TfidfVectorizer(stop_words = 'english', max_features = 2000)
cvect.fit(x_train)
%time x_train_dtm = cvect.transform(x_train)
x_train_dtm.shape

x_train_dtm

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 73.5 ms


<4179x2000 sparse matrix of type '<type 'numpy.float64'>'
	with 25858 stored elements in Compressed Sparse Row format>

In [10]:
# Transform only (no refit): the test messages are encoded with the vectorizer already fitted on x_train.
x_test_dtm = cvect.transform(x_test)
x_test_dtm

<1393x2000 sparse matrix of type '<type 'numpy.float64'>'
	with 8417 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained;
# ending the cell on `nb` displays the fitted model's parameters.
nb = MultinomialNB().fit(x_train_dtm, y_train)
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Predict a class id (0 = ham, 1 = spam) for every test message.
%time y_test_pred = nb.predict(x_test_dtm)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 432 µs


In [13]:
from sklearn import metrics

# Fraction of test messages whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.98564249820531225

In [14]:
# Rows are actual classes and columns are predicted classes, both ordered 0 (ham), 1 (spam).
metrics.confusion_matrix(y_test, y_test_pred)

array([[1208,    0],
       [  20,  165]])

Show the false positive predictions: messages predicted as 'spam' whose actual class is 'ham'.

In [15]:
# False positives: true label is ham (0) but the model predicted spam (1).
x_test[(y_test == 0) & (y_test_pred == 1)]

Series([], Name: sms, dtype: object)

Show the false negative predictions: messages predicted as 'ham' whose actual class is 'spam'.

In [16]:
# False negatives: true label is spam (1) but the model predicted ham (0).
x_test[(y_test == 1) & (y_test_pred == 0)]

1217    You have 1 new voicemail. Please call 08719181...
3132    LookAtMe!: Thanks for your purchase of a video...
2295     You have 1 new message. Please call 08718738034.
5110      You have 1 new message. Please call 08715205273
1045    We know someone who you know that fancies you....
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1328    Ur balance is now £500. Ur next question is: W...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
3991    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
2941     You have 1 new message. Please call 08712400200.
1625    500 free text msgs. Just text ok to 80488 and ...
3564    Auction round 4. The highest bid is now £54. N...
2821    INTERFLORA - It's not too late to order Inter...
5037    You wo

In [17]:
# Per-class precision, recall and F1 on the test set.
report = metrics.classification_report(y_test, y_test_pred)
print(report)

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1208
          1       1.00      0.89      0.94       185

avg / total       0.99      0.99      0.99      1393

