In [53]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve

In [184]:
# Load the labelled corpus into a two-column frame.
# `names=` supplies the column names, so the first line of the file is read as data.
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
messages = pandas.read_csv(
    'spam_train.csv',
    names=["label", "message"],
    sep=',',
    quoting=csv.QUOTE_NONE,
)
print(messages)

      label                                            message
0         1  public announc the new domain name ar final av...
1         1  have tax problem do you ow the ir monei if you...
2         0  r robert harlei write r scuse me for post in g...
3         0  on wed number oct number brian fahrland wrote ...
4         0  quot ronan waid sure but soft link would do th...
5         0  i notic a low count of razor d spam messag so ...
6         1  nigeria electirc power author feder secretaria...
7         0  on mon number sep number tom wrote if the set ...
8         0  shopper newslett alert live tech help now apri...
9         0  httpaddr summari split up from end in num prod...
10        0  i think that thi and other articl confus socia...
11        0  url httpaddr date number number numbertnumb nu...
12        1  free info free the insid stock market report d...
13        0  i just updat to the latest cv i had been run a...
14        0  on wed jul number number at number number 

In [41]:
# Per-label summary (count / unique / top / freq) of the message column.
messages.groupby(by='label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,3424,3369,url httpaddr date not suppli httpaddr,9
1,1558,1334,as seen on nbc cb cnn and even oprah the healt...,8


In [47]:
# Peek at the first five rows (label + message).
messages.head(n=5)

Unnamed: 0,label,message
0,1,public announc the new domain name ar final av...
1,1,have tax problem do you ow the ir monei if you...
2,0,r robert harlei write r scuse me for post in g...
3,0,on wed number oct number brian fahrland wrote ...
4,0,quot ronan waid sure but soft link would do th...


In [48]:
# First five raw message strings.
messages['message'].head()

0    public announc the new domain name ar final av...
1    have tax problem do you ow the ir monei if you...
2    r robert harlei write r scuse me for post in g...
3    on wed number oct number brian fahrland wrote ...
4    quot ronan waid sure but soft link would do th...
Name: message, dtype: object

In [61]:
def split_into_tokens(message):
    """Split one message into its word tokens.

    Wraps the text in a ``TextBlob`` and returns that blob's ``words``.
    No byte decoding happens here; ``message`` is handed to ``TextBlob``
    unchanged.

    Parameters
    ----------
    message : the text of a single message.

    Returns
    -------
    The ``words`` attribute of ``TextBlob(message)``.
    """
    blob = TextBlob(message)
    return blob.words

In [60]:
# Tokenize the first five messages to sanity-check split_into_tokens.
messages['message'].head().apply(split_into_tokens)

0    [public, announc, the, new, domain, name, ar, ...
1    [have, tax, problem, do, you, ow, the, ir, mon...
2    [r, robert, harlei, write, r, scuse, me, for, ...
3    [on, wed, number, oct, number, brian, fahrland...
4    [quot, ronan, waid, sure, but, soft, link, wou...
Name: message, dtype: object

In [71]:
def split_into_lemmas(message):
    """Return the lemma ("base form") of every word in a message.

    Parameters
    ----------
    message : the text of a single message.

    Returns
    -------
    list
        One entry per word of ``TextBlob(message).words``, each being that
        word's ``lemma`` attribute, in the original order.
    """
    lemmas = []
    for token in TextBlob(message).words:
        lemmas.append(token.lemma)
    return lemmas

# Lemmatize the first five messages to sanity-check split_into_lemmas.
messages['message'].head().apply(split_into_lemmas)

0    [public, announc, the, new, domain, name, ar, ...
1    [have, tax, problem, do, you, ow, the, ir, mon...
2    [r, robert, harlei, write, r, scuse, me, for, ...
3    [on, wed, number, oct, number, brian, fahrland...
4    [quot, ronan, waid, sure, but, soft, link, wou...
Name: message, dtype: object

In [96]:
# Learn the bag-of-words vocabulary from every message in the corpus.
# .astype('U') coerces each entry to a unicode string before vectorizing.
# NOTE(review): the groupby/describe output in this notebook counts fewer non-null
# messages than there are rows, so some messages are presumably missing; after this
# cast a missing value would be vectorized as the literal text 'nan' — confirm.
bow_transformer = CountVectorizer().fit(messages['message'].values.astype('U'))

In [172]:
# Pick the message stored under index label 3 (the fourth row) as a worked example.
message4 = messages.loc[3, 'message']
print(message4)

on wed number oct number brian fahrland wrote on wed number oct number number number number number edt samuel checker wrote i ve been test razor invok from sendmail procmail and so far it seem pretti copacet last night s spam to the list provid a good test the spam itself as well as sever of the respons were flag as other list member report ar you us spamassassin on the input side i ve just chang my sendmail instal and am look for the proper wai to pass it through there systemwid befor accept it and send it to the user it s kinda problemat to set up procmail script for everi user when the user s home directori ar nf mount and the sourc is on my own machin on which i try new thing and it s the onli machin with the drivespac i ve not us spamassassin on the kiss principl i just have procmail ad an x header and option modifi the subject if razor check come back posit sc thi sf net email is sponsor by thinkgeek welcom to geek heaven httpaddr razor user mail list emailaddr httpaddr


In [173]:
# Vectorize the single example message; transform() expects an iterable of
# documents, hence the one-element list. Show the sparse counts, then the shape.
bow4 = bow_transformer.transform([message4])
print(bow4, bow4.shape, sep='\n')

  (0, 217)	1
  (0, 343)	1
  (0, 1003)	1
  (0, 1109)	1
  (0, 1145)	6
  (0, 1548)	2
  (0, 1734)	3
  (0, 2364)	1
  (0, 2779)	1
  (0, 2788)	1
  (0, 4112)	1
  (0, 4518)	1
  (0, 5249)	1
  (0, 5332)	1
  (0, 5334)	1
  (0, 6325)	1
  (0, 6768)	1
  (0, 8514)	1
  (0, 9960)	1
  (0, 10447)	1
  (0, 10731)	1
  (0, 10732)	1
  (0, 11418)	1
  (0, 11775)	1
  (0, 11856)	1
  :	:
  (0, 33899)	1
  (0, 34399)	1
  (0, 34841)	2
  (0, 34951)	12
  (0, 34999)	1
  (0, 35026)	1
  (0, 35042)	1
  (0, 35047)	1
  (0, 35148)	1
  (0, 35449)	5
  (0, 35898)	1
  (0, 36849)	1
  (0, 36919)	2
  (0, 36946)	4
  (0, 37193)	3
  (0, 37817)	1
  (0, 38089)	2
  (0, 38131)	1
  (0, 38136)	1
  (0, 38160)	1
  (0, 38224)	1
  (0, 38237)	1
  (0, 38462)	1
  (0, 38727)	2
  (0, 39432)	1
(1, 40328)


In [98]:
# Map two column indices of the bag-of-words matrix back to their tokens.
# The indices are the columns with the highest counts in the bow4 output of this
# notebook; they are only meaningful for the vocabulary fitted here.
# Build the feature-name list once and index it twice, rather than rebuilding the
# whole list for every lookup.
# NOTE(review): newer scikit-learn releases expose this as get_feature_names_out();
# confirm against the installed version before upgrading.
feature_names = bow_transformer.get_feature_names()
print(feature_names[1145])
print(feature_names[34951])

and
the


In [162]:
# Turn the whole corpus into a sparse document-term count matrix using the
# vocabulary learned by bow_transformer (same unicode cast as used when fitting).
messages_bow = bow_transformer.transform(messages['message'].values.astype('U'))

In [101]:
# Learn IDF weights from the corpus-wide counts, then re-weight the example
# message's count vector and show the resulting TF-IDF scores.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 39432)	0.0252486413552
  (0, 38727)	0.0853346565198
  (0, 38462)	0.0281898735832
  (0, 38237)	0.0421188912567
  (0, 38224)	0.0432337819169
  (0, 38160)	0.0532181126874
  (0, 38136)	0.0496218279126
  (0, 38131)	0.0620549219726
  (0, 38089)	0.143503457728
  (0, 37817)	0.04634675693
  (0, 37193)	0.144584290266
  (0, 36946)	0.166292877206
  (0, 36919)	0.0590710502164
  (0, 36849)	0.0384763258511
  (0, 35898)	0.0529153254424
  (0, 35449)	0.098466529274
  (0, 35148)	0.0540428293453
  (0, 35047)	0.065764446055
  (0, 35042)	0.0513263914028
  (0, 35026)	0.0247193240872
  (0, 34999)	0.0383938435155
  (0, 34951)	0.236510526626
  (0, 34841)	0.125917974767
  (0, 34399)	0.145179071813
  (0, 33899)	0.0499173930508
  :	:
  (0, 11856)	0.0680343346968
  (0, 11775)	0.128264998084
  (0, 11418)	0.056647784897
  (0, 10732)	0.0299195459304
  (0, 10731)	0.0370923821227
  (0, 10447)	0.107288665928
  (0, 9960)	0.145179071813
  (0, 8514)	0.0725849666327
  (0, 6768)	0.140218090308
  (0, 6325)	0.053357520616

In [106]:
# IDF weight learned for two very common tokens, printed one per line
# ('the' first, then 'and').
print(
    *[tfidf_transformer.idf_[bow_transformer.vocabulary_[term]] for term in ('the', 'and')],
    sep='\n'
)

1.1429162822
1.24150750194


In [163]:
# Re-weight the corpus-wide count matrix into TF-IDF scores; this matrix is the
# feature input for the classifier fitted later in the notebook.
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf)

  (0, 39444)	0.0744528677176
  (0, 39432)	0.0201864878854
  (0, 38352)	0.0286071653285
  (0, 38160)	0.042548300795
  (0, 38136)	0.0396730427557
  (0, 37514)	0.0475517685342
  (0, 36919)	0.0236138852541
  (0, 35462)	0.0837110922425
  (0, 35449)	0.110214673387
  (0, 35026)	0.0197632945552
  (0, 35017)	0.0392646333902
  (0, 34951)	0.157576692719
  (0, 34937)	0.0204387411387
  (0, 32490)	0.0649844264924
  (0, 31341)	0.0833753256058
  (0, 30642)	0.038155682306
  (0, 30237)	0.0352939161194
  (0, 30224)	0.0581694409307
  (0, 30096)	0.0803591592265
  (0, 30093)	0.171325080103
  (0, 29914)	0.0516893847257
  (0, 29068)	0.106962525184
  (0, 28799)	0.0592443159723
  (0, 28624)	0.0932522047429
  (0, 26445)	0.0698054648899
  :	:
  (4999, 12816)	0.158724242709
  (4999, 12723)	0.0784983463311
  (4999, 12280)	0.0958315586887
  (4999, 10859)	0.0568006012025
  (4999, 8880)	0.0366191922756
  (4999, 8671)	0.110013767295
  (4999, 8025)	0.104394994271
  (4999, 7563)	0.0849288201997
  (4999, 7376)	0.099441080

In [134]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of every message
# in `messages` (no hold-out at this point). `%time` is an IPython line magic that
# reports how long the statement took to run.
%time spam_detector = MultinomialNB().fit(messages_tfidf, messages['label'])

CPU times: user 7.9 ms, sys: 4.23 ms, total: 12.1 ms
Wall time: 10.4 ms


In [135]:
# Compare the model's prediction for the example message with its true label
# (row with index label 3, the same row message4 was taken from).
print('predicted:', spam_detector.predict(tfidf4)[0])
print('expected:', messages['label'][3])

predicted: 0
expected: 0


In [136]:
# Predict a label for every message in the corpus.
# Caveat: messages_tfidf is the same matrix spam_detector was fitted on, so these
# are in-sample predictions and will look better than performance on unseen mail.
all_predictions = spam_detector.predict(messages_tfidf)
print(all_predictions)

[1 1 0 ..., 1 0 0]


In [137]:
# Accuracy and confusion matrix of the predictions against the true labels.
# These are computed on the data the model was trained on, so treat them as an
# optimistic upper bound rather than a generalization estimate.
print('accuracy', accuracy_score(messages.label, all_predictions))
print('confusion matrix\n', confusion_matrix(messages.label, all_predictions))
print('(row=expected, col=predicted)')

accuracy 0.9166
confusion matrix
 [[3419   11]
 [ 406 1164]]
(row=expected, col=predicted)


In [146]:
# Hold out 20% of the messages for testing; the remaining 80% is used for training
# and cross-validation.
# random_state is fixed so the split (and every score derived from it) is the same
# on each Restart & Run All; without it the partition changes on every run.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.2, random_state=42)

# Sanity check: the two parts must add up to the full dataset size.
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

4000 1000 5000


In [180]:
# End-to-end model: raw message strings in, predicted label out.
pipeline = Pipeline(steps=[
    # strings -> token counts; min_df=30 drops tokens that appear in fewer than 30 emails
    ('bow', CountVectorizer(min_df=30)),
    # token counts -> TF-IDF weighted scores
    ('tfidf', TfidfTransformer()),
    # Naive Bayes classifier trained on the TF-IDF vectors
    ('classifier', MultinomialNB()),
])

In [181]:
# Estimate the pipeline's accuracy with 10-fold cross-validation on the training
# split only; the held-out test split is not touched here.
scores = cross_val_score(pipeline,  # steps to convert raw messages into models
                         msg_train.values.astype('U'),  # training data
                         label_train,  # training labels
                         cv=10,  # 10 folds: 9 for training, 1 for scoring (an integer cv is not shuffled by default — see the scikit-learn docs)
                         scoring='accuracy',  # which scoring metric?
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print(scores)

[ 0.96508728  0.96009975  0.9625      0.96        0.9675      0.9675      0.96
  0.975       0.97493734  0.97493734]


In [182]:
# Mean and standard deviation of the per-fold accuracies.
print(np.mean(scores), np.std(scores))


0.966756171914 0.00599525020503
