In [45]:


import csv

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob


In [46]:
# Load the tab-separated SMS corpus into a two-column frame (label, message).
# csv.QUOTE_NONE makes the parser treat quote characters inside a message as
# ordinary text instead of field delimiters.
# The relative path uses forward slashes so that it resolves on Windows and on
# POSIX systems alike (a backslash-separated path only works on Windows).
messages = pd.read_csv('Desktop/spam/SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE,
                       names=["label", "message"])
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(messages.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Summary statistics of the message column, computed separately for each label.
messages.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4827
ham,unique,4518
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [None]:
# Inspect the label column on its own.
messages['label']

In [23]:
# Power of textblob
# Helper that splits a message into its individual words.

def split_into_tokens(message):
    """Split ``message`` into words using ``TextBlob(message).words``.

    message: the text to split, either UTF-8 encoded bytes or text that has
        already been decoded.
    Returns whatever ``TextBlob(message).words`` yields for the decoded text.
    Raises UnicodeDecodeError if ``message`` is bytes that are not valid UTF-8.
    """
    # Decode only genuine byte strings. Calling unicode(message, 'utf8')
    # unconditionally fails with TypeError when the text is already decoded,
    # and the ``unicode`` builtin does not exist on Python 3.
    if isinstance(message, bytes):
        message = message.decode('utf8')  # convert bytes into proper unicode
    return TextBlob(message).words

In [24]:
# Peek at the first few raw messages before tokenising them.
messages.message.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [25]:
# Apply split_into_tokens to the first few messages to see the word lists it produces.
messages.message.head().apply(split_into_tokens)

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object

In [26]:
# Example: the (word, part-of-speech tag) pairs TextBlob produces for one sentence.
TextBlob("Hello world, how is it going?").tags

[('Hello', u'NNP'),
 ('world', u'NN'),
 ('how', u'WRB'),
 ('is', u'VBZ'),
 ('it', u'PRP'),
 ('going', u'VBG')]

In [27]:

# Function that splits a message into words and reduces each word to its lemma.
def split_into_lemmas(message):
    """Return the lower-cased lemmas of the words in ``message``.

    message: the text to process, either UTF-8 encoded bytes or text that has
        already been decoded.
    Returns a list holding the ``lemma`` of every word that
    ``TextBlob(message).words`` yields for the lower-cased text.
    Raises UnicodeDecodeError if ``message`` is bytes that are not valid UTF-8.
    """
    # Decode only genuine byte strings. Calling unicode(message, 'utf8')
    # unconditionally fails with TypeError when the text is already decoded,
    # and the ``unicode`` builtin does not exist on Python 3.
    if isinstance(message, bytes):
        message = message.decode('utf8')
    message = message.lower()
    words = TextBlob(message).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]

# Try split_into_lemmas on the first few messages.
messages.message.head().apply(split_into_lemmas)

# Better. You can probably think of many more ways to improve the preprocessing:
# decoding HTML entities (such as &amp; and &lt;); filtering out stop words
# (pronouns etc.); and adding more features, such as a word-in-all-caps indicator.

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [40]:
# Convert strings to vectors, step 1: learn the bag-of-words vocabulary from
# every message. split_into_lemmas is the analyzer, so each message is reduced
# to lower-cased lemmas before counting.
# (Prefix the statement with %time to check how long it takes.)
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['message'])
# Size of the learned vocabulary. Single-argument print(...) prints the same
# text on Python 2 and Python 3.
print(len(bow_transformer.vocabulary_))

8874


In [29]:
# A single sample message used further down to try out the prediction.
# NOTE(review): this text opens with the same words as row 2 of the dataset
# (labelled spam in the messages.head() output), so it looks like a message the
# model is trained on rather than unseen text — confirm before treating the
# prediction on it as a real test.
message4="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [30]:
# Turn the sample message into a bag-of-words count vector. transform() is
# given a list, hence the one-element list around message4.
bow4 = bow_transformer.transform([message4])
# Shape of the result. Single-argument print(...) prints the same text on
# Python 2 and Python 3.
print(bow4.shape)

(1, 8874)


In [42]:
# Turn every message in the corpus into its bag-of-words count vector.
messages_bow = bow_transformer.transform(messages.message)

# Fit the TF-IDF transformer on those counts. fit() returns the transformer
# itself; the trailing semicolon keeps the cell from echoing it.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow);



In [32]:
# Re-weight the count vectors of all messages with the fitted TF-IDF transformer.
messages_tfidf = tfidf_transformer.transform(messages_bow)
# Shape of the result. Single-argument print(...) prints the same text on
# Python 2 and Python 3.
print(messages_tfidf.shape)

(5574, 8874)


In [None]:
# Re-weight the sample message's count vector with the fitted TF-IDF transformer.
tfidf4 = tfidf_transformer.transform(bow4)
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(tfidf4)

In [34]:
# Train a multinomial Naive Bayes classifier: the TF-IDF vectors are the
# features and the label column is the target. fit() returns the classifier
# itself; the trailing semicolon keeps the cell from echoing it.
spam_detector = MultinomialNB()
spam_detector.fit(messages_tfidf, messages.label);

In [37]:
%time print 'predicted:', spam_detector.predict(tfidf4)[0]

predicted: spam
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 866 µs


In [43]:
# Check the classifier by predicting the label of every message in the corpus.
# NOTE: spam_detector was fitted on this same messages_tfidf matrix, so the
# figures below are training-set scores, not an estimate of how well the model
# does on unseen messages.

all_predictions = spam_detector.predict(messages_tfidf)
print(all_predictions)

# Accuracy and confusion matrix. Each line is built with %-formatting and passed
# to print(...) as a single argument, so the output is the same on Python 2 and
# Python 3.
print('accuracy %s' % (accuracy_score(messages['label'], all_predictions),))
print('confusion matrix\n%s' % (confusion_matrix(messages['label'], all_predictions),))

['ham' 'ham' 'spam' ..., 'ham' 'ham' 'ham']
accuracy 0.969501255831
confusion matrix
[[4827    0]
 [ 170  577]]


In [44]:
# Per-class precision, recall and F1 for the predictions made above.
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(classification_report(messages['label'], all_predictions))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      4827
       spam       1.00      0.77      0.87       747

avg / total       0.97      0.97      0.97      5574



# Applying Naive Bayes classification to spam filtering

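Suppose we have an email containing three words: "Send money now." We'll use Naive Bayes to classify it as ham or spam. The "naive" assumption is that the words are conditionally independent given the class, so the likelihood of the whole message is the product of the per-word probabilities: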

        P(spam | 'Send money now') = (P(send | spam) x P(money | spam) x P(now | spam)) x P(spam) / P(send money now)
               
             
Similarly, for ham:


        P(ham | 'Send money now') = (P(send | ham) x P(money | ham) x P(now | ham)) x P(ham) / P(send money now)
            

The denominator P(send money now) is the same in both expressions, so it can be ignored. All we care about is whether spam or ham has the higher probability, and we predict that class for the email.
               
               
                                                          

Next steps: try using train_test_split, cross-validation, a Pipeline and grid search, and tune the parameters to get a better score.