In [1]:
#import required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [2]:
#Load the SMS spam collection: a tab-separated file read with explicit column names
#(first column is the ham/spam label, second column is the message text)
spam_df = pd.read_csv(
    'SpamCollection.csv',
    sep='\t',
    names=['response', 'message'],
)

In [3]:
#Preview the first five rows of the loaded data
spam_df.head(5)

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Summarize each column with describe: row count, number of distinct values, most frequent value and how often it occurs
spam_df.describe()


Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
#Describe the messages separately for each response class (ham / spam),
#transposed so that every class becomes its own column
spam_df.groupby('response').describe().transpose()

Unnamed: 0,response,ham,spam
message,count,4825,747
message,unique,4516,653
message,top,"Sorry, I'll call later",Please call our customer service representativ...
message,freq,30,4


In [6]:
#Add each message's character count as a new 'length' column
#.str.len() is the vectorized string accessor: same result as apply(len) for text,
#but it yields NaN for a missing message instead of raising TypeError
spam_df['length'] = spam_df['message'].str.len()


In [7]:
#Preview the first five rows again, now including the new length column
spam_df.head(5)

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [8]:
#define a function to get rid of punctuation and stopwords present in the messages
#translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
#cache for the default stop-word set so the NLTK corpus is only read once
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(message, stop_words=None):
    """Tokenize a message: strip punctuation, split on whitespace, drop stop words.

    Punctuation is removed before the stop-word lookup, so a contraction such
    as "don't" is looked up as "dont". Words are compared in lower case but
    returned with their original casing.

    Args:
        message: The text to clean.
        stop_words: Optional iterable of lower-case words to drop. When None
            (the default) NLTK's English stop-word list is used.

    Returns:
        A list of the remaining words, in their original order.
    """
    #build the lookup set once per call instead of re-reading the corpus for every word
    if stop_words is None:
        stop_set = _english_stopwords()
    else:
        stop_set = frozenset(stop_words)
    #delete punctuation characters, leaving the rest of the text intact
    no_punctuation = message.translate(_PUNCTUATION_TABLE)
    #now eliminate any stop words
    return [word for word in no_punctuation.split() if word.lower() not in stop_set]

In [17]:
#Sanity-check text_process by applying it to the first five messages
spam_df['message'].head(5).apply(text_process)


0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [18]:
#start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
#Build the bag-of-words vectorizer with text_process as its analyzer
#and fit it on every message so it learns the vocabulary
bag_of_words_transformer = (
    CountVectorizer(analyzer=text_process)
    .fit(spam_df['message'])
)

In [35]:
#Report how many distinct tokens the fitted vectorizer stored in its vocabulary_ attribute
vocabulary_size = len(bag_of_words_transformer.vocabulary_)
print(vocabulary_size)

11425


In [36]:
#Convert every message into its bag-of-words representation using the fitted vectorizer's transform method
message_bow = bag_of_words_transformer.transform(spam_df['message'])


In [39]:
#Fit a tf-idf transformer on the bag-of-words matrix produced by the vectorizer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = (
    TfidfTransformer()
    .fit(message_bow)
)

In [40]:
#Apply the fitted tf-idf transformer to the bag-of-words matrix and print the shape of the result
message_tfidf = tfidf_transformer.transform(message_bow)
print(message_tfidf.shape)

(5572, 11425)


In [41]:
#Train a multinomial naive Bayes classifier on the tf-idf features,
#using the response column (ham / spam) as the labels
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = (
    MultinomialNB()
    .fit(message_tfidf, spam_df['response'])
)

In [47]:
#Spot-check the model on message #5 (row label 4): compare its prediction with the true label
#NOTE: this row belongs to the data the model was fitted on, so this is a sanity check, not an evaluation
message_index = 4
message = spam_df.loc[message_index, 'message']
#push the single message through the same vectorizer and tf-idf steps used for training
bag_of_word_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_word_for_message)

predicted_label = spam_detect_model.predict(tfidf)[0]
expected_label = spam_df.loc[message_index, 'response']
print('predicted', predicted_label)
print('expected',expected_label)

predicted ham
expected ham
