##### Analysing Spam Collection Data
``DESCRIPTION``

**Problem:**

``Analyze the given Spam Collection dataset to:``

- View information on the spam data,
- View the length of messages,
- Define a function to eliminate stopwords,
- Apply Bag of Words,
- Apply tf-idf transformer, and
- Detect Spam with Naïve Bayes model.

In [1]:
# Import the necessary libraries
import os
import string

import pandas as pd
from nltk.corpus import stopwords

In [6]:
# Load the tab-separated spam collection; column names are supplied explicitly
# ('response' = label, 'message' = text).
# The file location can be overridden with the SPAM_COLLECTION_PATH environment variable so the
# notebook is not tied to a single machine; the original local path is kept as the default.
# NOTE(review): with pandas' default quoting, rows containing unbalanced double quotes may be
# merged - confirm the row count against the raw file.
SPAM_COLLECTION_PATH = os.environ.get(
    'SPAM_COLLECTION_PATH',
    r'C:\Users\Jordi\OneDrive\Documents\EDUCATION\SIMPLILEARN\Artificial Intelligence Master Program\DATA SCIENCE WITH PYTHON\Lesson 9\SpamCollection\SpamCollection',
)
df_spamcollection = pd.read_csv(SPAM_COLLECTION_PATH, sep='\t', names=['response', 'message'])

In [8]:
# Summarise each column of the dataset: number of rows, number of distinct values,
# the most frequent value and how often it occurs.
df_spamcollection.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [25]:
# Compare message-length statistics for ham vs. spam.
# The length is derived inside this cell (on a copy, df_spamcollection itself is not modified)
# so the result does not depend on a 'Length' column having been created by another cell first.
(
    df_spamcollection
    .assign(Length=df_spamcollection['message'].str.len())
    .groupby('response')[['Length']]
    .describe()
)

Unnamed: 0_level_0,Length,Length,Length,Length,Length,Length,Length,Length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,71.482487,58.440652,2.0,33.0,52.0,93.0,910.0
spam,747.0,138.670683,28.873603,13.0,133.0,149.0,157.0,223.0


In [21]:
# Record the character length of every message in a new 'Length' column.
# .str.len() is pandas' vectorised string-length accessor; unlike apply(len) it yields NaN
# rather than raising if a message is missing.
df_spamcollection['Length'] = df_spamcollection['message'].str.len()
df_spamcollection.head()

Unnamed: 0,response,message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [26]:
# Define a function to get rid of punctuation and stopwords present in the messages
def get_rid_of_topwords(message, stop_words=None):
    """Strip punctuation from ``message`` and return its non-stopword tokens.

    Punctuation and stopwords (e.g. I, me, myself, you, yours, ours) carry little
    lexical content, so both are removed before the text is vectorised.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the punctuation-free message, in
        their original order and casing, excluding any token whose lower-cased
        form is a stopword.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call instead of re-reading the stopword list for every token.
    stop_words = set(stop_words)
    # Delete every character listed in string.punctuation in a single pass.
    no_punctuation = message.translate(str.maketrans('', '', string.punctuation))
    # Compare case-insensitively, but keep each surviving token's original casing.
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [27]:
# Sanity-check the function on the first message of the dataset:
# the result should be a list of tokens without punctuation or stopwords.
get_rid_of_topwords(df_spamcollection['message'][0])

['Go',
 'jurong',
 'point',
 'crazy',
 'Available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'got',
 'amore',
 'wat']

In [28]:
# Show the unprocessed first message for comparison with the token list above
df_spamcollection.loc[0, 'message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
# start text processing with vectorizer
# Import the required library
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Learn the bag-of-words vocabulary from all messages, using get_rid_of_topwords
# as the analyzer that turns each message into tokens.
bag_of_words = (
    CountVectorizer(analyzer=get_rid_of_topwords)
    .fit(df_spamcollection['message'])
)

In [31]:
# Number of distinct tokens learned by the vectorizer (stored in its vocabulary_ attribute)
len(bag_of_words.vocabulary_)

11425

In [38]:
# Convert every message into its bag-of-words representation using the fitted vocabulary
message_in_bag_of_words = bag_of_words.transform(df_spamcollection['message'])

In [39]:
# Fit a tf-idf transformer on the bag-of-words representation of the messages
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = (
    TfidfTransformer()
    .fit(message_in_bag_of_words)
)

In [41]:
# Re-weight the bag-of-words counts with the fitted tf-idf transformer and show the
# resulting shape (one row per message, one column per vocabulary token).
message_tfidf = tfidf_transformer.transform(message_in_bag_of_words)
message_tfidf.shape

(5572, 11425)

In [43]:
# Train a multinomial Naive Bayes classifier on the tf-idf features, with the
# 'response' column (ham/spam) as the target.
# Note: the model is fitted on the entire dataset; no rows are held out for evaluation.
from sklearn.naive_bayes import MultinomialNB
spam_detector = (
    MultinomialNB()
    .fit(message_tfidf, df_spamcollection['response'])
)

In [46]:
# Spot-check the model on a single message: run it through the same
# bag-of-words -> tf-idf pipeline and compare the predicted label with the recorded one.
# Note: this message was part of the data the model was fitted on.
sample_index = 6
message = df_spamcollection['message'][sample_index]
bag_of_word_for_message = bag_of_words.transform([message])
tfidf_for_message = tfidf_transformer.transform(bag_of_word_for_message)
predicted_label = spam_detector.predict(tfidf_for_message)[0]
expected_label = df_spamcollection["response"][sample_index]
print(f' Predicted : {predicted_label}')
print(f' Expected : {expected_label}')

 Predicted : ham
 Expected : ham
