In [1]:
#Pre-req
import string
from nltk.corpus import stopwords

def textPreprocessing(data):
    #Remove Punctuations
    removeP = [ c for c in data if c not in string.punctuation ] #List Comprehension
    sentences = ''.join(removeP)
    #Convert Sentences to Words
    words = sentences.split(" ")
    #Remove Stopwords
    vocabulary = [ word for word in words if word not in stopwords.words('english') ]
    #Return Vocabulary
    return vocabulary

In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
#SKLearn allows you to create BOW using WordVector

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing) #Preprocessing + BOW
finalWordVectorCreator = wordVector.fit(data['message']) # it allows np.array or series of data as input

In [6]:
#Vocabulary with Document Freq of each unique word !!!!
finalWordVectorCreator.vocabulary_

{'Go': 2114,
 'jurong': 7748,
 'point': 9111,
 'crazy': 5962,
 'Available': 1134,
 'bugis': 5411,
 'n': 8530,
 'great': 7130,
 'world': 11357,
 'la': 7861,
 'e': 6410,
 'buffet': 5410,
 'Cine': 1522,
 'got': 7099,
 'amore': 4846,
 'wat': 11159,
 'Ok': 3178,
 'lar': 7894,
 'Joking': 2536,
 'wif': 11266,
 'u': 10892,
 'oni': 8784,
 'Free': 1994,
 'entry': 6524,
 '2': 424,
 'wkly': 11317,
 'comp': 5812,
 'win': 11278,
 'FA': 1883,
 'Cup': 1590,
 'final': 6750,
 'tkts': 10706,
 '21st': 444,
 'May': 2897,
 '2005': 431,
 'Text': 4096,
 '87121': 872,
 'receive': 9446,
 'questionstd': 9353,
 'txt': 10880,
 'rateTCs': 9394,
 'apply': 4924,
 '08452810075over18s': 74,
 'U': 4222,
 'dun': 6397,
 'say': 9748,
 'early': 6415,
 'hor': 7379,
 'c': 5454,
 'already': 4822,
 'Nah': 3048,
 'I': 2363,
 'dont': 6316,
 'think': 10627,
 'goes': 7070,
 'usf': 10993,
 'lives': 8035,
 'around': 4970,
 'though': 10644,
 'FreeMsg': 1996,
 'Hey': 2291,
 'darling': 6057,
 '3': 544,
 'weeks': 11205,
 'word': 11344,
 

In [10]:
len(finalWordVectorCreator.vocabulary_)

11619

In [7]:
bow = finalWordVectorCreator.transform(data['message'])

In [8]:
bow

<5572x11619 sparse matrix of type '<class 'numpy.int64'>'
	with 57067 stored elements in Compressed Sparse Row format>

In [None]:
# the no.od rows = 5572
# the no.of columns = 11619

In [9]:
bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# why Bag of words having data in sparse matrix?
# Ans: for effective memory storage and effictive utilization of RAM.

In [11]:
bow1 = bow.toarray()

In [12]:
bow1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# here our goal is each string value in the document is convert to numeric value
# if there are more number of columns, each column (feature) will have different bag of words.
# so each column need to be treated seperately.

In [14]:
bowCol1 = finalWordVectorCreator.transform(data['message'])

In [15]:
bowCol1

<5572x11619 sparse matrix of type '<class 'numpy.int64'>'
	with 57067 stored elements in Compressed Sparse Row format>

In [16]:
#TF IDF

from sklearn.feature_extraction.text import TfidfTransformer
tfidfObject = TfidfTransformer().fit(bow)

In [17]:
finalMessageFeature = tfidfObject.transform(bow)

In [18]:
finalMessageFeature

<5572x11619 sparse matrix of type '<class 'numpy.float64'>'
	with 57067 stored elements in Compressed Sparse Row format>

In [19]:
f1 = finalMessageFeature.toarray()

In [20]:
f1

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.1538349, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [21]:
# Observe there is surprise to see the value of term frequency as 0.1538349

In [None]:
#Create Train Test Split
# Here data is not splitted for testing intentionally,
# so we can try with split the data and do it again as a home work

In [23]:
#Create Model
# naive_bayes is having different type of fucntions for different requirement, but here ,
# the feature is in the form of text which is pure string , thats why we use MultinomialNB
from sklearn.naive_bayes import MultinomialNB 
model = MultinomialNB()
model.fit(finalMessageFeature, data['label'])

MultinomialNB()

In [24]:
model.score(finalMessageFeature,data['label'])

0.9791816223977028

In [None]:
# based on the results meaning score is good but we do not know whether the data is generalized 
# / normalised as we did not check the training data with testing data.
# the complete data is used to train the model, and accuracy score is good enough with the obtained model score

In [25]:
#Check this model with input

inputSMS = input("Enter SMS Content: ")
preprocessText = textPreprocessing(inputSMS)
vector = finalWordVectorCreator.transform(preprocessText) # vector is nothing but BOW
finalFeature = tfidfObject.transform(vector)
pred = model.predict(finalFeature)[0]

print("Given SMS is ",pred)

Enter SMS Content:  This is the mail received for celebration od birthday with new gift


Given SMS is  ham


In [26]:
inputSMS = input("Enter SMS Content: ")
preprocessText = textPreprocessing(inputSMS)
vector = finalWordVectorCreator.transform(preprocessText) # vector is nothing but BOW
finalFeature = tfidfObject.transform(vector)
pred = model.predict(finalFeature)[0]

print("Given SMS is ",pred)

Enter SMS Content:  Win lottery guaranteed!!1


Given SMS is  spam
