Bag of words (a.k.a. BOW) is a technique used for text representation in natural language processing.


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [9]:
df.shape

(5572, 3)

In [4]:
# function for creating number columm for spam
# def get_spam(x):
#   if x == 'spam':
#     return 1
#   return 0

In [5]:
df['spam'] = df.Category.apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head(
  
)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Train Test Split

In [8]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2)


In [10]:
X_train.shape

(4457,)

In [11]:
X_test.shape

(1115,)

In [12]:
type(X_train)

pandas.core.series.Series

In [13]:
X_train[:4]

3428                    Haha okay... Today weekend leh...
4369    1 I don't have her number and 2 its gonna be a...
243     Okay. No no, just shining on. That was meant t...
2254                         Lol enjoy role playing much?
Name: Message, dtype: object

In [14]:
type(y_train)

pandas.core.series.Series

In [15]:
y_train[:4]

3428    0
4369    0
243     0
2254    0
Name: spam, dtype: int64

In [16]:
type(X_train.values)

numpy.ndarray

Create bag of words representation using CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv


<4457x7752 sparse matrix of type '<class 'numpy.int64'>'
	with 59031 stored elements in Compressed Sparse Row format>

In [19]:
# convert to array to view
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
X_train_cv.shape

(4457, 7752)

In [26]:
# to get entire bag of words in vocabulary
bow = v.get_feature_names_out()
bow

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [24]:
bow[:10]

array(['00', '000', '000pes', '008704050406', '0089', '0121',
       '01223585236', '01223585334', '0125698789', '02'], dtype=object)

In [27]:
bow[1000:1050]

array(['antha', 'anthony', 'anti', 'antibiotic', 'any', 'anybody',
       'anyhow', 'anymore', 'anyone', 'anyones', 'anyplaces', 'anythiing',
       'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'apart', 'apartment', 'apes', 'apeshit',
       'aphex', 'apo', 'apologetic', 'apologise', 'apologize', 'apology',
       'app', 'apparently', 'appeal', 'appear', 'appendix',
       'applausestore', 'applebees', 'application', 'apply',
       'appointment', 'appreciate', 'appreciated', 'approaches',
       'approaching', 'appropriate', 'approve', 'approved', 'approx',
       'appt', 'appy', 'april'], dtype=object)

In [28]:
bow.shape

(7752,)

In [30]:
dir(v)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',

In [32]:
v.vocabulary_

{'haha': 3285,
 'okay': 4903,
 'today': 6938,
 'weekend': 7437,
 'leh': 4060,
 'don': 2398,
 'have': 3346,
 'her': 3399,
 'number': 4846,
 'and': 972,
 'its': 3760,
 'gonna': 3174,
 'be': 1277,
 'massive': 4359,
 'pain': 5044,
 'in': 3637,
 'the': 6809,
 'ass': 1110,
 'rather': 5566,
 'not': 4818,
 'get': 3110,
 'involved': 3722,
 'if': 3595,
 'that': 6806,
 'possible': 5312,
 'no': 4783,
 'just': 3866,
 'shining': 6084,
 'on': 4919,
 'was': 7385,
 'meant': 4398,
 'to': 6931,
 'signing': 6153,
 'but': 1582,
 'sounds': 6336,
 'better': 1335,
 'lol': 4169,
 'enjoy': 2587,
 'role': 5808,
 'playing': 5233,
 'much': 4620,
 'wewa': 7472,
 'is': 3740,
 '130': 292,
 'iriver': 3734,
 '255': 370,
 'all': 922,
 '128': 288,
 'mb': 4384,
 'want': 7371,
 'sent': 6013,
 'lt': 4232,
 'gt': 3251,
 'mesages': 4439,
 'thats': 6808,
 'sorry': 6325,
 'hurts': 3555,
 'hui': 3538,
 'xin': 7656,
 'da': 2134,
 'lib': 4078,
 'please': 5237,
 'tell': 6748,
 'me': 4390,
 'of': 4882,
 'my': 4649,
 'car': 1655,
 'k

In [33]:
bow[5412]

'prize'

In [34]:
# convert email to np array
X_train_np = X_train_cv.toarray()
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
# first email is...
X_train_np[:4][0]

array([0, 0, 0, ..., 0, 0, 0])

In [36]:
# check for zero value

np.where(X_train_np[0]!=0)

(array([3285, 4060, 4903, 6938, 7437]),)

In [43]:
X_train[:4][3428]

'Haha okay... Today weekend leh...'

In [45]:
X_train_np[0][3285]

1

In [51]:
bow[3285]

'haha'

create naive bayes classifier

In [52]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

MultinomialNB()

In [54]:
X_test_cv = v.transform(X_test)
X_test_cv

<1115x7752 sparse matrix of type '<class 'numpy.int64'>'
	with 14052 stored elements in Compressed Sparse Row format>

evaluate perfomrance of model

In [56]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.96      0.90      0.93       141

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [58]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

using pipeline to skip coding

In [59]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
                ('vectorizer', CountVectorizer()),
                ('nb', MultinomialNB())
])

In [60]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [61]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.96      0.90      0.93       141

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

