<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

1509    Sounds like something that someone testing me ...
2251    I am getting threats from your sales executive...
4648    God created gap btwn ur fingers so dat sum1 vr...
2820    Don't forget who owns you and who's private pr...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

1509    0
2251    0
4648    0
2820    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7697 sparse matrix of type '<class 'numpy.int64'>'
	with 59442 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
X_train_cv.shape

(4457, 7697)

In [18]:
v.get_feature_names_out()[1771]

'chess'

In [19]:
v.vocabulary_

{'sounds': 6314,
 'like': 4096,
 'something': 6279,
 'that': 6776,
 'someone': 6274,
 'testing': 6749,
 'me': 4389,
 'would': 7566,
 'sayy': 5905,
 'am': 938,
 'getting': 3123,
 'threats': 6833,
 'from': 3018,
 'your': 7663,
 'sales': 5862,
 'executive': 2698,
 'shifad': 6059,
 'as': 1089,
 'raised': 5538,
 'complaint': 1932,
 'against': 864,
 'him': 3431,
 'its': 3762,
 'an': 958,
 'official': 4890,
 'message': 4440,
 'god': 3165,
 'created': 2065,
 'gap': 3070,
 'btwn': 1547,
 'ur': 7161,
 'fingers': 2857,
 'so': 6257,
 'dat': 2172,
 'sum1': 6560,
 'vry': 7285,
 'special': 6336,
 'will': 7480,
 'fill': 2841,
 'those': 6824,
 'gaps': 3071,
 'by': 1595,
 'holding': 3460,
 'hands': 3309,
 'now': 4829,
 'plz': 5244,
 'dont': 2411,
 'ask': 1096,
 'he': 3362,
 'much': 4618,
 'between': 1337,
 'legs': 4054,
 'don': 2408,
 'forget': 2940,
 'who': 7456,
 'owns': 5023,
 'you': 7658,
 'and': 963,
 'private': 5408,
 'property': 5449,
 'are': 1052,
 'be': 1272,
 'my': 4653,
 'good': 3184,
 'boy':

In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
np.where(X_train_np[0]!=0)

(array([4096, 4389, 5905, 6274, 6279, 6314, 6749, 6776, 7566], dtype=int64),)

In [22]:
X_train[:4][1579]

KeyError: 1579

In [23]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       978
           1       0.99      0.93      0.96       137

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [27]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [28]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       978
           1       0.99      0.93      0.96       137

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

