NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df.shape

(5572, 2)

In [5]:
df['Spam'] = df['Category'].apply(lambda x : 1 if x=='spam' else 0)
# OR if we use a function
"""
def get_spam_number(x):
    if x == "spam":
        return 1
    return 0
 
df['Spam'] = df['Category'].apply(get_spam_number)
"""


'\ndef get_spam_number(x):\n    if x == "spam":\n        return 1\n    return 0\n\ndf[\'Spam\'] = df[\'Category\'].apply(get_spam_number)\n'

In [6]:
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df['Message'],df['Spam'],test_size= 0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
type(X_train.values)

numpy.ndarray

In [12]:
X_train[:4]

4474                             S but not able to sleep.
2441    Great. I'm in church now, will holla when i ge...
3945     Until 545 lor... Ya, can go 4 dinner together...
878     Sunshine Quiz Wkly Q! Win a top Sony DVD playe...
Name: Message, dtype: object

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

v= CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59540 stored elements and shape (4457, 7702)>

In [14]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
v.get_feature_names_out()[1000:1050]

array(['anybody', 'anyhow', 'anymore', 'anyone', 'anyones', 'anyplaces',
       'anythiing', 'anythin', 'anything', 'anythingtomorrow', 'anytime',
       'anyway', 'anyways', 'anywhere', 'aom', 'apart', 'apartment',
       'apes', 'apeshit', 'apnt', 'apo', 'apologise', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'applying',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaches', 'approaching', 'appropriate', 'approve', 'approved',
       'approx', 'apps', 'appt', 'appy', 'april', 'aproach', 'apt'],
      dtype=object)

In [16]:
v.get_feature_names_out().shape

(7702,)

In [17]:
np.where(X_train_np[0]!=0)

(array([ 762, 1583, 4811, 6166, 6884]),)

In [18]:
v.get_feature_names_out()[1861]

'closer'

In [19]:
X_train_np[0][1268]

np.int64(0)

In [20]:
X_train[3456]

'No need lar. Jus testing e phone card. Dunno network not gd i thk. Me waiting 4 my sis 2 finish bathing so i can bathe. Dun disturb u liao u cleaning ur room.'

In [21]:
from sklearn.naive_bayes import MultinomialNB


model = MultinomialNB()
model.fit(X_train_np,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [22]:
X_test_cv = v.transform(X_test) 
# only transform not fit transform bec we use pre learned parameters on unseen Data



In [23]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       974
           1       0.98      0.95      0.96       141

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [24]:
# we can use a convenient API
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [25]:
clf.fit(X_train,y_train)

0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [26]:
print(classification_report(y_test,clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       974
           1       0.98      0.95      0.96       141

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

