<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [99]:
import pandas as pd
import numpy as np

In [100]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [101]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [102]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [103]:
df.shape

(5572, 3)

In [104]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [106]:
y_train.value_counts()

spam
0    3872
1     585
Name: count, dtype: int64

In [107]:
X_test.shape

(1115,)

In [108]:
type(X_train)

pandas.core.series.Series

In [109]:
X_train[:4]

1783    My uncles in Atlanta. Wish you guys a great se...
4731    I dont know ask to my brother. Nothing problem...
4499    Latest Nokia Mobile or iPOD MP3 Player +£400 p...
4378                            How much is torch in 9ja.
Name: Message, dtype: object

In [110]:
type(y_train)

pandas.core.series.Series

In [111]:
y_train[:4]

1783    0
4731    0
4499    1
4378    0
Name: spam, dtype: int64

In [112]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [113]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7723 sparse matrix of type '<class 'numpy.int64'>'
	with 59143 stored elements in Compressed Sparse Row format>

In [114]:
X_train_cv.toarray()[:2][0]

MemoryError: Unable to allocate 263. MiB for an array with shape (4457, 7723) and data type int64

In [None]:
X_train_cv.shape

(4457, 7757)

In [None]:
v.get_feature_names_out()[1771]

'chip'

In [None]:
v.vocabulary_

{'congrats': 1937,
 'treat': 7037,
 'pending': 5141,
 'am': 925,
 'not': 4822,
 'on': 4933,
 'mail': 4312,
 'for': 2921,
 'days': 2163,
 'will': 7531,
 'once': 4937,
 'thru': 6880,
 'respect': 5748,
 'mother': 4584,
 'at': 1101,
 'home': 3457,
 'check': 1729,
 'mails': 4315,
 'haf': 3285,
 'found': 2948,
 'him': 3421,
 'feel': 2786,
 'so': 6282,
 'stupid': 6545,
 'da': 2122,
 'cam': 1599,
 'was': 7401,
 'working': 7607,
 'take': 6691,
 'something': 6304,
 'pain': 5050,
 'if': 3587,
 'it': 3738,
 'moves': 4596,
 'however': 3509,
 'to': 6936,
 'any': 981,
 'side': 6156,
 'in': 3623,
 'the': 6819,
 'next': 4761,
 '6hrs': 591,
 'see': 5990,
 'doctor': 2367,
 'kano': 3881,
 'whr': 7513,
 'maga': 4303,
 'ok': 4916,
 'darlin': 2145,
 'supose': 6619,
 'just': 3860,
 'worry': 7615,
 'too': 6975,
 'much': 4626,
 'have': 3349,
 'do': 2362,
 'some': 6296,
 'film': 2827,
 'stuff': 6542,
 'my': 4657,
 'mate': 4372,
 'and': 951,
 'then': 6830,
 'babysit': 1177,
 'again': 848,
 'but': 1552,
 'you': 77

In [None]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
np.where(X_train_np[0]!=0)

(array([ 925, 1101, 1729, 1937, 2163, 2921, 3457, 4312, 4315, 4584, 4822,
        4933, 4937, 5141, 5748, 6880, 7037, 7531], dtype=int64),)

In [None]:
X_train[:4][1579]

KeyError: 1579

In [None]:
X_train_np[0][1771]

1

<h3>Train the naive bayes model</h3>

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       958
           1       0.35      0.10      0.15       157

    accuracy                           0.85      1115
   macro avg       0.61      0.53      0.53      1115
weighted avg       0.79      0.85      0.81      1115



In [None]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 0], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [None]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       958
           1       0.97      0.89      0.93       157

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

