<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [7]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [11]:
# def get_spam_number(x):
#     if x == 'spam':
#         return 1
#     else:
#         return 0
# 
# df['spam'] = df['Category'].apply(get_spam_number)

df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [12]:
df.shape

(5572, 3)

In [13]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [23]:
df[3:6]

Unnamed: 0,Category,Message,spam
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1


<h3>Train test split</h3>

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [25]:
X_train

3737    Hows the street where the end of library walk is?
4961            I want  &lt;#&gt;  rs da:)do you have it?
4556    7 wonders in My WORLD 7th You 6th Ur style 5th...
602     If u sending her home first it's ok lor. I'm n...
1165    Well. Im computerless. Time to make some oreo ...
                              ...                        
1841    Yeah. I got a list with only u and Joanna if I...
4127    SPJanuary Male Sale! Hot Gay chat now cheaper,...
3099             This is all just creepy and crazy to me.
4817    Anything is valuable in only 2 situations: Fir...
2077            Trust me. Even if isn't there, its there.
Name: Message, Length: 4457, dtype: object

In [15]:
X_train.shape

(4457,)

In [17]:
X_test.shape

(1115,)

In [18]:
type(X_train)

pandas.core.series.Series

In [19]:
X_train[:4]

3737    Hows the street where the end of library walk is?
4961            I want  &lt;#&gt;  rs da:)do you have it?
4556    7 wonders in My WORLD 7th You 6th Ur style 5th...
602     If u sending her home first it's ok lor. I'm n...
Name: Message, dtype: object

In [25]:
type(y_train)

pandas.core.series.Series

In [30]:
y_train[:4]

2169    0
99      0
4882    0
744     0
Name: spam, dtype: int64

In [28]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [33]:
a = X_train.values
a[0]

'Hows the street where the end of library walk is?'

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

X_train_cv.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [45]:
X_train_cv.shape

(4457, 7719)

In [58]:
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'bui

In [75]:
v.get_feature_names_out()[4873]

'of'

In [47]:
v.vocabulary_

{'hows': 3529,
 'the': 6786,
 'street': 6482,
 'where': 7459,
 'end': 2600,
 'of': 4873,
 'library': 4105,
 'walk': 7334,
 'is': 3758,
 'want': 7351,
 'lt': 4256,
 'gt': 3268,
 'rs': 5813,
 'da': 2158,
 'do': 2385,
 'you': 7679,
 'have': 3358,
 'it': 3770,
 'wonders': 7555,
 'in': 3647,
 'my': 4656,
 'world': 7574,
 '7th': 639,
 '6th': 615,
 'ur': 7177,
 'style': 6516,
 '5th': 573,
 'smile': 6226,
 '4th': 534,
 'personality': 5141,
 '3rd': 473,
 'nature': 4693,
 '2nd': 406,
 'sms': 6236,
 'and': 975,
 '1st': 335,
 'lovely': 4237,
 'friendship': 3012,
 'good': 3196,
 'morning': 4578,
 'dear': 2212,
 'if': 3606,
 'sending': 5991,
 'her': 3414,
 'home': 3477,
 'first': 2879,
 'ok': 4896,
 'lor': 4215,
 'not': 4814,
 'ready': 5570,
 'yet': 7668,
 'well': 7427,
 'im': 3623,
 'computerless': 1962,
 'time': 6871,
 'to': 6905,
 'make': 4327,
 'some': 6269,
 'oreo': 4967,
 'truffles': 7028,
 'nt': 4835,
 'chikku': 1802,
 'simple': 6143,
 'habba': 3294,
 'hw': 3573,
 'abt': 771,
 'said': 5852,
 

In [69]:
X_train_np = X_train_cv.toarray()
X_train_np[3737]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [70]:
np.where(X_train_np[3737]!=0)

(array([1017, 1062, 1136, 1283, 1597, 1650, 2385, 3268, 3402, 3606, 3968,
        4095, 4404, 4814, 4828, 5570, 5638, 6099, 6595, 6804, 6818, 6905,
        7195, 7455, 7655, 7679, 7684], dtype=int64),)

In [74]:
X_train[:4][3737]

'Hows the street where the end of library walk is?'

In [63]:
X_train_np[0][3737]

0

<h3>Train the naive bayes model</h3>

In [76]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [77]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [78]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.96      0.91      0.94       141

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [82]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_CountVector = v.transform(emails)
model.predict(emails_CountVector)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [83]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [84]:
clf.fit(X_train, y_train)

In [85]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.96      0.91      0.94       141

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

