# NLP Tutorial: Text Representation - Bag Of Words (BOW)


In [1]:
import pandas as pd
import numpy as np


In [2]:
df=pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam']=df['Category'].apply(lambda x: 1 if x=="spam" else 0)
print(df['spam'])

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: int64


In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Train test split

In [7]:
print(df.spam)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)


In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

In [12]:
X_train[:4]

82                       Ok i am on the way to home hi hi
3012                Hm good morning, headache anyone? :-)
1176    Horrible u eat macs eat until u forgot abt me ...
4509                             Not able to do anything.
Name: Message, dtype: object

In [13]:
type(y_train)

pandas.core.series.Series

In [24]:
y_train[:4]

603     0
5221    0
776     0
1131    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

# Create bag of words representation using CountVectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer()
X_train_cv=v.fit_transform(X_train.values)
X_train_cv



<4457x7780 sparse matrix of type '<class 'numpy.int64'>'
	with 59215 stored elements in Compressed Sparse Row format>

In [22]:
X_train_cv.toarray()[:2][0]


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
X_train_cv.shape

(4457, 7780)

In [24]:
v.get_feature_names_out()[1771]

'checkin'

In [25]:
v.vocabulary_

{'ok': 4952,
 'am': 946,
 'on': 4971,
 'the': 6859,
 'way': 7458,
 'to': 6981,
 'home': 3490,
 'hi': 3442,
 'hm': 3468,
 'good': 3194,
 'morning': 4616,
 'headache': 3384,
 'anyone': 1007,
 'horrible': 3518,
 'eat': 2540,
 'macs': 4320,
 'until': 7231,
 'forgot': 2962,
 'abt': 775,
 'me': 4428,
 'already': 936,
 'rite': 5838,
 'take': 6734,
 'so': 6321,
 'long': 4209,
 'reply': 5759,
 'thk': 6900,
 'it': 3774,
 'more': 4612,
 'toot': 7024,
 'than': 6844,
 'b4': 1193,
 'prepared': 5427,
 'now': 4875,
 'wat': 7442,
 'shall': 6103,
 'not': 4861,
 'able': 766,
 'do': 2389,
 'anything': 1012,
 'hey': 3438,
 'what': 7520,
 'are': 1062,
 'you': 7746,
 'doing': 2415,
 'no': 4827,
 'pa': 5083,
 'chile': 1805,
 'please': 5289,
 'only': 4982,
 'lt': 4269,
 'decimal': 2218,
 'gt': 3270,
 'hour': 3530,
 'drive': 2477,
 'for': 2951,
 'come': 1924,
 'down': 2449,
 'all': 923,
 'time': 6947,
 'and': 971,
 'will': 7563,
 'be': 1284,
 'subletting': 6596,
 'feb': 2813,
 'april': 1051,
 'audition': 1153,


In [35]:
print(dir(v))

['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_build_request_for_signature', '_char_ngrams', '_char_wb_ngrams', '_check_feature_names', '_check_n_features', '_check_stop_words_consistency', '_check_vocabulary', '_count_vocab', '_get_default_requests', '_get_metadata_request', '_get_param_names', '_get_tags', '_limit_features', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_sort_features', '_stop_words_id', '_validate_data', '_validate_ngram_range', '_validate_params', '_validate_vocabulary', '_warn_for_unused_params', '_white_spaces', '_word_ngrams', 'analyzer', 'binary'

In [26]:
X_train_np = X_train_cv.toarray()
X_train_np[0]


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
np.where(X_train_np[0]!=0)

(array([ 946, 3442, 3490, 4952, 4971, 6859, 6981, 7458], dtype=int64),)

In [28]:
X_train[1579]

"How to Make a girl Happy? It's not at all difficult to make girls happy. U only need to be... 1. A friend 2. Companion 3. Lover 4. Chef . . .  &lt;#&gt; . Good listener  &lt;#&gt; . Organizer  &lt;#&gt; . Good boyfriend  &lt;#&gt; . Very clean  &lt;#&gt; . Sympathetic  &lt;#&gt; . Athletic  &lt;#&gt; . Warm . . .  &lt;#&gt; . Courageous  &lt;#&gt; . Determined  &lt;#&gt; . True  &lt;#&gt; . Dependable  &lt;#&gt; . Intelligent . . .  &lt;#&gt; . Psychologist  &lt;#&gt; . Pest exterminator  &lt;#&gt; . Psychiatrist  &lt;#&gt; . Healer . .  &lt;#&gt; . Stylist  &lt;#&gt; . Driver . . Aaniye pudunga venaam.."

In [29]:
X_train_np[0][1771]

0

# Train the naive bayes model

In [30]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [33]:
X_test_cv = v.transform(X_test)


# Evaluate Performance

In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       968
           1       0.98      0.90      0.94       147

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [35]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)


array([0, 1], dtype=int64)

# Train the model using sklearn pipeline and reduce number of lines of code


In [38]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])


In [39]:
clf.fit(X_train, y_train)

In [40]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       968
           1       0.98      0.90      0.94       147

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

