# ***Text Representation - Bag Of Words***

In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [7]:
df.shape

(5572, 3)

In [8]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Train test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [11]:
X_train.shape

(4457,)

In [12]:
X_test.shape

(1115,)

In [13]:
type(X_train)

pandas.core.series.Series

In [14]:
X_train[:4]

2418    Oh... Lk tt den we take e one tt ends at cine ...
922     On ma way to school. Can you pls send me ashle...
3071    I'm now but have to wait till 2 for the bus to...
3033    Jokin only lar... :-) depends on which phone m...
Name: Message, dtype: object

In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
type(X_train.values)

numpy.ndarray

***Create bag of words representation using CountVectorizer***

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7718 sparse matrix of type '<class 'numpy.int64'>'
	with 59408 stored elements in Compressed Sparse Row format>

In [18]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
X_train_cv.shape

(4457, 7718)

In [20]:
v.get_feature_names_out()[1771]

'cheery'

In [21]:
v.vocabulary_

{'oh': 4880,
 'lk': 4145,
 'tt': 7039,
 'den': 2238,
 'we': 7386,
 'take': 6665,
 'one': 4906,
 'ends': 2584,
 'at': 1118,
 'cine': 1823,
 'lor': 4192,
 'dun': 2483,
 'wan': 7344,
 'yogasana': 7676,
 'oso': 4963,
 'can': 1637,
 'on': 4901,
 'ma': 4268,
 'way': 7381,
 'to': 6904,
 'school': 5905,
 'you': 7679,
 'pls': 5228,
 'send': 5974,
 'me': 4384,
 'ashley': 1090,
 'number': 4826,
 'now': 4813,
 'but': 1582,
 'have': 3344,
 'wait': 7324,
 'till': 6870,
 'for': 2933,
 'the': 6789,
 'bus': 1575,
 'pick': 5173,
 'jokin': 3830,
 'only': 4912,
 'lar': 4011,
 'depends': 2250,
 'which': 7460,
 'phone': 5160,
 'my': 4642,
 'father': 2782,
 'get': 3105,
 'most': 4569,
 'beautiful': 1283,
 'girl': 3126,
 'ive': 3758,
 'ever': 2662,
 'seen': 5956,
 'baby': 1195,
 'come': 1912,
 'and': 959,
 'in': 3624,
 'common': 1923,
 'room': 5787,
 'if': 3588,
 'don': 2402,
 'your': 7685,
 'prize': 5391,
 'will': 7491,
 'go': 3149,
 'another': 980,
 'customer': 2124,
 'www': 7613,
 'biz': 1382,
 '18': 315,


In [22]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
np.where(X_train_np[0]!=0)

(array([1118, 1637, 1823, 2238, 2483, 2584, 4145, 4192, 4880, 4906, 4963,
        6665, 7039, 7344, 7386, 7676], dtype=int64),)

In [24]:
X_train_np[0][1771]

0

***Train the naive bayes model***

In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [26]:
X_test_cv = v.transform(X_test)

***Evaluate Performance***

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       975
           1       0.97      0.90      0.93       140

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

***Train the model using sklearn pipeline and reduce number of lines of code***

In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [30]:
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       975
           1       0.97      0.90      0.93       140

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

