In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled messages from the CSV file into a DataFrame and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count how many rows carry each label to see how balanced the classes are.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
# Encode the text label as a numeric target: 'ham' -> 0, 'spam' -> 1.
# Any label that is missing from the mapping would become NaN.
df['spam'] = df['Category'].map({'ham': 0, 'spam': 1})

In [7]:
# Check the frame's dimensions after adding the encoded label column.
df.shape

(5572, 3)

In [9]:
# Preview the frame again to confirm the numeric `spam` column is present.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (4825 ham vs. 747 spam) -- consider stratify=df["spam"].
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"], df["spam"], test_size=0.2, random_state=2024
)

In [11]:
# Number of messages in the training split.
X_train.shape

(4457,)

In [12]:
# Number of messages held out for testing.
X_test.shape

(1115,)

In [13]:
# Peek at the first four training messages; their index labels are shuffled
# because the split randomizes row order.
X_train.head(4)

903     I wonder if your phone battery went dead ? I h...
689            Thanks love. But am i doing torch or bold.
3749    A bit of Ur smile is my hppnss, a drop of Ur t...
1761               Nt yet chikku..simple habba..hw abt u?
Name: Message, dtype: object

<h3>Create a bag-of-words representation using CountVectorizer</h3>

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix.
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train)

In [22]:
# Inspect the sparse matrix summary (shape, dtype, number of stored elements).
X_train_cv

<4457x7773 sparse matrix of type '<class 'numpy.int64'>'
	with 59358 stored elements in Compressed Sparse Row format>

In [28]:
# Show the dense count vector of the first training message. Slicing the
# sparse matrix before calling toarray() densifies a single row instead of
# the whole 4457 x 7773 matrix just to display one row.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
# (number of training messages, vocabulary size) of the count matrix.
X_train_cv.shape

(4457, 7773)

In [37]:
# Reverse lookup: which token is stored at column 7610 of the count matrix?
v.get_feature_names_out()[7610]

'wonder'

In [36]:
# Token -> column-index mapping learned by the vectorizer.
# NOTE(review): this displays the entire dictionary (one entry per vocabulary
# token); consider showing only a handful of items to keep the output short.
v.vocabulary_

{'wonder': 7610,
 'if': 3624,
 'your': 7739,
 'phone': 5216,
 'battery': 1272,
 'went': 7492,
 'dead': 2196,
 'had': 3312,
 'to': 6961,
 'tell': 6778,
 'you': 7735,
 'love': 4260,
 'babe': 1200,
 'thanks': 6826,
 'but': 1587,
 'am': 943,
 'doing': 2409,
 'torch': 7015,
 'or': 5008,
 'bold': 1437,
 'bit': 1388,
 'of': 4923,
 'ur': 7230,
 'smile': 6286,
 'is': 3776,
 'my': 4701,
 'hppnss': 3550,
 'drop': 2472,
 'tear': 6762,
 'sorrow': 6358,
 'part': 5119,
 'heart': 3404,
 'life': 4136,
 'like': 4147,
 'mine': 4524,
 'wil': 7547,
 'care': 1664,
 'for': 2953,
 'forevr': 2959,
 'as': 1095,
 'goodfriend': 3201,
 'nt': 4880,
 'yet': 7723,
 'chikku': 1791,
 'simple': 6204,
 'habba': 3309,
 'hw': 3592,
 'abt': 772,
 'wot': 7637,
 'up': 7214,
 'then': 6851,
 'bitch': 1389,
 'so': 6315,
 'wats': 7441,
 'opinion': 4995,
 'him': 3457,
 'and': 969,
 'how': 3543,
 'character': 1740,
 'awesome': 1188,
 'be': 1292,
 'there': 6856,
 'in': 3664,
 'minute': 4534,
 'first': 2883,
 'time': 6928,
 'everythi

In [38]:
# Materialize the full dense array so individual rows and cells can be
# inspected in the following cells, then show the first row.
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [39]:
# Column indices of the tokens that actually occur in the first training
# message (i.e. the non-zero entries of its count vector).
np.nonzero(X_train_np[0])

(array([1200, 1272, 2196, 3312, 3624, 4260, 5216, 6778, 6961, 7492, 7610,
        7735, 7739], dtype=int64),)

In [43]:
# Show the raw text of the first training message, i.e. the row that
# X_train_np[0] encodes. Positional access avoids hard-coding the shuffled
# index label (903), which only holds for this particular random split.
X_train.iloc[0]

'I wonder if your phone battery went dead ? I had to tell you, I love you babe'

In [47]:
# Count stored for column 1200 in the first training message.
X_train_np[0, 1200]

1

In [48]:
# Token stored at column 1200, one of the non-zero columns of the first message.
v.get_feature_names_out()[1200]

'babe'

<h3>Train the Naive Bayes model</h3>

In [49]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [50]:
# Encode the test messages with the vocabulary already fitted on the training
# data (transform only, no re-fitting).
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [51]:
from sklearn.metrics import classification_report

# Score the held-out messages and print per-class precision, recall and F1.
y_pred = model.predict(X_test_cv)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [52]:
# Two hand-written example messages used as a quick sanity check of the model.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# Encode them with the already-fitted vectorizer and predict
# (0 = ham, 1 = spam, per the label mapping used for the `spam` column).
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using an sklearn Pipeline to reduce the number of lines of code</h3>

In [53]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification into one estimator so raw text can
# be passed straight to fit() / predict().
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [54]:
# Fit both pipeline steps (vectorizer, then classifier) on the raw training text.
clf.fit(X_train, y_train)

In [55]:
# Predict directly from raw test text with the pipeline and print the same
# per-class report as for the manually assembled model.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

