In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size= 0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
type(X_train.values)

numpy.ndarray

In [12]:
type(y_train)

pandas.core.series.Series

## Create a bag of words using CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
v = CountVectorizer()

X_train_cv = v.fit_transform(X_train)
X_train_cv

<4457x7726 sparse matrix of type '<class 'numpy.int64'>'
	with 59208 stored elements in Compressed Sparse Row format>

In [15]:
X_train_cv.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
X_train_cv.shape

(4457, 7726)

In [17]:
v.vocabulary_

{'am': 929,
 'on': 4894,
 'my': 4630,
 'way': 7385,
 'why': 7475,
 'come': 1891,
 'in': 3620,
 'between': 1327,
 'you': 7686,
 'people': 5116,
 'yun': 7705,
 'ah': 869,
 'now': 4804,
 'wkg': 7532,
 'where': 7457,
 'btw': 1539,
 'if': 3579,
 'go': 3145,
 'nus': 4821,
 'sc': 5913,
 'wana': 7350,
 'specialise': 6344,
 'wad': 7321,
 'so': 6263,
 'how': 3498,
 'the': 6791,
 'weather': 7397,
 'over': 4990,
 'there': 6809,
 'still': 6459,
 'have': 3342,
 'not': 4791,
 'checked': 1753,
 'it': 3732,
 'da': 2124,
 'marvel': 4336,
 'mobile': 4508,
 'play': 5210,
 'official': 4863,
 'ultimate': 7101,
 'spider': 6359,
 'man': 4302,
 'game': 3052,
 '50': 536,
 'ur': 7177,
 'right': 5762,
 'text': 6763,
 'to': 6912,
 '83338': 668,
 'for': 2923,
 'we': 7389,
 'll': 4136,
 'send': 5991,
 'free': 2968,
 '8ball': 720,
 'wallpaper': 7343,
 'company': 1903,
 'is': 3721,
 'very': 7250,
 'good': 3169,
 'environment': 2598,
 'terrific': 6752,
 'and': 954,
 'food': 2915,
 'really': 5582,
 'nice': 4734,
 'said'

In [18]:
v.get_feature_names_out()[3697]

'invaders'

In [19]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
np.where(X_train_np[0]!=0)

(array([ 929, 4630, 4894, 7385], dtype=int64),)

## Train the model using Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [22]:
X_test_cv = v.transform(X_test)

y_pred = model.predict(X_test_cv)

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.96      0.94      0.95       143

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [25]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

## Train the model using sklearn pipeline and reduce number of lines of code 

In [26]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [27]:
clf.fit(X_train, y_train)

In [28]:
y_pred = clf.predict(X_test)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.96      0.94      0.95       143

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

