In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('spam_emails.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x : 1 if x == "spam" else 0)

In [5]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
df.drop('Category', axis = 'columns')

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will �_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [7]:
X = df.Message
y = df.spam

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values) 

In [11]:
X_train_cv.toarray()[: 2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
X_train_cv.shape # I have 7580 words in my vocab

(4457, 7580)

In [14]:
v.vocabulary_

{'no': 4687,
 'this': 6701,
 'is': 3672,
 'kallis': 3804,
 'home': 3405,
 'ground': 3189,
 'amla': 906,
 'town': 6850,
 'durban': 2452,
 'am': 898,
 'in': 3565,
 'escape': 2592,
 'theatre': 6667,
 'now': 4737,
 'going': 3109,
 'to': 6783,
 'watch': 7238,
 'kavalan': 3816,
 'few': 2761,
 'minutes': 4405,
 'we': 7258,
 'walked': 7202,
 'from': 2959,
 'my': 4563,
 'moms': 4464,
 'right': 5658,
 'on': 4826,
 'stagwood': 6294,
 'pass': 4991,
 'winterstone': 7375,
 'left': 3978,
 'victors': 7121,
 'hill': 3361,
 'address': 781,
 'lt': 4158,
 'gt': 3196,
 'dunno': 2451,
 'they': 6685,
 'close': 1813,
 'oredi': 4875,
 'not': 4724,
 'ma': 4199,
 'fan': 2711,
 'yo': 7540,
 'im': 3539,
 'by': 1553,
 'work': 7433,
 'its': 3688,
 'ur': 7047,
 'luck': 4163,
 'love': 4138,
 'someone': 6165,
 'fortune': 2899,
 'the': 6665,
 'one': 4830,
 'who': 7337,
 'loves': 4146,
 'but': 1539,
 'he': 3297,
 'also': 890,
 'knows': 3872,
 'about': 724,
 'lunch': 4170,
 'menu': 4354,
 'only': 4836,
 'da': 2116,
 'know

In [16]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_cv, y_train)

MultinomialNB()

In [17]:
X_test_cv = v.transform(X_test)

In [19]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       949
           1       0.99      0.92      0.95       166

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [20]:
emails = ['Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!']

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [21]:
# Now I can use Pipeline to do all this steps just using simple code
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [22]:
clf.fit(X_train, y_train) # i don`t need to use X_train_cv because I supply vectorizer in my classifier   

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [23]:
y_pred = clf.predict(X_test)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       949
           1       0.99      0.92      0.95       166

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

