In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
df.shape

(5572, 3)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train[:5]

2766             and  picking them up from various points
5460    December only! Had your mobile 11mths+? You ar...
3048    Ok. Not much to do here though. H&M Friday, ca...
2341    I will take care of financial problem.i will h...
1097    Dear Subscriber ur draw 4 £100 gift voucher wi...
Name: Message, dtype: object

In [10]:
y_train[:5]

2766    0
5460    1
3048    0
2341    0
1097    1
Name: spam, dtype: int64

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

In [12]:
X_train_cv = v.fit_transform(X_train)
X_train_cv

<4457x7700 sparse matrix of type '<class 'numpy.int64'>'
	with 59293 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
v.get_feature_names()[1500]

'brighten'

In [24]:
v.vocabulary_

{'and': 932,
 'picking': 5205,
 'them': 6796,
 'up': 7152,
 'from': 3018,
 'various': 7218,
 'points': 5283,
 'december': 2190,
 'only': 4947,
 'had': 3293,
 'your': 7674,
 'mobile': 4547,
 '11mths': 259,
 'you': 7670,
 'are': 1020,
 'entitled': 2610,
 'to': 6905,
 'update': 7155,
 'the': 6788,
 'latest': 4026,
 'colour': 1893,
 'camera': 1615,
 'for': 2933,
 'free': 2978,
 'call': 1595,
 'co': 1864,
 'on': 4935,
 '08002986906': 56,
 'ok': 4917,
 'not': 4825,
 'much': 4633,
 'do': 2366,
 'here': 3413,
 'though': 6836,
 'friday': 2997,
 'cant': 1629,
 'wait': 7319,
 'dunno': 2481,
 'wot': 7575,
 'hell': 3396,
 'im': 3612,
 'gonna': 3183,
 'another': 951,
 'weeks': 7412,
 'become': 1260,
 'slob': 6206,
 'oh': 4912,
 'already': 895,
 'done': 2400,
 'that': 6784,
 'will': 7484,
 'take': 6659,
 'care': 1640,
 'of': 4893,
 'financial': 2849,
 'problem': 5418,
 'help': 3400,
 'dear': 2182,
 'subscriber': 6538,
 'ur': 7167,
 'draw': 2433,
 '100': 241,
 'gift': 3132,
 'voucher': 7290,
 'entered

In [25]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
np.where(X_train_np[0]!=0)

(array([ 932, 3018, 5205, 5283, 6796, 7152, 7218], dtype=int64),)

In [27]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

MultinomialNB()

In [28]:
X_test_cv = v.transform(X_test)

In [29]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       953
           1       0.98      0.92      0.95       162

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [30]:
emails = [
    'Hey tabish, can we get together to watch pak v ind match tomorrow?',
    'Upto 20% discount on all products, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [31]:
# Model can also be trained using the pre-trained sklearn pipeline


from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [32]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [33]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       953
           1       0.98      0.92      0.95       162

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

