# Goal: Predict sms a spam or ham

In [2]:
# [data source]
# https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [5]:
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

In [6]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [10]:
"""
How many are are spam and ham ?
"""
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [13]:
"""
Add new series representing spam, ham in numerical form.
"""
df['label_number'] = df.label.map({'ham':0, 'spam':1})

In [14]:
df.head()

Unnamed: 0,label,message,label_number
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [15]:
X = df.message
y = df.label_number

In [17]:
# One dim object
print(X.shape, y.shape)

(5572,) (5572,)


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

In [29]:
print(X_train.shape, X_test.shape) # one dim object

(3900,) (1672,)


In [30]:
print(y_train.shape, y_test.shape) # one dim object

(3900,) (1672,)


In [34]:
"""
    Purpose of CountVectorizer is to convert text string to matrix of tokens (words).
    CountVectorizer is not a model.
"""
vector = CountVectorizer()

In [36]:
"""
    Vector fit learn vocabalary in text .
"""
vector.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [37]:
"""
    vector transfor to convert text data to documentor matrix (matrix of words).
"""
X_train_dtmatric = vector.transform(X_train)

In [38]:
X_train_dtmatric.shape

(3900, 7125)

In [39]:
X_train_dtmatric

<3900x7125 sparse matrix of type '<class 'numpy.int64'>'
	with 51886 stored elements in Compressed Sparse Row format>

In [40]:
X_test_dtmatric = vector.transform(X_test)

In [41]:
X_test_dtmatric

<1672x7125 sparse matrix of type '<class 'numpy.int64'>'
	with 20507 stored elements in Compressed Sparse Row format>

### Multinomial Naive Bayes Model

In [53]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
nb_model = MultinomialNB()

In [46]:
nb_model.fit(X_train_dtmatric, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
y_prediction = nb_model.predict(X_test_dtmatric)

In [49]:
from sklearn import metrics

In [51]:
print(metrics.accuracy_score(y_test, y_prediction))

0.9808612440191388


In [52]:
print(metrics.confusion_matrix(y_test, y_prediction))

[[1431    8]
 [  24  209]]


### Logistic Reg Model

In [58]:
from sklearn.linear_model import LogisticRegression

In [61]:
model2 = LogisticRegression()

In [63]:
 model2.fit(X_train_dtmatric, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [64]:
y_prediction2 = model2.predict(X_test_dtmatric)

In [65]:
print(metrics.accuracy_score(y_test, y_prediction2))

0.979066985645933


In [66]:
print(metrics.confusion_matrix(y_test, y_prediction2))

[[1433    6]
 [  29  204]]


In [82]:
# False positive: these sms were ham but predicted spam
X_test[y_test < y_prediction]

1672                              Glad to see your reply.
4622                   Received, understood n acted upon!
4862                               Nokia phone is lovly..
574                                Waiting for your call.
216     Finally the match heading towards draw as your...
991                                          26th OF JULY
4729    I (Career Tel) have added u as a contact on IN...
4702                               I liked the new mobile
Name: message, dtype: object