# SMS Spam Detection

## Agenda

### Working with text data
- Representing text as data
- Reading SMS data
- Vectorizing SMS data
- Examining the tokens and their counts
- Bonus: Calculating the "spamminess" of each token

### Naive Bayes
- Building a Naive Bayes model
- Comparing Naive Bayes with logistic regression

In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

In [2]:
#grabing data
data = pd.read_csv('D:\python notebooks\sms-spam_detection\spam.csv',encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#changing column names
data = data.rename(columns = { 'v1':'label', 'v2':'sms' })
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#checking null values and value count
print(data.isnull().any())
data.label.value_counts()

label    False
sms      False
dtype: bool


ham     4825
spam     747
Name: label, dtype: int64

In [5]:
#converting label into numeric value
data.label = data.label.map({'ham':0,'spam':1})
data.head()

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#spliting data into training and test set
X = data.sms
y = data.label

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
print(X_train.shape)
print(X_test.shape)

(3900,)
(1672,)


## Vectorizatioin of words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

#instantiate the vector
vec = CountVectorizer(max_df=0.90,min_df = 0.001,stop_words='english',strip_accents = 'unicode')

In [8]:
#learn training data vocabulary and convert it into document term matrix
train_vec = vec.fit_transform(X_train)
train_vec

<3900x1523 sparse matrix of type '<class 'numpy.int64'>'
	with 22898 stored elements in Compressed Sparse Row format>

In [9]:
test_vec = vec.transform(X_test)
test_vec

<1672x1523 sparse matrix of type '<class 'numpy.int64'>'
	with 9464 stored elements in Compressed Sparse Row format>

### Examine tokens and count in vectors

In [10]:
#storing token names
train_x_token = vec.get_feature_names()
print(train_x_token[-51:])

#view token array
train_vec.toarray()

['wkly', 'woke', 'woman', 'won', 'wonder', 'wonderful', 'wondering', 'wont', 'word', 'words', 'work', 'workin', 'working', 'world', 'worried', 'worries', 'worry', 'worse', 'worth', 'wot', 'wouldn', 'wow', 'write', 'wrong', 'www', 'xchat', 'xmas', 'xx', 'xxx', 'xy', 'ya', 'yahoo', 'yan', 'yar', 'yay', 'yeah', 'year', 'years', 'yep', 'yes', 'yest', 'yesterday', 'yijue', 'ym', 'yo', 'yoga', 'yr', 'yrs', 'yun', 'yup', 'zed']


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
train_x_token_count = np.sum(train_vec.toarray(),axis=0)
train_x_token_count

array([ 6, 19,  5, ...,  5, 33,  6], dtype=int64)

In [12]:
#create df with tokens and counts
df = pd.DataFrame({'token': train_x_token,'count':train_x_token_count})
df.head()

Unnamed: 0,token,count
0,0,6
1,0,19
2,2,5
3,3,9
4,4,9


### Calculating spam for each token

In [13]:
#Create saperate dataframe for spam
sms_ham = data[data.label == 0]
sms_spam = data[data.label == 1]

In [14]:
#learn the vocabulary of all messages
vec.fit(data.sms)
tokens = vec.get_feature_names()

In [15]:
#create document term matrix for ham and spam
ham_dtm = vec.transform(sms_ham.sms)
spam_dtm = vec.transform(sms_spam.sms)

In [16]:
#Count the token of ham messages 
ham_token_count = np.sum(ham_dtm.toarray(), axis=0)
spam_token_count = np.sum(spam_dtm.toarray(),axis=0)

In [17]:
#create dataframe of token with seperate ham and spam counts
df1 = pd.DataFrame({'token': tokens,'ham_count': ham_token_count, 'spam_count': spam_token_count})
df1

Unnamed: 0,token,ham_count,spam_count
0,00,0,10
1,000,0,29
2,02,0,8
3,03,0,13
4,04,0,12
...,...,...,...
1356,yoga,7,0
1357,yr,3,11
1358,yrs,5,3
1359,yup,43,0


In [18]:
#lets add 1 to ham and spam count to avoid dividing by zero
df1['ham_count'] = df1['ham_count'] + 1
df1['spam_count'] = df1['spam_count'] + 1

In [19]:
df1['spam_ratio'] = df1['spam_count']/df1['ham_count']
df1.sort_values('spam_ratio')

Unnamed: 0,token,ham_count,spam_count,spam_ratio
500,gt,319,1,0.003135
704,lt,317,1,0.003155
690,lor,163,1,0.006135
302,da,151,1,0.006623
643,later,136,1,0.007353
...,...,...,...,...
30,18,1,52,52.000000
1188,tone,1,61,61.000000
26,150p,1,72,72.000000
921,prize,1,94,94.000000


## Building Naive Bayes Model

In [20]:
#The multinomial Naive Bayes classifier is suitable for classification with discrete features

nb = MultinomialNB()
nb.fit(train_vec,y_train)

MultinomialNB()

In [21]:
# make class predictions for test_x_dtm
pred_y = nb.predict(test_vec)

In [22]:
# calculate accuracy
accuracy_score(y_test,pred_y)

0.986244019138756

In [23]:
confusion_matrix(y_test,pred_y)

array([[1446,   13],
       [  10,  203]], dtype=int64)

In [24]:
roc_auc_score(y_test,pred_y)

0.9720707153590954

In [25]:
# print message text for the false positives
X_test[y_test < pred_y]

5159                         No but the bluray player can
5044    We have sent JD for Customer Service cum Accou...
1081                    Can u get pic msgs to your phone?
3362                                   Can... I'm free...
1289    Hey...Great deal...Farm tour 9am to 5pm $95/pa...
2161    Is she replying. Has boye changed his phone nu...
4771    Hi, Mobile no.  &lt;#&gt;  has added you in th...
2430                           How was txting and driving
4633          These won't do. Have to move on to morphine
148                     K..i deleted my contact that why?
1505    Total video converter free download type this ...
4417                           When you get free, call me
3118                             Stop knowing me so well!
Name: sms, dtype: object

In [26]:
# print message text for the false negatives
X_test[y_test > pred_y]

1429    For sale - arsenal dartboard. Good condition b...
2557    This message is brought to you by GMW Ltd. and...
868     Hello. We need some posh birds and chaps to us...
3528    Xmas & New Years Eve tickets are now on sale f...
1939    More people are dogging in your area now. Call...
1637    0A$NETWORKS allow companies to bill for SMS, s...
2698                            FROM 88066 LOST å£12 HELP
5381           You have 1 new message. Call 0207-083-6089
954             Filthy stories and GIRLS waiting for your
2939     You have 1 new message. Please call 08712400200.
Name: sms, dtype: object