In [40]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, precision_score, recall_score, f1_score

### Read data

In [2]:
data = pd.read_table('spam.csv', encoding='latin-1', header=None)

In [3]:
df = pd.DataFrame(data=(s.split(',', 1) for s in data[0][1:]), columns=['class','text'])
df['class_id'] = df['class']=='spam' # as values from the first column are unclean
                                    # we move our target value to boolean 'class_id' column

In [4]:
df.head()

Unnamed: 0,class,text,class_id
0,ham,"""Go until jurong point, crazy.. Available only...",False
1,ham,"Ok lar... Joking wif u oni...,,,",False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"""Nah I don't think he goes to usf, he lives ar...",False


In [5]:
del df['class']
df.head()

Unnamed: 0,text,class_id
0,"""Go until jurong point, crazy.. Available only...",False
1,"Ok lar... Joking wif u oni...,,,",False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"""Nah I don't think he goes to usf, he lives ar...",False


In [6]:
df.shape

(5574, 2)

In [7]:
X = df['text'].values
Y = df['class_id'].values

### Data partition into train and test sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

### Creating features with BOW

In [9]:
vectorizer = CountVectorizer()

In [10]:
vectorizer.fit(X_train)
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [11]:
print(X_train_bow.shape)
print(X_test_bow.shape)

(5016, 8255)
(558, 8255)


In [12]:
X_train_bow[0,:]

<1x8255 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [13]:
spam_indexes_train = np.arange(y_train.shape[0])[y_train == True]
ham_indexes_train = np.arange(y_train.shape[0])[y_train == False]
# spam_indexes_test = np.arange(y_test.shape[0])[y_test == True]
# ham_indexes_test = np.arange(y_test.shape[0])[y_test == False]

In [14]:
X_train_bow_spam = X_train_bow[spam_indexes_train, :]
X_train_bow_ham = X_train_bow[ham_indexes_train, :]
# X_test_bow_spam = X_test_bow[spam_indexes_test, :]
# X_test_bow_ham = X_test_bow[ham_indexes_test, :]

### Getting values to calculate Naive Bayes 

In [15]:
n_train_spam = X_train_bow_spam.shape[0]
n_train_ham = X_train_bow_ham.shape[0]

In [16]:
prob_spam = n_train_spam / (n_train_ham + n_train_spam)
prob_ham = 1 - prob_spam

In [17]:
prob_ham

0.8670255183413078

In [18]:
dist_spam = np.sum(X_train_bow_spam, axis=0)
dist_ham = np.sum(X_train_bow_ham, axis=0)

In [19]:
dist_ham.shape

(1, 8255)

In [20]:
m = dist_spam / n_train_spam

In [21]:
n_word = dist_spam.shape[1]
feature_weights_spam = np.zeros(n_word)
feature_weights_ham = np.zeros(n_word)

for i in range(n_word):
    
    f_prob_spam = dist_spam[0,i] / n_train_spam
    if f_prob_spam != 0.0:
        feature_weights_spam[i] = f_prob_spam
        
    f_prob_ham = dist_ham[0,i] / n_train_ham
    if f_prob_ham != 0.0:
        feature_weights_ham[i] = f_prob_ham        

In [22]:
feature_weights_spam.shape

(8255,)

### Making prediction implementing Naive Bayes model

In [24]:
def spam_predict(sent):
    dot_pr_sp = np.dot(sent.toarray(), feature_weights_spam)
    res_spam = dot_pr_sp * prob_spam
    dot_pr_h = np.dot(sent.toarray(), feature_weights_ham)
    res_ham = dot_pr_h * prob_ham
    if res_spam > res_ham:
        return True
    else:
        return False

In [31]:
pred_train = []
for x in X_train_bow:
    pred_train.append(spam_predict(x))
pred_train = np.array(pred_train)

In [32]:
pred_test = []
for x in X_test_bow:
    pred_test.append(spam_predict(x))
pred_test = np.array(pred_test)

In [34]:
print(y_train.shape, pred_train.shape)
print(y_test.shape, pred_test.shape)

(5016,) (5016,)
(558,) (558,)


In [37]:
sum(y_train == pred_train)/5016

0.87420255183413076

#### Evaluating results

In [41]:
def evaluate(y_true, y_pred, threshold=0.5):
    y_pred_label = y_pred >= threshold
    
    print('Log loss: ', log_loss(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred_label))
    print('Recall:   ', recall_score(y_true, y_pred_label))
    print('F1 score: ', f1_score(y_true, y_pred_label))

Training evaluation

In [42]:
evaluate(y_train, pred_train)

Log loss:  4.34489025207
Precision: 0.95
Recall:    0.0569715142429
F1 score:  0.107496463932


Testing evaluation

In [44]:
evaluate(y_test, pred_test)

Log loss:  4.6423100882
Precision: 0.857142857143
Recall:    0.075
F1 score:  0.137931034483


### Making prediction using Naive Bayes from sklearn library

In [45]:
model = MultinomialNB()

In [46]:
model.fit(X_train_bow, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
y_pred_train = model.predict(X_train_bow)

In [48]:
y_pred_test = model.predict(X_test_bow)

#### Evaluating results

In [50]:
print('Training evaluation')
evaluate(y_train, y_pred_train)
print()
print('Testing evaluation')
evaluate(y_test, y_pred_test)

Training evaluation
Log loss:  0.220344984012
Precision: 0.981790591806
Recall:    0.970014992504
F1 score:  0.975867269985

Testing evaluation
Log loss:  0.371386125389
Precision: 0.986842105263
Recall:    0.9375
F1 score:  0.961538461538
