### Import modules

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Load dataset

In [55]:
# Read the tab-separated SMS corpus. The first line of the file is treated as
# data (no header row), so the two column names are supplied here.
# NOTE(review): pandas' default quote handling can merge rows when a message
# contains an unbalanced '"' - confirm the row count against the raw file.
sms_dataset = pd.read_csv(
    '../dataset/sms-spam-collection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)

In [56]:
# Preview the first five rows (head() defaults to 5) to check the parsed columns.
sms_dataset.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data preprocessing

In [57]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# A plain dict-based .map() turns every value into NaN when this cell is run a
# second time (the column then holds 0/1, which are not keys of the dict), so
# already-encoded values are passed through unchanged to keep the cell idempotent.
label_codes = {'ham': 0, 'spam': 1}
sms_dataset['label'] = sms_dataset['label'].map(lambda value: label_codes.get(value, value))

# Fail fast if the file contained a label other than 'ham' or 'spam'.
assert sms_dataset['label'].isin([0, 1]).all(), "unexpected value in 'label' column"

In [58]:
# Preview the first five rows again to check the 'label' column after encoding.
sms_dataset.head(n=5)

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Split dataset (train & test)

In [59]:
# Hold out part of the messages for evaluation. The split proportion is left at
# train_test_split's default; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sms_dataset['sms_message'],
    sms_dataset['label'],
    random_state=1,
)

# Report how many rows ended up in each partition.
print("Total dataset size: {}".format(len(sms_dataset)))
print("Train dataset size: {}".format(len(X_train)))
print("Test dataset size: {}".format(len(X_test)))

Total dataset size: 5572
Train dataset size: 4179
Test dataset size: 1393


### Apply bag of words (BoW)

In [62]:
# Learn the vocabulary from the training messages only and turn them into a
# document-term count matrix.
count_vector = CountVectorizer()
train_data = count_vector.fit_transform(X_train)

# The test messages are only transformed with the already-fitted vectorizer:
# fitting again would build a different vocabulary, and the two matrices would
# no longer share the same columns.
test_data = count_vector.transform(X_test)

# Both matrices should report the same number of columns.
print(train_data.shape, test_data.shape, sep='\n')

(4179, 7456)
(1393, 7456)


### Naive Bayes model - Sklearn

In [63]:
# Train a multinomial Naive Bayes classifier on the word-count features.
# The fit call stays as the last expression so the fitted estimator is echoed
# as the cell output.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=train_data, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
# Predict a label for every message in the held-out test matrix.
predictions = naive_bayes.predict(X=test_data)

### Model evaluation

In [66]:
# Compare the predicted labels with the true test labels.
# The built-in format() with no format spec is only a str() conversion, and
# passing it as a second print() argument left a stray double space in the
# output; str.format is used instead, matching the size report in the split cell.
print("Accuracy score: {}".format(accuracy_score(y_test, predictions)))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
