In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS corpus; the file has no header row, so the two
# columns are named explicitly.
# NOTE(review): the default CSV quoting may merge rows when a message contains
# an unbalanced double quote - confirm the row count against the dataset's
# documented size and consider quoting=csv.QUOTE_NONE if they differ.
df = pd.read_csv(
    'Datasets/SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['label', 'sms_messages'],
)
df.head()

Unnamed: 0,label,sms_messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Preprocessing

In [3]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The lookup falls back to the value itself, so re-running this cell on an
# already-encoded column is a no-op. A plain .map(dict) would turn the existing
# 0/1 values into NaN on a second run. Unrecognised labels are left unchanged.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Class balance: number of messages per encoded label.
label_counts = df['label'].value_counts()
label_counts

0    4825
1     747
Name: label, dtype: int64

In [5]:
# Toy corpus used to walk through the bag-of-words steps by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: normalise case so that 'Hello' and 'hello' become the same token.
lower_case_documents = [doc.lower() for doc in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [6]:
import string

# Step 2: strip punctuation. A single translation table, built once, deletes
# every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
sans_punctuation_documents = [
    doc.translate(punctuation_table) for doc in lower_case_documents
]

print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [7]:
# Step 3: tokenise each cleaned document on single space characters.
# split(' ') is kept as-is: unlike split(), it yields empty strings when two
# spaces are adjacent.
preprocessed_documents = [doc.split(' ') for doc in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [8]:
# Same toy corpus as in the earlier walkthrough cell, declared again here.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Lower-case every document.
lower_case_documents = list(map(str.lower, documents))

print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [9]:
sans_punctuation_documents = []

def string_punctuation(s):
    """Return a copy of `s` with every `string.punctuation` character dropped.

    Relies on the `string` module having been imported in an earlier cell.
    """
    def is_kept(ch):
        return ch not in string.punctuation

    return ''.join(filter(is_kept, s))

# Clean every lower-cased document with the helper and append the results, in
# order, to the list initialised at the top of this cell.
sans_punctuation_documents.extend(map(string_punctuation, lower_case_documents))

print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [10]:
# Tokenise on runs of whitespace (argument-less split), one token list per document.
preprocessed_documents = list(map(str.split, sans_punctuation_documents))
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [11]:
import pprint
from collections import Counter

# Step 4: count how often each token occurs, one Counter per document.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for the toy corpus. stop_words='english' enables
# scikit-learn's built-in English stop-word list, so common words are excluded
# from the vocabulary (the feature names printed further down contain only
# 'hello', 'home', 'money', 'tomorrow' and 'win').
count_vector = CountVectorizer(stop_words='english')

In [13]:
# Learn the vocabulary from the raw toy documents; no count matrix is produced
# at this step. fit() returns the vectorizer itself, which is what the cell
# displays.
count_vector.fit(documents)

CountVectorizer(stop_words='english')

In [14]:
# Show the learned vocabulary as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(count_vector, 'get_feature_names_out'):
    vocabulary_terms = count_vector.get_feature_names_out().tolist()
else:
    vocabulary_terms = count_vector.get_feature_names()
vocabulary_terms

['hello', 'home', 'money', 'tomorrow', 'win']

In [15]:
# Count the vocabulary terms in each toy document, then convert the result of
# transform() to a dense array via toarray() so it can be inspected directly.
doc_term_matrix = count_vector.transform(documents)
doc_array = doc_term_matrix.toarray()

In [16]:
# Display the dense count matrix: one row per entry of `documents`, one column
# per vocabulary term.
doc_array

array([[1, 0, 0, 0, 0],
       [0, 1, 1, 0, 2],
       [0, 0, 0, 0, 0],
       [2, 0, 0, 1, 0]], dtype=int64)

In [17]:
# Label the columns of the count matrix with the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()

frequency_matrix = pd.DataFrame(data=doc_array, columns=feature_names)
frequency_matrix

Unnamed: 0,hello,home,money,tomorrow,win
0,1,0,0,0,0
1,0,1,1,0,2
2,0,0,0,0,0
3,2,0,0,1,0


In [18]:
from sklearn.model_selection import train_test_split

# Split messages and labels into train and test partitions using the default
# split proportions; random_state=1 makes the shuffle reproducible.
messages, labels = df['sms_messages'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    random_state=1,
)

# Sanity check on the partition sizes.
print(f'Number of rows in the total set: {df.shape[0]}')
print(f'Number of rows in the training set: {X_train.shape[0]}')
print(f'Number of rows in the testing set: {X_test.shape[0]}')

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the testing set: 1393


In [19]:
# Vectorizer for the real SMS data, created with default settings (unlike the
# toy example, no stop-word list is passed).
count_vect = CountVectorizer()

# Learn the vocabulary from the training messages only and build their
# document-term count matrix in the same call.
training_data = count_vect.fit_transform(X_train)

# Reuse the vocabulary learned from the training set: transform() (not
# fit_transform()) is called so the test messages do not influence it.
testing_data = count_vect.transform(X_test)

### Naive Bayes Implementation using Scikit-learn

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier (default hyper-parameters) on the
# training count matrix; fit() returns the estimator itself.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB()

In [23]:
# Predict a label for every message in the held-out test matrix.
predictions = naive_bayes.predict(testing_data)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compare the predictions with the held-out labels and report each metric as a
# percentage with four decimals. With scikit-learn's default arguments,
# precision, recall and F1 are computed for the positive class, label 1 (spam
# under the encoding applied earlier).
print(f'Accuracy Score: {accuracy_score(y_test, predictions)*100:0.4f}%')
print(f'Precision Score: {precision_score(y_test, predictions)*100:0.4f}%')
print(f'Recall Score: {recall_score(y_test, predictions)*100:0.4f}%')
print(f'F1 Score: {f1_score(y_test, predictions)*100:0.4f}%')

Accuracy Score: 98.8514%
Precision Score: 97.2067%
Recall Score: 94.0541%
F1 Score: 95.6044%
