In [1]:
# Libraries

import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Implementing Bag of Words from scratch - just for learning
import pprint  
from collections import Counter
#Implementing Bag of Words in scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
#Training and testing sets
from sklearn.model_selection import train_test_split     #cross selection is not active anymore
#Naive Bayes implementation using scikit-learn
from sklearn.naive_bayes import MultinomialNB
#Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Importing data: load the SMS dataset and encode its labels numerically.

# NOTE(review): this is an absolute, machine-specific path. Point DATA_PATH at
# your own copy of the dataset (ideally a path relative to the notebook)
# before running the notebook on another machine.
DATA_PATH = 'C://Users//TheGhost//Desktop//data.txt'

# The file is tab-separated and has no header row, so '\t' is passed as the
# 'sep' argument and the column names are supplied explicitly.
data = pd.read_csv(DATA_PATH, sep='\t', header=None, names=['label', 'sms_message'])

# Change the text labels into numerical values via a mapping: ham -> 0, spam -> 1.
data['label'] = data.label.map({'ham': 0, 'spam': 1})

# Series.map() yields NaN for any value that is missing from the mapping;
# fail loudly here instead of handing NaN labels to the model later on.
assert data['label'].notna().all(), "unexpected label value: expected only 'ham' or 'spam'"

# Preview the first rows (the last expression of the cell is what gets displayed).
data.head()



Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Implementing Bag of Words from scratch - just for learning.
# Each step produces a new list with one entry per SMS message, in the same
# order as data['sms_message'].

# Step 1: Convert all strings to their lower case form.
data_lower_case = [message.lower() for message in data['sms_message']]

# Step 2: Remove all punctuation, i.e. every character that is neither a
# word character (\w) nor whitespace (\s).
data_remove_punctuations = [re.sub(r'[^\w\s]', '', message) for message in data_lower_case]

# Step 3: Tokenization - split each message into a list of word tokens.
data_Tokenization = [word_tokenize(message) for message in data_remove_punctuations]

# Step 4: Stop word removal.
# The stop word list is loaded once and stored in a set: re-reading it for
# every single token (as a list, scanned linearly) is loop-invariant work
# that makes this step needlessly slow. Membership results are unchanged.
english_stop_words = set(stopwords.words('english'))
data_stop_words_removal = [
    [token for token in tokens if token not in english_stop_words]
    for tokens in data_Tokenization
]

# Step 5: Count frequencies - one Counter (token -> number of occurrences)
# per message.
frequency_list = [Counter(tokens) for tokens in data_stop_words_removal]
    
    


In [4]:
# Partition the messages and their labels into training and testing sets.
# random_state is pinned so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['sms_message'], data['label'], random_state=1
)

# Report how many rows ended up in each partition.
print('Number of rows in the total set: {}'.format(len(data)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))


Number of rows in the total set: 5574
Number of rows in the training set: 4180
Number of rows in the test set: 1394


In [5]:
# Bag of Words with scikit-learn's CountVectorizer.

# Create the vectorizer, configured with the built-in English stop word list.
count_vector = CountVectorizer(stop_words='english')

# Fit on the training messages only and get back their count matrix.
training_data = count_vector.fit_transform(X_train)

# The test messages are only transformed, never fitted: they are encoded with
# the vectorizer that was fitted on the training set above.
testing_data = count_vector.transform(X_test)



In [6]:
# Multinomial Naive Bayes classifier from scikit-learn.

# fit() returns the estimator itself, so construction and training can be
# chained into a single assignment.
naive_bayes = MultinomialNB().fit(training_data, y_train)

# Predict a label for every message in the test set.
predictions = naive_bayes.predict(testing_data)



In [7]:
# Evaluating the model: compare the predictions with the true test labels
# using four standard classification metrics.
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, predictions)))

# Values recorded from a previous run, kept for reference:
#Accuracy score:  0.9928263988522238
#Precision score:  0.9777777777777777
#Recall score:  0.967032967032967
#F1 score:  0.9723756906077348

Accuracy score:  0.9928263988522238
Precision score:  0.9777777777777777
Recall score:  0.967032967032967
F1 score:  0.9723756906077348
