In [1]:
#importing libraries

import numpy as np
import pandas as pd
import os
import re
import string
import nltk
from sklearn.model_selection import train_test_split

In [2]:
#read data into dataframe

data = pd.read_csv('spam ham data set.csv', encoding = "ISO-8859-1")
print(data.shape)
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)
print(data.shape)

(5572, 5)
(5572, 2)


In [3]:
#Downloading the files if required.
# nltk.download('punkt')
# nltk.download('stopwords')

In [4]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
#convert text to lowercase

def to_lower(text):
    return text.lower()

In [6]:
#remove digits from the data

def remove_digits(text):
    return re.sub(r'\d+', '', text)

In [7]:
#remove punctuations marks and special symbol

def remove_punctuation(text):
    return text.translate(str.maketrans('','', string.punctuation))

In [8]:
#removing stop words

def remove_stop_words(text):
    tex = ''
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = nltk.tokenize.word_tokenize(text)
    for i in tokens:
        if not i in stop_words:
            tex = tex +" "+ i
    return tex
#     return [i for i in tokens if not i in stop_words]

In [9]:
#preprocessing the data

def preprocess(data):
    lower_data = data.applymap(to_lower)
    rem_digit_data = lower_data.applymap(remove_digits)
    no_punc_data = rem_digit_data.applymap(remove_punctuation)
    rem_stopwords = no_punc_data.applymap(remove_stop_words)
    
    return rem_stopwords.drop_duplicates()

In [10]:
#marking spam as 1 and ham as 0 after preprocessing

clean_data = preprocess(data)
clean_data['v1'] = clean_data['v1'].replace({'spam':1, 'ham':0}, regex=True)

In [11]:
#splitting data into testing and training set

train, test = train_test_split(clean_data, test_size = 0.20)
print(len(train), len(test))

4059 1015


In [12]:
#counting the number of spam and ham in training set

count_spam = train.groupby('v1').count()['v2'].array[1]
count_ham = train.groupby('v1').count()['v2'].array[0]
total = len(train)

In [13]:
#creating the dictionaries and word counts

dictionary = {}
word_dict = {}
word_no = -1

count_spam_words = 0
count_ham_words = 0

for i in range(total):
    s_h = train['v1'].iloc[i]
    words_line = train['v2'].iloc[i].split()
    for word in words_line:
        
        if word not in word_dict:
            word_no += 1
            word_dict[word] = word_no
        
        if s_h == 0:
            try:
                dictionary[word][0] += 1
            except KeyError as e:
                count_ham_words += 1
                dictionary[word] = [1, 0]    
        
        elif s_h == 1:
            try:
                dictionary[word][1] += 1
            except KeyError as e:
                count_spam_words += 1
                dictionary[word] = [0, 1]

In [14]:
#calculating the probability of different words

for item in list(dictionary):
    dictionary[item][0] = (dictionary[item][0] + 1) / (count_ham_words + len(word_dict))
    dictionary[item][1] = (dictionary[item][1] + 1) / (count_spam_words + len(word_dict))

In [15]:
#returns the vector for a given sentence

def vectorize(sentence, word_dict):
    vector = [0] * len(word_dict)
    for word in sentence:
        if word in word_dict:
            index = word_dict[word]
            vector[index] += 1
    return vector

In [16]:
#probability of spam and ham in training data

prob_spam = count_spam/total
prob_ham = count_ham/total

In [17]:
#Predicting if the mails in test set are spam or ham using the calculated probabilities

pred_ham = 0
pred_spam = 0

pred_correct = 0

word_list = list(dictionary.keys())

for i in range(len(test)):
    prob_data_spam = 1.0
    prob_data_ham = 1.0
    
    words_line = test['v2'].iloc[i].split()
    correct_output = test['v1'].iloc[i]
    
    vector = vectorize(words_line, word_dict)
    
    for i,j in enumerate(vector):
        word = word_list[i]
        if j == 0:
            prob_data_spam *= (1 - dictionary[word][1])
            prob_data_ham *= (1 - dictionary[word][0])
        elif j == 1:
            prob_data_spam *= dictionary[word][1]
            prob_data_ham *= dictionary[word][0]

    total_prob = (prob_ham * prob_data_ham) + (prob_spam * prob_data_spam)

    prob_ham_data = (prob_data_ham * prob_ham)/ total_prob
    prob_spam_data = (prob_data_spam * prob_spam)/ total_prob
    
    if prob_ham_data > prob_spam_data:
        pred_ham += 1
        if correct_output == 0:
            pred_correct += 1
    else :
        pred_spam += 1
        if correct_output == 1:
            pred_correct += 1

In [18]:
accuracy = (pred_correct/len(test))*100
print("The accuracy of the model is {0:.2f} %".format(accuracy))

The accuracy of the model is 97.44 %


In [19]:
act_ham = test.groupby('v1').count()['v2'].array[0]
act_spam = test.groupby('v1').count()['v2'].array[1]
print("Spam : ", act_spam," detected : ", pred_spam)
print("Ham  : ", act_ham,"detected : ", pred_ham)

Spam :  122  detected :  106
Ham  :  893 detected :  909
