1 - Imports

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

2 - Extract the data from the file

In [2]:
def extract_data(message_file, encoding='utf-8'):
    """Read a labelled message file into a DataFrame.

    Each useful line starts with the label ``ham`` or ``spam``, followed by
    whitespace and the message text.

    Parameters
    ----------
    message_file : str or path-like
        Path of the text file to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. The previous implementation
        relied on the platform default; pass another value here if the file
        is not UTF-8.

    Returns
    -------
    pandas.DataFrame
        Columns ``Message`` (text without the label or the trailing newline)
        and ``Spam`` (1 for spam, 0 for ham), one row per labelled line.

    Notes
    -----
    Blank lines and lines whose first token is neither ``ham`` nor ``spam``
    are skipped, because no label can be assigned to them.
    """
    labels = {'ham': 0, 'spam': 1}
    records = []

    with open(message_file, 'r', encoding=encoding) as file:
        for line in file:
            # Strip only the line terminator, so a final line without a
            # newline keeps its last character.
            parts = line.rstrip('\n').split(None, 1)
            if not parts or parts[0] not in labels:
                continue

            # Splitting on the first whitespace run removes the label whatever
            # its length, instead of relying on hard-coded slice offsets.
            message = parts[1] if len(parts) > 1 else ''
            records.append({'Message': message, 'Spam': labels[parts[0]]})

    # Build the frame once: appending row by row is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['Message', 'Spam'])

# Load the labelled messages into a DataFrame and preview the first rows.
data = extract_data('messages.txt')
data.head()

Unnamed: 0,Message,Spam
0,Yup i've finished c Ã¼ there...,0
1,Remember to ask alex about his pizza,0
2,No da..today also i forgot..,0
3,Ola would get back to you maybe not today but ...,0
4,Fwiw the reason I'm only around when it's time...,0


3 - Split the data into training and test sets

In [3]:
def divide_data(data, train_frac=0.8, random_state=None):
    """Randomly split a DataFrame into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Its index labels should be unique: the test part is
        obtained by dropping the sampled training labels from ``data``.
    train_frac : float, default 0.8
        Fraction of the rows sampled into the training part.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. Pass an integer to make the
        split reproducible; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, each with a fresh 0..n-1 index.
    """
    df_train = data.sample(frac=train_frac, random_state=random_state)
    df_test = data.drop(df_train.index)

    # drop=True discards the old index directly. The previous
    # reset_index() + drop(columns='index') removed a genuine column named
    # 'index' and raised KeyError when the index had a name.
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True)

# Split with the default training fraction (0.8), then show the shape of each part.
df_train, df_test = divide_data(data)

print(df_train.shape)
print(df_test.shape)

(4000, 2)
(1000, 2)


4 - Generate a dictionary from the training data

In [4]:
def make_dictionary(df, nb_words=2000):
    """Build the vocabulary of the most frequent useful tokens in ``df['Message']``.

    Messages are split on whitespace and tokens are counted case-sensitively.
    A token is kept only if it is purely alphabetic (so tokens containing
    digits or punctuation are dropped), longer than one character, and not a
    stop word once lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Message`` column of strings.
    nb_words : int, default 2000
        Maximum number of entries to return.

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs, most frequent first.
    """
    # Stop words: very common words that carry no signal for spam detection
    # (see https://www.geeksforgeeks.org/removing-stop-words-nltk-python/)
    ignored = frozenset([
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
        'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
        'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is',
        's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
        'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were',
        'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
        'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before',
        'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
        'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now',
        'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
        'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my',
        'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than',
    ])

    def is_useful(token):
        # Alphabetic only, at least two letters, and not a stop word
        return token.isalpha() and len(token) > 1 and token.lower() not in ignored

    # Count only the tokens that pass the filter, in order of first appearance
    tally = Counter(
        token
        for message in df['Message']
        for token in message.split()
        if is_useful(token)
    )

    # Keep the *nb_words* most common tokens
    return tally.most_common(nb_words)
              
# Build the vocabulary from the training messages only (at most 1500 entries)
# and show its four most frequent (word, count) pairs.
dictionary = make_dictionary(df_train, nb_words=1500)
print(dictionary[:4])

[('call', 273), ('get', 231), ('ur', 213), ('go', 168)]


5 - Extract features from the data sets

In [5]:
def extract_features(df, dictionary, multi=True):
    """Replace the ``Message`` column of ``df`` by one count column per dictionary word.

    The frame is modified in place: a column is added for every dictionary
    word and the ``Message`` column is dropped. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Message`` column of strings (plus any other columns,
        e.g. ``Spam``, which are left untouched).
    dictionary : sequence of (str, int)
        Vocabulary as produced by ``Counter.most_common``; only the word
        (first item of each pair) is used.
    multi : bool, default True
        If True, each feature is the number of occurrences of the word in the
        message; if False, it is 1 when the word occurs and 0 otherwise.

    Notes
    -----
    Dictionary words that are already column names of ``df`` (for example
    ``'Spam'`` or ``'Message'``) are skipped, so a feature can never overwrite
    the label or the text column.
    """
    reserved = set(df.columns)
    # dict.fromkeys keeps the first occurrence of each word, in order
    words = list(dict.fromkeys(
        entry[0] for entry in dictionary if entry[0] not in reserved
    ))
    column_of = {word: col for col, word in enumerate(words)}

    # Tokenise each message once and look every token up in O(1), instead of
    # scanning the whole dictionary for every token of every row.
    counts = np.zeros((len(df), len(words)), dtype=np.int64)
    for row_pos, message in enumerate(df['Message']):
        for word, count in Counter(message.split()).items():
            col = column_of.get(word)
            if col is not None:
                counts[row_pos, col] = count if multi else 1

    # Add the feature columns to the caller's frame (in-place contract)
    for col, word in enumerate(words):
        df[word] = counts[:, col]

    # Remove 'Message' column
    df.drop(columns='Message', inplace=True)

# extract_features works in place: it adds one column per dictionary word and
# drops 'Message'. Re-running this cell on the same frames therefore fails
# (the 'Message' column is gone) -- re-run the split cell first.
extract_features(df_train, dictionary)
extract_features(df_test, dictionary)
df_train.head()

Unnamed: 0,Spam,call,get,ur,go,know,like,got,come,want,...,returns,however,Chinese,insurance,Msgs,gentle,juicy,Convey,hip,kb
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


6 - Implement the fit function of the Naive Bayes algorithm

In [6]:
def fit(df):
    """Estimate the Naive Bayes parameters from a feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per message, with a 0/1 ``Spam`` column; every other column is
        treated as a numeric word feature.

    Returns
    -------
    theta_n : list of two pandas.Series
        ``theta_n[0]`` (ham) and ``theta_n[1]`` (spam), indexed by feature
        name. Each value is ``(sum of the feature over the class + 1) /
        (number of messages of the class + 2)``.
    theta_y : float
        Fraction of spam messages in ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no rows (the class prior would be 0/0).

    Notes
    -----
    When the features are occurrence counts rather than 0/1 indicators, the
    numerator counts occurrences while the denominator counts messages, so a
    very frequent word can get a value above 1.
    """
    n_messages = df.shape[0]
    if n_messages == 0:
        raise ValueError('fit() needs at least one training message')

    is_spam = df['Spam'].to_numpy(dtype=float)
    features = df.drop(columns='Spam')
    counts = features.to_numpy(dtype=float)

    nb_spam = is_spam.sum()
    nb_ham = n_messages - nb_spam

    # Class prior P(spam)
    theta_y = nb_spam / n_messages

    # Per-class feature totals for all words at once (row vector @ matrix)
    spam_totals = is_spam @ counts
    ham_totals = (1 - is_spam) @ counts

    # theta_n | y = 0 AND theta_n | y = 1, with add-one smoothing
    theta_n = [
        pd.Series((ham_totals + 1) / (nb_ham + 2), index=features.columns),
        pd.Series((spam_totals + 1) / (nb_spam + 2), index=features.columns),
    ]

    return theta_n, theta_y

# Estimate the per-class word parameters and the spam prior on the training set.
theta_n, theta_y = fit(df_train)

7 - Implement the predict function of the Naive Bayes algorithm

In [7]:
def predict(X, theta_n, theta_y):
    """Classify every row of ``X`` as spam (1) or ham (0).

    For each class the score is the class prior times the parameters of the
    words present in the message; the class with the higher score wins and
    ties go to spam. Scores are accumulated as sums of logarithms, so rows
    with many active features cannot underflow to 0.0 for both classes (which
    the ``>=`` tie rule would then turn into a spam prediction).

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame, one column per word. An existing
        ``Prediction`` column is ignored and overwritten.
    theta_n : sequence of two pandas.Series
        Per-word parameters for ham (``theta_n[0]``) and spam
        (``theta_n[1]``), indexed by feature name.
    theta_y : float
        Prior probability of spam.

    Returns
    -------
    pandas.Series
        The ``Prediction`` column of ``X`` (0 or 1 per row).

    Notes
    -----
    As before, the result is also stored in ``X['Prediction']``, i.e. the
    frame passed in is modified.
    """
    features = X.drop(columns='Prediction', errors='ignore')
    counts = features.to_numpy(dtype=float)

    # log(0) = -inf is a valid score here (a prior of exactly 0 or 1)
    with np.errstate(divide='ignore', invalid='ignore'):
        log_priors = (np.log(1 - theta_y), np.log(theta_y))
        scores = []
        for label in (0, 1):
            theta = theta_n[label].reindex(features.columns).to_numpy(dtype=float)
            # A term takes part only when count * theta is non-zero, the same
            # rule as the former element-wise product. The count itself is
            # left out: it is identical for both classes and cancels in the
            # comparison.
            active = (counts * theta) != 0
            log_terms = np.where(active, np.log(theta), 0.0)
            scores.append(log_priors[label] + log_terms.sum(axis=1))

    # Conclude with prediction: whichever class has the highest score
    X['Prediction'] = (scores[1] >= scores[0]).astype(np.int64)

    return X['Prediction']

# Predict on the test features; drop() passes a copy without the 'Spam' label,
# so df_test itself is not modified by predict().
prediction = predict(df_test.drop(columns='Spam'), theta_n, theta_y)
            

8 - Compute the confusion matrix

In [8]:
# Rows are the true labels (Spam), columns are the predicted labels.
conf_mat = pd.crosstab(df_test['Spam'], prediction)
conf_mat

Prediction,0,1
Spam,Unnamed: 1_level_1,Unnamed: 2_level_1
0,791,63
1,8,138


In [9]:
# Accuracy in percent: the two diagonal cells (correct predictions) over the total.
# conf_mat[c][r] selects column c (predicted label), then row r (true label).
accuracy = 100 * (conf_mat[0][0] + conf_mat[1][1]) / conf_mat.sum().sum()
print('We have good results. The accuracy is {}%, which is pretty good. However, we have a lot of False positives.'.format(accuracy))

We have good results. The accuracy is 92.9%, which is pretty good. However, we have a lot of False positives.
