In [1]:
import pandas as pd

# Import the spam dataset
email_data = pd.read_csv('/home/jack/SMSSpamCollection.csv', sep='\t', names=['Class', 'Message'])

email_data

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [2]:
# Amount of spam and non-spam emails
email_data['Class'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Class, dtype: float64

In [3]:
# We're going to split our data set so we can test
# the model later with data it hasn't already seen

# training_emails = 70% of the data
# testing_emails = 30% of the data
from sklearn.model_selection import train_test_split
train_emails, test_emails = train_test_split(email_data, test_size=0.3, random_state=42)

In [4]:
not_spam = train_emails['Class'].value_counts(normalize=True)[0]
spam = train_emails['Class'].value_counts(normalize=True)[1]

In [5]:
# Removes punctuation
train_emails['Message'] = train_emails['Message'].str.replace('\W', ' ')
# Makes message all lowercase
train_emails['Message'] = train_emails['Message'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [6]:
train_emails['Message'] = train_emails['Message'].str.split()

vocabulary = []
for message in train_emails['Message']:
    for word in message:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
len(vocabulary)

7301

In [8]:
word_counts_per_sms = {unique_word: [0] * len(train_emails['Message']) for unique_word in vocabulary}

for index, message in enumerate(train_emails['Message']):
    for word in message:
        word_counts_per_sms[word][index] += 1

In [9]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head(10)

Unnamed: 0,every,sao,fusion,difficulties,working,nalli,room,will,cleaning,09064012160,...,waking,09056242159,inches,rate,pics,paru,sagamu,adewale,08718738002,nahi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Reset the index of the dataframes we'll be concatinating
train_emails.reset_index(drop=True, inplace=True)
word_counts.reset_index(drop=True, inplace=True)

In [12]:
emails_and_counts = pd.concat([train_emails, word_counts], axis=1)

In [15]:
emails_and_counts.head()

Unnamed: 0,Class,Message,every,sao,fusion,difficulties,working,nalli,room,will,...,waking,09056242159,inches,rate,pics,paru,sagamu,adewale,08718738002,nahi
0,ham,"[quite, late, lar, ard, 12, anyway, i, wun, b,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[on, a, tuesday, night, r, u, 4, real]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[go, chase, after, her, and, run, her, over, w...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[g, says, you, never, answer, your, texts, con...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[still, work, going, on, it, is, very, small, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Isolating spam and ham messages first
spam_messages = emails_and_counts[emails_and_counts['Class'] == 'spam']
ham_messages = emails_and_counts[emails_and_counts['Class'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(emails_and_counts)
p_ham = len(ham_messages) / len(emails_and_counts)

# N_Spam
n_words_per_spam_message = spam_messages['Message'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['Message'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [17]:
# Initialize parameters
spam_parameters = {unique_word:0 for unique_word in vocabulary}
ham_parameters = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    # Number of unique words in spam messages
    n_word_given_spam = spam_messages[word].sum()
    # Probability a certain vocab word will be in a spam message
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    spam_parameters[word] = p_word_given_spam

    # Number of unique words in non-spam messages
    n_word_given_ham = ham_messages[word].sum()
    # Probability a certain vocab word will be in a non-spam message
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    ham_parameters[word] = p_word_given_ham

In [28]:
import re

def NaiveBayesClassifier(message):

    # Remove the punctuation using the re.sub() function
    message = re.sub('\W', ' ', message)
    # Bring all letters to lower case using the str.lower() method
    message = message.lower().split()

    # Calculate p_spam_given_message and p_ham_given_message 
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham: 
            p_ham_given_message *= parameters_ham[word]

    print('Spam Probability:', p_spam_given_message)
    print('Ham Probability:', p_ham_given_message)

    # Compare p_spam_given_message with p_ham_given_message
    # then print a classification label. 
    if p_ham_given_message > p_spam_given_message:
        print('Email Type: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Email Type: Spam')
    else:
        print('Email Type: Equal probabilities spam or ham')

In [29]:
NaiveBayesClassifier('WINNER! FREE 1000 dollar gift card! Just enter your social security number and birthday!')

Spam Probability: 3.746381913822179e-43
Ham Probability: 2.7803438912906147e-47
Email Type: Spam


In [30]:
NaiveBayesClassifier("That article I just read on Naive Bayes Classifiers was so cool")

Spam Probability: 1.2220427379964892e-26
Ham Probability: 5.581785151765721e-21
Email Type: Ham


In [33]:
def NB_test(message):

    # Remove the punctuation with re.sub()
    message = re.sub('\W', ' ', message)
    # Make all letters lower case with str.lower()
    message = message.lower().split()

    # Calculate p_spam_given_message and p_ham_given_message 
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    # Compare p_spam_given_message with p_ham_given_message
    # then RETURN a classification label instead of printing             
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'equal'

In [37]:
test_emails['Predicted'] = test_emails['Message'].apply(NB_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
test_emails

Unnamed: 0,Class,Message,predicted,Predicted
3245,ham,Squeeeeeze!! This is christmas hug.. If u lik ...,ham,ham
944,ham,And also I've sorta blown him off a couple tim...,ham,ham
1044,ham,Mmm thats better now i got a roast down me! i...,ham,ham
2484,ham,Mm have some kanji dont eat anything heavy ok,ham,ham
812,ham,So there's a ring that comes with the guys cos...,ham,ham
...,...,...,...,...
2505,ham,"Hello, my boytoy! I made it home and my consta...",ham,ham
2525,spam,FREE entry into our £250 weekly comp just send...,spam,spam
4975,ham,Aiyo u so poor thing... Then u dun wan 2 eat? ...,ham,ham
650,spam,"You have won ?1,000 cash or a ?2,000 prize! To...",spam,spam


In [38]:
correctly_predicted_messages = 0
total_messages = test_emails.shape[0]

# For each row in our test set
for row in test_emails.iterrows():
    row = row[1]
    # If the class label is equal to the label predicted
    if row['Class'] == row['predicted']:
        # Increment the number of correctly predicted messages by 1
        correctly_predicted_messages += 1

print('Correctly Classified Emails:', correctly_predicted_messages)
print('Incorrectly Classified Emails:', total_messages - correctly_predicted_messages, '\n')
print('Accuracy:', correctly_predicted_messages / total_messages)

Correctly Classified Emails: 1655
Incorrectly Classified Emails: 17 

Accuracy: 0.9898325358851675
