In [1]:
import pandas as pd
import re  # RegEx

In [2]:
# read the data
data = pd.read_csv('spam_ham_dataset.csv',
                   sep=',',
                   header=0,
                   names=['anon','Label', 'Mail', 'label_num'])

# take a look at the data
print(data.shape)
data.head()

(5171, 4)


Unnamed: 0,anon,Label,Mail,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
# check distribution of classes
data['Label'].value_counts(normalize=True)

ham     0.710114
spam    0.289886
Name: Label, dtype: float64

In [4]:
data.dtypes  # check dtypes

anon          int64
Label        object
Mail         object
label_num     int64
dtype: object

In [5]:
# randomize the data
data_randomized = data.sample(frac=1, random_state=1)

# create split index
train_test_index = round(len(data_randomized) * 0.7)

# split into train and test data
train_data = data_randomized[:train_test_index].reset_index(drop=True)
test_data = data_randomized[train_test_index:].reset_index(drop=True)

# check size of train and test set
print(train_data.shape)
print(test_data.shape)

(3620, 4)
(1551, 4)


In [6]:
# check distribution of classes in training data
train_data['Label'].value_counts(normalize=True)

ham     0.705249
spam    0.294751
Name: Label, dtype: float64

In [7]:
# check distribution of classes in test data
test_data['Label'].value_counts(normalize=True)

ham     0.72147
spam    0.27853
Name: Label, dtype: float64

In [8]:
# look at data before cleaning up
train_data.head(3)

Unnamed: 0,anon,Label,Mail,label_num
0,3430,ham,"Subject: what the heck\r\ndaren ,\r\nnow what ...",0
1,2070,ham,Subject: hilcorp old ocean volume\r\naccording...,0
2,3974,spam,"Subject: jurirne get latest softwares , 99 % s...",1


In [9]:
# clean the data
train_data['Mail'] = train_data['Mail'].str.replace('\W', ' ') # remove punctuation with regular expression
train_data['Mail'] = train_data['Mail'].str.lower()  # make everything lowercase
train_data.head(3)  # look at clean data

  train_data['Mail'] = train_data['Mail'].str.replace('\W', ' ') # remove punctuation with regular expression


Unnamed: 0,anon,Label,Mail,label_num
0,3430,ham,subject what the heck daren now what i ...,0
1,2070,ham,subject hilcorp old ocean volume according t...,0
2,3974,spam,subject jurirne get latest softwares 99 s...,1


In [10]:
# train data as list
train_data['Mail'] = train_data['Mail'].str.split()

# create vocabulary
vocabulary = []
for mail in train_data['Mail']:
   for word in mail:
      vocabulary.append(word)

# use set to deduplicate (as sets allow no duplicates) and change back to list -> easy deduplication
vocabulary = list(set(vocabulary))

In [11]:
# check size of the vocabulary
len(vocabulary)

41198

In [12]:
# create word count 'matrix'
word_counts_per_mail = {unique_word: [0] * len(train_data['Mail']) for unique_word in vocabulary}

for index, mail in enumerate(train_data['Mail']):
   for word in mail:
      word_counts_per_mail[word][index] += 1

In [13]:
# store word count in pd.DataFrame
word_counts = pd.DataFrame(word_counts_per_mail)
word_counts.head()  # take a quick look

Unnamed: 0,acjab,gms,bootstrap,stayed,booth,ces,kishore,msidqi,cartie,investigations,...,sixth,recieves,limit,structure,relaxing,zeil,paragraph,divest,446555,polarogram
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# add the word counts to the train data
train_data_clean = pd.concat([train_data, word_counts], axis=1)
train_data_clean.head()  # take a quick look once again

Unnamed: 0,anon,Label,Mail,label_num,acjab,gms,bootstrap,stayed,booth,ces,...,sixth,recieves,limit,structure,relaxing,zeil,paragraph,divest,446555,polarogram
0,3430,ham,"[subject, what, the, heck, daren, now, what, i...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2070,ham,"[subject, hilcorp, old, ocean, volume, accordi...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3974,spam,"[subject, jurirne, get, latest, softwares, 99,...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1502,ham,"[subject, sitara, patch, a, patch, is, being, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4591,spam,"[subject, archived, great, shots, of, californ...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# separate spam from ham
spam_mails = train_data_clean[train_data_clean['Label'] == 'spam']
ham_mails = train_data_clean[train_data_clean['Label'] == 'ham']

# calculate absolute probabilities for spam and ham (for train_data_clean)
p_spam = len(spam_mails) / len(train_data_clean)
p_ham = len(ham_mails) / len(train_data_clean)

# amount of words of spam in total
n_words_per_spam_mail = spam_mails['Mail'].apply(len)
n_spam = n_words_per_spam_mail.sum()

# amount of words of ham in total
n_words_per_ham_message = ham_mails['Mail'].apply(len)
n_ham = n_words_per_ham_message.sum()

# amount of words in vocabulary
n_vocabulary = len(vocabulary)

# Laplacian smoothing
alpha = 1

In [16]:
# initial params
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# calculate params
for word in vocabulary:
   n_word_given_spam = spam_mails[word].sum() # spam_mails defined above
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_mails[word].sum() # ham_mails defined above
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham

In [17]:
# defining a function to classify mails as spam or ham
def classify(mail):
   mail = re.sub('\W', ' ', mail)  # remove punctuation
   mail = mail.lower().split()  # make everything lowercase and split into a list

   # probability of being given a spam or ham mail
   p_spam_given_mail = p_spam
   p_ham_given_mail = p_ham

   # calculate probability of spam or ham given the mail
   for word in mail:
      if word in parameters_spam:
         p_spam_given_mail *= parameters_spam[word]

      if word in parameters_ham: 
         p_ham_given_mail *= parameters_ham[word]

   # print probabilities
   print(f'P(spam|mail): {p_spam_given_mail}')
   print(f'P(ham|mail): {p_ham_given_mail}')

   # print class
   if p_ham_given_mail > p_spam_given_mail:
      print('This is not spam, but ham!')
   elif p_ham_given_mail < p_spam_given_mail:
      print('This is spam!')
   else:
      print('Cannot classify!')  # classification is not possible if the probabilities are equal

In [18]:
# using the classify function for examples
classify('Hey, do you want to come hang out?')

P(spam|mail): 1.324865032925075e-26
P(ham|mail): 3.453417684382185e-26
This is not spam, but ham!


In [19]:
classify('Hey, you won a car! Just call us and pick it up!')

P(spam|mail): 5.362055682493099e-38
P(ham|mail): 2.86705569318837e-38
This is spam!


In [20]:
# defining a classification function for the test data to evaluate the spam detection
def classify_test_data(mail):
   mail = re.sub('\W', ' ', mail)  # remove punctuation
   mail = mail.lower().split()  # set everything lowercase and split into a list

   # probability of being a spam or ham mail
   p_spam_given_mail = p_spam
   p_ham_given_mail = p_ham

   # calculate probabilities for spam or ham given the mail
   for word in mail:
      if word in parameters_spam:
         p_spam_given_mail *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_mail *= parameters_ham[word]

   # return classification, no print needed here!
   if p_ham_given_mail > p_spam_given_mail:
      return 'ham'
   elif p_spam_given_mail > p_ham_given_mail:
      return 'spam'
   else:
      return 'Cannot classify!'  # in case of equal probabilities

In [21]:
# apply classify_test_data() on test data
test_data['predicted'] = test_data['Mail'].apply(classify_test_data)
test_data.head()

Unnamed: 0,anon,Label,Mail,label_num,predicted
0,4865,spam,Subject: something unusual\r\njust her size . ...,1,spam
1,552,ham,Subject: during / after hours contact informat...,0,ham
2,2311,ham,"Subject: hpl noms - dec . 30 - 31 , 2000 and j...",0,ham
3,3045,ham,Subject: account activated\r\nthank you for re...,0,Cannot classify!
4,821,ham,Subject: may nom\r\ndaren the stuff outlined i...,0,Cannot classify!


In [22]:
tp = 0  # true positive (here: identified spam as spam)
tn = 0  # true negative (here: identified ham as ham)
fp = 0  # false positive (here: identified ham as spam)
fn = 0  # false negative (here: identified spam as ham)
no_classification = 0

total = test_data.shape[0]  # amount of predictions

# calculate tp, tn, fp, fn
for row in test_data.iterrows():  # iterate over all rows
   row = row[1]
   if row['Label'] == row['predicted']:
      if row['predicted'] == 'spam':
         tp += 1
      elif row['predicted'] == 'ham':
         tn += 1
      else:
         pass
   elif row['predicted'] == 'spam':
      fp += 1
   elif row['predicted'] == 'ham':
      fn += 1
   elif row['predicted'] == 'Cannot classify!':
      no_classification += 1
   else:
      pass

print(f'true positive: {tp}')
print(f'true negative: {tn}')
print(f'false positive: {fp}')
print(f'false negative: {fn}')
print(f'Cannot classify: {no_classification}')
print(f'total predictions: {total}')

# calculate accuracy without no class in total
correct = tp + tn
acc_without_no_class = correct/(total-no_classification)
# print accuracy
print(f'Accuracy with (total-no_classification): {acc_without_no_class}')

# calculate accuracy
correct = tp + tn
acc = correct/total
# print accuracy
print(f'Accuracy: {acc}')

# calculate precision
prec = tp / (tp + fp)
# print precision
print(f'Precision: {prec}')

# calculate recall
rec = tp / (tp + fn)
# print recall
print(f'Recall: {rec}')

# calculate F1 score
f1 = (2 * prec * rec) / (prec + rec)
# print f1 score
print(f'F1 score: {f1}')

true positive: 220
true negative: 619
false positive: 2
false negative: 25
Cannot classify: 685
total predictions: 1551
Accuracy with (total-no_classification): 0.9688221709006929
Accuracy: 0.5409413281753708
Precision: 0.990990990990991
Recall: 0.8979591836734694
F1 score: 0.9421841541755889
