<a href="https://colab.research.google.com/github/NguyenDa18/MachineLearning_HW6/blob/master/Spam%20Naive%20Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random
import math

def mean(numbers):
    return sum(numbers) / float(len(numbers))

# make training and test datasets
def split_dataset(dataset, split_ratio):
    train_size = int(len(dataset) * split_ratio)
    train_set = []
    copy = list(dataset)
    while (len(train_set) < train_size):
        index = random.randrange(len(copy))
        train_set.append(copy.pop(index))
    return [train_set, copy]

# Spam Classifier
- Column 1 = feature vector instance number
- Column 2 = {-1,1} whether message is spam (1) or not (-1)
- Column 3 = feature vector, 344 chars long where each char is a unique word or an email feature that would identify email as spam/non-spam. Boolean of whether or not feature occurred in email (1 = yes, 0 = no)

- [Source](https://www.youtube.com/watch?v=8aZNAmWKGfs)

# 1) Process txt data and split into test and train sets

In [57]:
import pandas as pd
import numpy as np

df_j = pd.read_csv('./SpamInstances.txt', header=None)

# Divide each row into arrs
df_j = df_j[0].str.split(" ")[1:]
for row in range(len(df_j)):
    df_j.iloc[row] = [int(el) for el in df_j.iloc[row]]
    
    
df_j, df_j_test = split_dataset(df_j, 0.80)

print(len(df_j))
print(len(df_j_test))

print(df_j_test[:6])


12397
3100
[[1009, -1, 1001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000], [101, -1, 100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000], [1014, -1, 1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

# 2) Calculate Priors
- total num. SPAM & num. HAM in test dataset
- probability of SPAM & HAM for each feature in a vector

In [0]:
# Count number of times feature occurred in message
def calc_feature_info(str):
    occurred = str.count('1')
    non_occurred = str.count('0')
    return occurred, non_occurred

In [0]:
# Create our Spam / Non-Spam data structs
# SPAM = number of total data considered spam
# HAM = number of total data considered ham
# spam_feature = number of occurrences of feature in vector
# ham_feature = number of occurrences of non-spam feature in vector
def preprocess_test_data():
  SPAM = []
  HAM = []
  spam_feature = {}
  ham_feature = {}

  for row in range(len(df_j)):
      if (df_j[row][1] == 1):
          SPAM += [df_j[row]]
      elif (df_j[row][1] == -1):
          HAM += [df_j[row]]
      else:
          print(df_j.iloc[row])
        
  for row in range(len(df_j)):
      vector = str(df_j[row][2])
      occurred, non_occurred = calc_feature_info(vector)
      spam_feature[df_j[row][0]] = occurred / len(vector)
      ham_feature[df_j[row][0]] = non_occurred / len(vector)
        
  num_SPAM = len(SPAM) # 6712
  num_HAM = len(HAM) # 8785
  
  return num_SPAM, num_HAM

In [0]:
def find_priors():
  ham_probs = [0] * 334
  spam_probs = [0] * 334

  for row in range(len(df_j)):
      vector = str(df_j[row][2])
      for i, ltr in enumerate(vector):
          if (ltr == '1'):
              spam_probs[i] += 1
          elif (ltr == '0'):
              ham_probs[i] += 1

  ham_probs = [x / num_HAM for x in ham_probs]
  spam_probs = [x / num_SPAM for x in spam_probs]

  return ham_probs, spam_probs

In [0]:
# USE Naive Bayes to detect spam/ham
def classify_data(data):
  ham_probs, spam_probs = find_priors()
  spam_probabilities = []
  ham_probabilities = []
  for i, ltr in enumerate(str(data)):
    if ltr == '1':
      spam_probabilities += [spam_probs[i]]
      ham_probabilities += [ham_probs[i]]
    elif ltr == '0':
      spam_probabilities += [1 - spam_probs[i]]
      ham_probabilities += [1 - ham_probs[i]]
      
  # probability product for each feature vector that => SPAM
  spam_result = np.prod(np.array(spam_probabilities))
  
  # probability product for each feature vector that => HAM
  ham_result = np.prod(np.array(ham_probabilities))
  return spam_result, ham_result

In [0]:
def test_spam_naive_bayes(vector):
  spam_result, ham_result = classify_data(str(vector))
  
  # P(feature1 | SPAM) * P(feature2 | SPAM) * ... * P(featureN | SPAM) * P(SPAM) /
  # P(feature1 | SPAM) * P(feature2 | SPAM) * ... * P(featureN | SPAM) * P(SPAM) + P(feature1 | HAM) * P(feature2 | HAM) * ... * P(featureN | HAM) * P(HAM)
  
  num_SPAM, num_HAM = preprocess_test_data()
  P_SPAM = num_SPAM / len(df_j) # 0.43311608698457765
  P_HAM = num_HAM / len(df_j) # 0.5668839130154223
  
  is_spam = (spam_result * P_SPAM) / ((spam_result * P_SPAM) + (ham_result * P_HAM))
  is_ham = (ham_result * P_HAM) / ((spam_result * P_SPAM) + (ham_result * P_HAM))
  return is_spam, is_ham

In [79]:
vector = '1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
test_spam_naive_bayes(vector)

(1.0, 2.5529159935480078e-80)

In [85]:
for i in range(len(df_j_test[:10])):
  is_spam, is_ham = test_spam_naive_bayes(i)
  print(is_spam, is_ham)

1523.170686456151 -1522.170686456151
0.9132568479438559 0.08674315205614415
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
0.43413729128014844 0.5658627087198516
