In [1]:
import csv
import random
import math

def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# make training and test datasets
def split_dataset(dataset, split_ratio):
    train_size = int(len(dataset) * split_ratio)
    train_set = []
    copy = list(dataset)
    while (len(train_set) < train_size):
        index = random.randrange(len(copy))
        train_set.append(copy.pop(index))
    return [train_set, copy]


def separate_by_class(dataset):
    separated = []
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    return summaries

def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = {}
    for class_value, instances in separated.iteritems():
        summaries[class_value] = summarize(instances)
        
    return summaries

# Spam Classifier
- Column 1 = feature vector instance number
- Column 2 = {-1,1} whether message is spam (1) or not (-1)
- Column 3 = feature vector, 344 chars long where each char is a unique word or an email feature that would identify email as spam/non-spam. Boolean of whether or not feature occurred in email (1 = yes, 0 = no)

- [Source](https://www.youtube.com/watch?v=8aZNAmWKGfs)

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

df_j = pd.read_csv('./SpamInstances.txt', header=None)

# Divide each row into arrs
df_j = df_j[0].str.split(" ")[1:]
for row in range(len(df_j)):
    df_j.iloc[row] = [int(el) for el in df_j.iloc[row]]


In [12]:
print(sorted(df_j))

[[1, 1, 1000000000000000000000000100000000000001000100000010000000000000100000000000000000000100000000000000000010000001000000001000000000000000000000000000000000000000000000000100000111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000], [2, -1, 1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000], [2, 1, 100100101000001000100000000000010000100001100000000000010000000000100010000001000000000000100000010000000001000000000100000000100000000000000000000000000000000000000000000010010000010000000000000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [44]:
def get_feature_indexes(str, boolean):
    return [i for i, ltr in enumerate(str) if ltr == boolean]

ham_probs = [0] * 334
spam_probs = [0] * 334

for row in range(len(df_j)):
    vector = str(df_j.iloc[row][2])
    for i, ltr in enumerate(vector):
        if (ltr == '1'):
            spam_probs[i] += 1
        elif (ltr == '0'):
            ham_probs[i] += 1
            

            
print("HAM PROBS")
print(ham_probs)
print("SPAM PROBS")
print(spam_probs)
# print(spam_count)
# print(ham_count)
            
            
    
# feature_present = get_feature_indexes(vector, '1')
# spam_indexes += [feature_present]
# feature_present = get_feature_indexes(vector, '0')
# ham_indexes += [feature_present]
    
# print('SPAM INDEXES')
# print(spam_indexes)

# print('HAM INDEXES')
# print(ham_indexes)

HAM PROBS
[1344, 11660, 12979, 13024, 13375, 13681, 12806, 12293, 13282, 13219, 13565, 13568, 13010, 13657, 13647, 13767, 13555, 13448, 13333, 13130, 13923, 13953, 13818, 13815, 13781, 12943, 13865, 13796, 13511, 13851, 13635, 13299, 12734, 13748, 13582, 13237, 13822, 14039, 13983, 13776, 13726, 13629, 13561, 13051, 13989, 13752, 13670, 13589, 13243, 13037, 12580, 14042, 13980, 13935, 13801, 13949, 13861, 13885, 14081, 14067, 13673, 14051, 14065, 14048, 14001, 14037, 13911, 13795, 13964, 14051, 13942, 13914, 14024, 14005, 13954, 14010, 13987, 13938, 13976, 14026, 13933, 13889, 14060, 14022, 13954, 14012, 13513, 13779, 13620, 13877, 13916, 13782, 13488, 12990, 13890, 13995, 14006, 13493, 14009, 13895, 13879, 14054, 13827, 13264, 13416, 14063, 13946, 13924, 13816, 13730, 13198, 13773, 14012, 13820, 13690, 13890, 13764, 13300, 13734, 13811, 13489, 13594, 13818, 14004, 13810, 13806, 13489, 13360, 12941, 13947, 13984, 14038, 13930, 13754, 13086, 12843, 14046, 13846, 13902, 13766, 13738, 140

In [14]:
# Count number of times feature occurred in message
def calc_feature_info(str):
    occurred = str.count('1')
    non_occurred = str.count('0')
    return occurred, non_occurred

In [17]:
# Create our Spam / Non-Spam data structs
# SPAM = number of total data considered spam
# HAM = number of total data considered ham
# spam_feature = number of occurrences of feature in vector
# ham_feature = number of occurrences of non-spam feature in vector
SPAM = []
HAM = []
spam_feature = {}
ham_feature = {}

for row in range(len(df_j)):
    if (df_j.iloc[row][1] == 1):
        SPAM += [df_j.iloc[row]]
    elif (df_j.iloc[row][1] == -1):
        HAM += [df_j.iloc[row]]
    else:
        print(df_j.iloc[row])
        
for row in range(len(df_j)):
    vector = str(df_j.iloc[row][2])
    occurred, non_occurred = calc_feature_info(vector)
    spam_feature[df_j.iloc[row][0]] = occurred / len(vector)
    ham_feature[df_j.iloc[row][0]] = non_occurred / len(vector)
        
        
num_SPAM = len(SPAM) # 6712
num_HAM = len(HAM) # 8785

P_SPAM = num_SPAM / len(df_j) # 0.43311608698457765
P_HAM = num_HAM / len(df_j) # 0.5668839130154223


print(ham_feature)

{1: 0.9569230769230769, 2: 0.9274924471299094, 3: 0.9538461538461539, 4: 0.9577039274924471, 5: 0.9487951807228916, 6: 0.9323076923076923, 7: 0.9692307692307692, 8: 0.9507692307692308, 9: 0.9723076923076923, 10: 0.9692307692307692, 11: 0.9335347432024169, 12: 0.9395770392749244, 13: 0.963855421686747, 14: 0.9274924471299094, 15: 0.9274924471299094, 16: 0.9274924471299094, 17: 0.9395770392749244, 18: 0.9395770392749244, 19: 0.9733333333333334, 20: 0.9384615384615385, 21: 0.9446153846153846, 22: 0.963076923076923, 23: 0.9397590361445783, 24: 0.9397590361445783, 25: 0.9728915662650602, 26: 0.9788519637462235, 27: 0.9476923076923077, 28: 0.9397590361445783, 29: 0.963076923076923, 30: 0.9661538461538461, 31: 0.9580838323353293, 32: 0.9546827794561934, 33: 0.9578313253012049, 34: 0.9487951807228916, 35: 0.9476923076923077, 36: 0.9384615384615385, 37: 0.9384615384615385, 38: 0.9637462235649547, 39: 0.9637462235649547, 40: 0.9548192771084337, 41: 0.927710843373494, 42: 0.9507692307692308, 43: 

In [None]:
def calc_spam(probs):
    result = np.prod(np.array(probs))

In [18]:
dataset = [[1], [2], [3], [4], [5]]
split_ratio = 0.67
train, test = split_dataset(dataset, split_ratio)

NameError: name 'split_dataset' is not defined

In [None]:
def calc_naive_bayes(data)