In [213]:
import re
from collections import defaultdict
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from pylab import *
from decimal import Decimal
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from decimal import Decimal
from scipy.misc import comb

%matplotlib inline

In [214]:
def load_data(filename):
    data = np.loadtxt(filename,delimiter=',')
    return data

In [215]:
def preprocess_spamdata(data,x_index):
#     print 'preprocessing data'
    x = data[:,:x_index+1]
    y = data[:,-1]
    X = []
    for i in x:
        temp = []
        for feature in i:
            if feature>0:
                temp.append(1)
            else:
                temp.append(0)
        X.append(temp)
    X = np.array(X)
    Y = np.array(y)
    return X,Y

In [216]:
def indicator(Y,class_val):
    l = list()
    for idx,y in enumerate(Y):
        if y == class_val:
            l.append(idx)
    return l

In [217]:
def compute_prior(Y,class_val):
    m = len(Y)
    indices = indicator(Y,class_val)
    return (1.*len(indices)/m)

# Naive Bayes with Bernoulli featues

In [218]:
''' all the alphai values of all the features of a specific x'''
def compute_alphai(X,indices,E=0.01):
    alpha = dict()
    for col in range(X.shape[1]):
        feat_col = X[:,col]
        a = 1.*(sum(feat_col[indices]) + E)/len(indices)+(2*E)
        alpha[col] = a
    return alpha

In [219]:
def membership_fun_ber(x,Y,class_val,alphai,priori):
    s = 0
    for ind,i in enumerate(x):
        a = alphai[ind]
        s += i*(math.log(a)) + (1-i)*(math.log(1-a))
    return (s+math.log(priori))       

In [221]:
def cross_validation_NBBer(filename,k):
    
    data = load_data(filename)
    X,Y = preprocess_spamdata(data,48)
    accuracy = list()
    precision = list()
    recall = list()
    f_measure = list()
    fold = 1
    for train_ind, test_ind in KFold(X.shape[0],k,shuffle=True,random_state=5):
        X_train = np.array(X)[train_ind]
        Y_train = np.array(Y)[train_ind]
        X_test = np.array(X)[test_ind]
        Y_test = np.array(Y)[test_ind]
        Y_predict = NB_Bern_exp(X_train,Y_train,X_test)
        
        temp = accuracy_score(Y_test,Y_predict,normalize=True, sample_weight=None)
        c_matrix = confusion_matrix(Y_test, Y_predict)
        prec = precision_score(Y_test, Y_predict) 
        rec = recall_score(Y_test, Y_predict)  
        fm = f1_score(Y_test, Y_predict)
    
        accuracy.append(temp)
        recall.append(rec)
        precision.append(prec)
        f_measure.append(fm)
        
        print 'fold:', fold
        print 'accuracy:', temp
        print 'confusion_matrix', c_matrix
        print 'prediction', prec
        print 'recall' , rec
        fold += 1
    
    avg_acc = sum(accuracy)/len(accuracy)
    avg_pre = sum(precision)/len(precision)
    avg_rec = sum(recall)/len(recall)
    avg_fm = sum(f_measure)/len(f_measure)
   
    print 'avg_accuracy: ' , avg_acc
    print 'avg_precision', avg_pre
    print 'avg_recall', avg_rec
    print 'avg_fmeasure', avg_fm

In [222]:
cross_validation_NBBer('spambase.data.txt',10)

fold: 1
accuracy: 0.872017353579
confusion_matrix [[269  14]
 [ 45 133]]
prediction 0.904761904762
recall 0.747191011236
fold: 2
accuracy: 0.882608695652
confusion_matrix [[269  23]
 [ 31 137]]
prediction 0.85625
recall 0.815476190476
fold: 3
accuracy: 0.863043478261
confusion_matrix [[250  27]
 [ 36 147]]
prediction 0.844827586207
recall 0.803278688525
fold: 4
accuracy: 0.845652173913
confusion_matrix [[254  20]
 [ 51 135]]
prediction 0.870967741935
recall 0.725806451613
fold: 5
accuracy: 0.854347826087
confusion_matrix [[254  13]
 [ 54 139]]
prediction 0.914473684211
recall 0.720207253886
fold: 6
accuracy: 0.902173913043
confusion_matrix [[262  13]
 [ 32 153]]
prediction 0.921686746988
recall 0.827027027027
fold: 7
accuracy: 0.910869565217
confusion_matrix [[256  11]
 [ 30 163]]
prediction 0.936781609195
recall 0.844559585492
fold: 8
accuracy: 0.84347826087
confusion_matrix [[253  20]
 [ 52 135]]
prediction 0.870967741935
recall 0.72192513369
fold: 9
accuracy: 0.871739130435
confusio

#    Naive Bayes with Binomial features

In [248]:
def preprocess_spamdata_bi(data,x_index):
#     print 'preprocessing data'
    x = data[:,:x_index+1]
    y = data[:,-1]
    X = []
    for i in x:
        temp = []
        for feature in i:
            temp.append(feature*20)
        X.append(temp)
    X = np.array(X)
    Y = np.array(y)
    return X,Y

In [244]:
def doc_length(X):
    doc_lengths = dict()
    for index,x in enumerate(X):
        doc_lengths[index] = sum(x)
    return doc_lengths
        

In [234]:
'''computing the alphaj for all the columns/features for a given label '''
def compute_alphai_Bi(indices,X,doc_length,E=0.01,k=2):
    alphaj = dict()
    n = 0
    d = sum([doc_length[i] for i in indices])
    for col in range(X.shape[1]):
        column = X[:,col]
        n = sum(column[indices])
        alphaj[col] = 1.*(n+E)/(d+(k*E))
    return alphaj
        

In [250]:
def membership_Bi(alphaj,prior,x):
    s = 0
    doc_len = sum(x)+10
    f = math.factorial
    for j in range(x.size):
        c = 1.*comb(doc_len,x[j])
        a = (alphaj[j])**(x[j])
        b = (1 - alphaj[j])**(doc_len-(x[j]))
        if (a*b) != 0:
            s += (math.log(c*a*b))
    r = (s + math.log(prior))
    return r

In [236]:
def t():
    data = load_data('spambase.data.txt')
    X,Y = preprocess_spamdata_bi(data,47)
    print X.shape
    doc_lengths = doc_length(X)
   
    prediction = list()
    labels = np.unique(Y)
    alphaj = dict()
    prior = dict()
    indices = dict()
    
    for label in labels:
        indices[label] = indicator(Y,label)
        prior[label] = compute_prior(Y,label)
        alphaj[label] = compute_alphai_Bi(indices[label],X,doc_lengths)

    for ind,x in enumerate(X[:10]):
        temp = []
        for label in labels:
            m = membership_Bi(alphaj[label],prior[label],x)
            temp.append(m)
            print label,m
        pred = labels[temp.index(max(temp))]
        print 'pred', pred
        print ' '
        prediction.append(pred)
    
    print prediction

In [232]:
t()

(4601, 48)
0.0 -974.365588521
1.0 -490.336427648
pred 1.0
 
0.0 -1339.71699562
1.0 -370.762775278
pred 1.0
 
0.0 -1325.25304991
1.0 -567.583764484
pred 1.0
 
0.0 -1218.35768732
1.0 -490.936192987
pred 1.0
 
0.0 -1218.35768732
1.0 -490.936192987
pred 1.0
 
0.0 -858.044931566
1.0 -1107.54760191
pred 0.0
 
0.0 -1259.92664965
1.0 -729.800942115
pred 1.0
 
0.0 -408.080528831
1.0 -1125.53460989
pred 0.0
 
0.0 -1499.15453533
1.0 -707.849930676
pred 1.0
 
0.0 -869.52024538
1.0 -307.658539848
pred 1.0
 
[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0]


In [228]:
def NB_Bino_exp(X,Y,x_predict):
    indices = dict()
    class_prior = dict()
    prediction = list()
    alphaj = dict()
    labels = np.unique(Y)
    doc_lengths = doc_length(X)
    
    for label in labels:
        indices[label] = indicator(Y,label)
        class_prior[label] = compute_prior(Y,label)
        alphaj[label] = compute_alphai_Bi(indices[label],X,doc_lengths)
    
    for ind,x in enumerate(x_predict):
        temp = []
        for label in labels:
            temp.append(membership_Bi(alphaj[label],class_prior[label],x))
        pred = labels[temp.index(max(temp))]
        prediction.append(pred)
    return prediction

In [229]:
def cross_validation_NBBi(filename,k):
    
    data = load_data(filename)
    X,Y = preprocess_spamdata_bi(data,47)
#     X,Y = preprocess_spamdata(data,47)
    accuracy = list()
    precision = list()
    recall = list()
    f_measure = list()
    fold = 1
    for train_ind, test_ind in KFold(X.shape[0],k,shuffle=True,random_state=5):
        X_train = np.array(X)[train_ind]
        Y_train = np.array(Y)[train_ind]
        X_test = np.array(X)[test_ind]
        Y_test = np.array(Y)[test_ind]
        Y_predict = NB_Bino_exp(X_train,Y_train,X_test)
        
        temp = accuracy_score(Y_test,Y_predict,normalize=True, sample_weight=None)
        c_matrix = confusion_matrix(Y_test, Y_predict)
        prec = precision_score(Y_test, Y_predict) 
        rec = recall_score(Y_test, Y_predict)  
        fm = f1_score(Y_test, Y_predict)
    
        accuracy.append(temp)
        recall.append(rec)
        precision.append(prec)
        f_measure.append(fm)
        
        print 'fold:', fold
        print 'accuracy:', temp
        print 'confusion_matrix', c_matrix
        print 'prediction', prec
        print 'recall' , rec
        fold += 1
    
    avg_acc = sum(accuracy)/len(accuracy)
    avg_pre = sum(precision)/len(precision)
    avg_rec = sum(recall)/len(recall)
    avg_fm = sum(f_measure)/len(f_measure)
   
    print 'avg_accuracy: ' , avg_acc
    print 'avg_precision', avg_pre
    print 'avg_recall', avg_rec
    print 'avg_fmeasure', avg_fm

In [251]:
cross_validation_NBBi('spambase.data.txt',10)

fold: 1
accuracy: 0.826464208243
confusion_matrix [[214  69]
 [ 11 167]]
prediction 0.707627118644
recall 0.938202247191
fold: 2
accuracy: 0.826086956522
confusion_matrix [[220  72]
 [  8 160]]
prediction 0.689655172414
recall 0.952380952381
fold: 3
accuracy: 0.802173913043
confusion_matrix [[204  73]
 [ 18 165]]
prediction 0.693277310924
recall 0.901639344262
fold: 4
accuracy: 0.817391304348
confusion_matrix [[205  69]
 [ 15 171]]
prediction 0.7125
recall 0.91935483871
fold: 5
accuracy: 0.826086956522
confusion_matrix [[201  66]
 [ 14 179]]
prediction 0.730612244898
recall 0.927461139896
fold: 6
accuracy: 0.873913043478
confusion_matrix [[226  49]
 [  9 176]]
prediction 0.782222222222
recall 0.951351351351
fold: 7
accuracy: 0.830434782609
confusion_matrix [[205  62]
 [ 16 177]]
prediction 0.740585774059
recall 0.917098445596
fold: 8
accuracy: 0.839130434783
confusion_matrix [[214  59]
 [ 15 172]]
prediction 0.744588744589
recall 0.919786096257
fold: 9
accuracy: 0.795652173913
confusio