In [1]:
import numpy as np
import pandas as pd
import random


In [2]:
# split dataset to train and test data with 10 subsets. k is the kth subset, range[0,9]. 
# using cross validation split dataset with 460 testSet, and 4141 trainSet.
def splitData(df, k):
    subsetC=int(len(df)/10)
    testSet=df[subsetC*k:subsetC*(k+1)]
    if k==0: 
        trainSet=df[subsetC*(k+1): ]
    else:
        trainSet1=df[ :subsetC*k]
        trainSet2=df[subsetC*(k+1): ]
        trainSet=pd.concat([trainSet1, trainSet2])

    return trainSet, testSet



In [3]:
# Summary how many rows of 0, how many rows of 1, and total rows for each of 57 columns
def summarize(df):
    attsNum=df.shape[1]-1
    result=np.zeros(shape=(attsNum,3))
    for i in range(0,attsNum):
        Total=df.shape[0]
        Num0=df[df[i]==0].shape[0]
        Num1=df[df[i]==1].shape[0]
        result[i]=[Num0,Num1,Total]
        
    return result
    

In [4]:
# get the probability of class spam or not-spam for each row in testSet
def probClass(df_noSpam,df_spam,pobj):
    attsNum=pobj.shape[0]
    logProb0=0
    logProb1=0
    for i in range(0,attsNum):
        if df_noSpam[i][0]==0:
            logProb0 += 0
        elif df_spam[i][1]==0:
            logProb1 += 0   
        elif int(pobj[i])==0:
            logProb0 += np.log2(df_noSpam[i][0]/df_noSpam[i][2]) 
            logProb1 += np.log2(1-df_spam[i][1]/df_spam[i][2])
        else:
            logProb0 += np.log2(1-df_noSpam[i][0]/df_noSpam[i][2])
            logProb1 += np.log2(df_spam[i][1]/df_spam[i][2])
            
    return np.exp2(logProb0),np.exp2(logProb1)
 


In [5]:
# load data
data_X=pd.read_table('spamdata_binary.txt',sep='\t',header=None)
data_Y=pd.read_table('spamlabels.txt',header=None)
data_Spam=pd.concat([data_X, data_Y], axis=1,ignore_index=True)
# ramdomlize the dataset
data_Spam=data_Spam.sample(frac=1).reset_index(drop=True)

accuracy=np.zeros(10)
for k in range(0,10): 
    # using cross validation(kth range[0,9]) split data into trainSet and testSet
    trainSet, testSet = splitData(data_Spam,k)
    # get the summary of class spam or not-spam in trainSet
    spamSet=trainSet[trainSet[57]==1]
    result_spam=summarize(spamSet)
    noSpamSet=trainSet[trainSet[57]==0]
    result_noSpam=summarize(noSpamSet)
 
    # probability of spam, not-spam
    p_spam=float(len(spamSet))/float(len(trainSet))
    p_noSpam=float(len(noSpamSet))/float(len(trainSet))
    # predict its spam or not-spam for each row of testSet
    # and get the accuracy of this prediction.
    test_pred=np.zeros(len(testSet))
    accurNum=0
    for i in range(0,len(testSet)):
        pobj=testSet[testSet.columns[0:57]][i:i+1]
        p=probClass(result_noSpam,result_spam,pobj)
        if p[0]*p_noSpam >p[1]*p_spam:
            test_pred[i]=0
        else:
            test_pred[i]=1

        if int(test_pred[i])==int(testSet[57][i+460*k]):
            accurNum += 1

    accuracy[k]=float(accurNum)/float(len(testSet))
    
# average accuracy of the 10 times train-test
avgAccuracy=np.mean(accuracy)
print(avgAccuracy)


0.655869565217
