## Import Libraries

In [14]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from numpy import log1p
from IPython.display import HTML

## Read Data

In [15]:
# read data and split features and labels
datas = np.loadtxt('spambase.data', dtype=float, delimiter=',')
data, label = np.split(datas,(57,),axis=1)
data.shape, label.shape

((4601, 57), (4601, 1))

## Preprocessing Features

In [16]:
# standardization and transformation
data = StandardScaler().fit_transform(data)
data = FunctionTransformer(log1p).fit_transform(data)
data

array([[-0.41920969,  0.28584406,  0.53816377, ..., -0.0463029 ,
         0.04430194, -0.00876241],
       [ 0.29666119,  0.05060679,  0.36125512, ..., -0.00244626,
         0.22359372,  0.80124976],
       [-0.15773204, -0.18040968,  0.61611677, ...,  0.13620855,
         1.16972478,  1.44897158],
       ...,
       [ 0.49477421, -0.18040968,  0.03765554, ..., -0.12713141,
        -0.27042036, -0.3183169 ],
       [ 1.33546499, -0.18040968, -0.8136452 , ..., -0.13637276,
        -0.27716815, -0.413402  ],
       [-0.41920969, -0.18040968,  0.54967891, ..., -0.13265876,
        -0.27716815, -0.51296251]])

## K-Fold Cross-Validation

In [17]:
# spilt into 5 folds with shuffle
kf = KFold(5, shuffle=True, random_state=0)

## Evaluation Metrics

In [18]:
# build function to calculate FP, FN and error rates
FPlists = []
FNlists = []
errorlists = []
def evaluate(y_pre, y_true):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(0, len(y_true)):
        if y_true[i] == y_pre[i] == 1:
            TP += 1
        elif y_true[i] == y_pre[i] == 0:
            TN += 1
        elif y_true[i] == 1 and y_pre[i] == 0:
            FN += 1
        elif y_true[i] == 0 and y_pre[i] == 1:
            FP += 1
    FPlists.append(FP/(FP+TN))
    FNlists.append(FN/(FN+TP))
    errorlists.append((FP+FN)/(FP+FN+TP+TN))

## Train Model

In [19]:
# use svm to classify
train_acc = 0
test_acc = 0
for train_index,test_index in kf.split(data):
    x_train, x_test = data[train_index,:], data[test_index,:]
    y_train, y_test = label[train_index,:], label[test_index,:]
    clf_svm = svm.SVC(C=4.3)
    clf_svm.fit(x_train, y_train.ravel())
    preidcted = clf_svm.predict(x_test)
    evaluate(preidcted, y_test)
    train_acc += clf_svm.score(x_train, y_train)
    test_acc += clf_svm.score(x_test, y_test)
# overall train accuracy and test accuracy
train_acc/5, test_acc/5

(0.9778853513341129, 0.9517504602747486)

## Display Results

In [20]:
# calculate FN, FP, error rate for each fold and average
results = []
header = ["Fold", "False Positive", "False Negative", "Error Rates"]
results.append(header)
for i in range(0, 5):
    tmp = [i+1, FPlists[i], FNlists[i], errorlists[i]]
    results.append(tmp)
tmp = ["average", np.mean(FPlists), np.mean(FNlists), np.mean(errorlists)]
results.append(tmp)
# show results in table
HTML('<table><tr>{}</tr></table>'.format(
       '</tr><tr>'.join(
           '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in results)))

0,1,2,3
Fold,False Positive,False Negative,Error Rates
1,0.03345724907063197,0.0783289817232376,0.05211726384364821
2,0.025044722719141325,0.08310249307479224,0.04782608695652174
3,0.027985074626865673,0.07291666666666667,0.04673913043478261
4,0.041884816753926704,0.07780979827089338,0.05543478260869565
5,0.03264604810996564,0.05029585798816568,0.0391304347826087
average,0.032203582256106256,0.07249075954475112,0.04824953972525138


## Choice of Classifier
Different classification algorithms can be applied to this task, and I tried SVM, LogisticRegression and DecisionTreeClassifier. Among them, the best model is using SVM with rbf kernel, and the second one is using LogisticRegression with accuracy about 93.7%, the worst one is using DecisionTree with accuracy about 92.2%.