In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

N = 3600
X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1
X_train = X[:N]
yy_train = yy[:N]
X_val = X[N:]
yy_val = yy[N:]

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################

from sklearn.metrics.pairwise import rbf_kernel

svm = LinearSVM_twoclass()
svm.theta = np.zeros((X.shape[1],))


sigma_vals = [0.3,1,3,10,30,100,300]
Cvals = [10,30,100,300,1000,3000,10000]
learning_rates = [1e-5,3e-5,1e-4,3e-4,1e-3,3e-3,1e-2]

bestAcc = 0.0
bests = (0, 0, 0)

for s in sigma_vals:
#    KKtrain = X_train
#    KKval = X_val
    gamma = 1.0 / (2 * s * s)
    Ktrain = rbf_kernel(X_train, X_train, gamma)
    # scale the data
    scaler = preprocessing.StandardScaler().fit(Ktrain)
    scaleKtrain = scaler.transform(Ktrain)
    # add the intercept term
    KKtrain = np.vstack([np.ones((scaleKtrain.shape[0],)), scaleKtrain.T]).T
    
    Kval = rbf_kernel(X_val, X_train, gamma)
    scaleKval = scaler.transform(Kval)
    KKval = np.vstack([np.ones((scaleKval.shape[0],)), scaleKval.T]).T

    svm.theta = np.zeros((KKtrain.shape[1],))
    for c in Cvals:
        for lr in learning_rates:
            svm.train(KKtrain, yy_train, learning_rate=lr, reg=c, num_iters=200, batch_size=400)
            yy_val_pre = svm.predict(KKval)
            curAcc = np.mean((yy_val_pre == yy_val) * 1.0)
             # print (s, c, lr, curAcc)
            if curAcc > bestAcc:
                bestAcc = curAcc
                bests = (s, c, lr)
                best_svm = svm
            
print ("Best sigma, C, lr are (%f, %f, %f) with an accuracy on the validation set of %f." %(bests[0], bests[1], bests[2], bestAcc))

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

s = bests[0]
c = bests[1]
lr = bests[2]
#KK = X
gamma = 1.0 / (2 * s * s)
K = rbf_kernel(X, X, gamma)
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
KK = np.vstack([np.ones((scaleK.shape[0],)), scaleK.T]).T

best_svm.theta = np.zeros((KK.shape[1],))
best_svm.train(KK, yy, learning_rate=lr, reg=c, num_iters=30000, batch_size=KK.shape[0], verbose=True)
yy_pre = best_svm.predict(KK)
acc_train = np.mean((yy_pre == yy) * 1.0)
print ("Accuracy on the training set: ", acc_train)

yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
#KKtest = X_test
Ktest = rbf_kernel(X_test, X, gamma)
scaleKtest = scaler.transform(Ktest)
KKtest = np.vstack([np.ones((scaleKtest.shape[0],)), scaleKtest.T]).T

yy_test_pre = best_svm.predict(KKtest)
acc_test = np.mean((yy_test_pre == yy_test) * 1.0)
print ("Accuracy on the testing set: ", acc_test)

##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

print ("###### top 15 words ######")
t = np.dot(best_svm.theta[1:], X).argsort()[::-1]
w15 = t[:15]
for w in w15:
    print (words[w])
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

Best sigma, C, lr are (10.000000, 3000.000000, 0.003000) with an accuracy on the validation set of 0.972500.
iteration 0 / 30000: loss 3000.000000
iteration 100 / 30000: loss 1630848.964543
iteration 200 / 30000: loss 306210.601406
iteration 300 / 30000: loss 165081.018428
iteration 400 / 30000: loss 116685.269818
iteration 500 / 30000: loss 89709.983528
iteration 600 / 30000: loss 1613583.118307
iteration 700 / 30000: loss 185175.453933
iteration 800 / 30000: loss 103825.061112
iteration 900 / 30000: loss 82158.074935
iteration 1000 / 30000: loss 67043.585306
iteration 1100 / 30000: loss 52923.482294
iteration 1200 / 30000: loss 43203.201252
iteration 1300 / 30000: loss 126767.862183
iteration 1400 / 30000: loss 71267.704709
iteration 1500 / 30000: loss 55699.873664
iteration 1600 / 30000: loss 44664.472627
iteration 1700 / 30000: loss 37997.205253
iteration 1800 / 30000: loss 33699.782016
iteration 1900 / 30000: loss 86136.996571
iteration 2000 / 30000: loss 50797.534175
iteration 21

iteration 19400 / 30000: loss 1678.073066
iteration 19500 / 30000: loss 1656.550321
iteration 19600 / 30000: loss 1563.253139
iteration 19700 / 30000: loss 1475.275255
iteration 19800 / 30000: loss 1454.934823
iteration 19900 / 30000: loss 1444.593037
iteration 20000 / 30000: loss 1438.486348
iteration 20100 / 30000: loss 1405.756997
iteration 20200 / 30000: loss 1394.052611
iteration 20300 / 30000: loss 1410.722723
iteration 20400 / 30000: loss 1322.522585
iteration 20500 / 30000: loss 1338.721881
iteration 20600 / 30000: loss 4904.146163
iteration 20700 / 30000: loss 1249.697703
iteration 20800 / 30000: loss 1320.830031
iteration 20900 / 30000: loss 1288.383713
iteration 21000 / 30000: loss 1185.896136
iteration 21100 / 30000: loss 1493.383105
iteration 21200 / 30000: loss 1041.844316
iteration 21300 / 30000: loss 1090.307823
iteration 21400 / 30000: loss 2964.747406
iteration 21500 / 30000: loss 1022.928082
iteration 21600 / 30000: loss 1032.808058
iteration 21700 / 30000: loss 1057

In [8]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [9]:
y_test

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,