In [1]:
import pickle
import matplotlib.pylab as pl
%matplotlib inline
import numpy as np
from utility import *

  "The Gtk3Agg backend is known to not work on Python 3.x with pycairo. "


### Question 1 & Question 2
    Please refer to utility.py.

In [2]:
# Pre-process the data: build the training / validation / test splits and the
# vocabulary with format_dataset.
# NOTE(review): format_dataset is not defined in this notebook — presumably it
# comes from the star import of utility.py; the meaning of X=20 is defined
# there, confirm before changing it.
training_X, training_Y, validation_X, validation_Y, test_X, test_Y, vocabulary = format_dataset(X=20)

### Question 3
    See perceptron_train, defined in the cell below.

In [None]:
def error(W, X, Y):
    """Count the samples that the weight vector W misclassifies.

    A sample counts as an error when its signed margin, i.e. its score
    ``np.dot(X, W)`` multiplied by its label in ``Y``, is strictly negative.
    Samples with a margin of exactly zero are not counted.
    """
    scores = np.dot(X, W)
    misclassified = (scores * Y) < 0
    return np.sum(misclassified)
def perceptron_error(W, X, Y):
    """Return the fraction of the samples in X that W misclassifies.

    X must be two-dimensional (one row per sample); the number of errors
    reported by ``error`` is divided by the number of rows.
    """
    n_samples, _ = X.shape
    n_wrong = error(W, X, Y)
    return n_wrong / n_samples
def perceptron_train(W, X, Y, learning_rate=1, logging_interval=1, maximum_n_iterations=None):
    """Train a perceptron with full-batch updates.

    Every pass scores all samples with the current weights, flags the samples
    whose signed margin ``np.dot(X, W) * Y`` is strictly negative, and moves
    the weights along the mean of ``Y_i * X_i`` taken over all N samples
    (correctly classified samples contribute zero to that mean).

    Parameters
    ----------
    W : array-like, shape (D,)
        Initial weights. The input is not modified; a float copy is trained.
    X : ndarray, shape (N, D)
        Feature matrix, one row per sample.
    Y : ndarray, shape (N,)
        Labels; the margin test only makes sense for signed labels such as
        +1 / -1.
    learning_rate : float
        Step size applied to the averaged update.
    logging_interval : int or None
        Print a progress line every ``logging_interval`` passes. Anything that
        is not a positive int disables logging.
    maximum_n_iterations : int or None
        Upper bound on the number of passes. ``None`` means train until no
        sample is misclassified (which never happens on non-separable data).

    Returns
    -------
    W : ndarray
        The trained weights.
    n_errors : int
        Number of samples misclassified by the returned weights.
    n_iterations : int
        Number of passes performed, including the final pass that found no
        error when training converged.
    """
    N, _ = X.shape
    # Work on a float copy: the caller's array stays untouched and integer
    # initial weights no longer break the in-place float update below.
    W = np.array(W, dtype=float)
    log_enabled = isinstance(logging_interval, int) and logging_interval > 0
    n_iterations = 0
    # Checking the cap before each pass guarantees at most
    # maximum_n_iterations updates (the cap used to be overshot by one).
    while maximum_n_iterations is None or n_iterations < maximum_n_iterations:
        update_filter = np.dot(X, W) * Y < 0
        n_errors = int(np.sum(update_filter))
        n_iterations += 1
        if n_errors == 0:
            # Converged: no sample is misclassified, so there is nothing to update.
            return W, n_errors, n_iterations
        gradient = np.mean((update_filter * Y).reshape((N, 1)) * X, axis=0)
        W += learning_rate * gradient
        if log_enabled and n_iterations % logging_interval == 0:
            print('update %d errors %d' % (n_iterations, n_errors))
    # The cap stopped training: report the error count of the final weights
    # rather than a count measured before the last update.
    n_errors = int(np.sum(np.dot(X, W) * Y < 0))
    return W, n_errors, n_iterations

### Question 4

In [None]:
# The feature dimension equals the vocabulary size; training uses the first
# N_TRAINING_SAMPLES rows of the training split.
# NOTE(review): the initial weights are drawn without a fixed seed, so the
# numbers printed by this cell differ from run to run.
D = len(vocabulary)
N_TRAINING_SAMPLES = 4000
W = np.random.normal(loc=0, scale=1, size=D)
W, n_errors, n_iterations = perceptron_train(
    W,
    training_X[:N_TRAINING_SAMPLES],
    training_Y[:N_TRAINING_SAMPLES],
)

update 1 errors 2421
update 2 errors 1666
update 3 errors 1571
update 4 errors 1457


In [None]:
# Training error of the learned weights: the fraction of training samples that
# W misclassifies (0.0 means no sample has a negative margin).
# NOTE(review): this scores all of training_X, while W was fitted on the first
# N_TRAINING_SAMPLES rows only — the two coincide only if training_X has no
# additional rows; confirm.
perceptron_error(W, training_X, training_Y)

In [None]:
# Validation error of the learned weights: the fraction of validation samples
# that W misclassifies.
perceptron_error(W, validation_X, validation_Y)

### Question 5

In [None]:
# Rank the vocabulary by learned weight and show the N most negative and the
# N most positive words (both listed in ascending weight order).
N = 12
# argsort yields each index exactly once, so words with tied weights are no
# longer reported as the same word (list.index always returned the first
# match); kind='stable' keeps the tie order of the previous sorted() call.
order = np.argsort(W, kind='stable')
negative_N = tuple(vocabulary[index] for index in order[:N])
# max(..., 0) keeps the slice valid when N exceeds the vocabulary size.
positive_N = tuple(vocabulary[index] for index in order[max(len(order) - N, 0):])
print('%d most negative words:' % N, *negative_N)
print('%d most positive words:' % N, *positive_N)

### Question 6 & Question 7

In [None]:
# For each training-set size, train from fresh random weights and record the
# number of passes needed and the resulting validation error.
# Both tables map training-set size -> value, so the plotting cells can read
# their x values from .keys() and their y values from .values().
n_update_table = {}
validation_error_table = {}
for N_TRAINING_SAMPLES in (100, 200, 400, 800, 2000, 4000):
    W = np.random.normal(0, 1, D)
    # perceptron_train returns three values (weights, error count, passes);
    # unpacking only two raised ValueError.
    W, n_errors, n_iterations = perceptron_train(
        W,
        training_X[:N_TRAINING_SAMPLES],
        training_Y[:N_TRAINING_SAMPLES],
        logging_interval=None
    )
    n_update_table[N_TRAINING_SAMPLES] = n_iterations
    validation_error_table[N_TRAINING_SAMPLES] = perceptron_error(W, validation_X, validation_Y)

In [None]:
# Scatter the number of training passes against the training-set size.
pl.plot(
    list(n_update_table),
    list(n_update_table.values()),
    'bo',
)
pl.title('number of updates required for 0 training error')
pl.xlabel('number of training samples')
pl.ylabel('number of updates')
pl.grid()

In [None]:
# Scatter the validation error against the training-set size.
# validation_error_table must be a mapping (training-set size -> error),
# because both .keys() and .values() are read from it.
pl.plot(
    list(validation_error_table.keys()),
    list(validation_error_table.values()),
    'bo',
)
pl.title('validation error')
pl.xlabel('number of training samples')
pl.ylabel('validation error')
pl.grid()

### Question 8

In [None]:
# Train on the first N_TRAINING_SAMPLES samples from fresh random weights,
# stopping after MAXIMUM_n_iterations passes even if errors remain.
W = np.random.normal(0, 1, D)
N_TRAINING_SAMPLES = 4000
MAXIMUM_n_iterations = 200
# perceptron_train returns three values (weights, error count, passes);
# unpacking only two raised ValueError.
W, n_errors, n_iterations = perceptron_train(
    W,
    training_X[:N_TRAINING_SAMPLES],
    training_Y[:N_TRAINING_SAMPLES],
    maximum_n_iterations=MAXIMUM_n_iterations
)

In [None]:
# Validation error of the current weights W: the fraction of validation
# samples that W misclassifies.
perceptron_error(W, validation_X, validation_Y)

### Question 10
    With X = 1200, the data is not linearly separable.

In [None]:
# Regenerate every split and the vocabulary with X=1200; this overwrites the
# variables produced by the earlier format_dataset(X=20) call.
# NOTE(review): the meaning of X is defined by format_dataset, which is not
# shown in this notebook — confirm against its definition.
training_X, training_Y, validation_X, validation_Y, test_X, test_Y, vocabulary = format_dataset(X=1200)

In [None]:
# Count the distinct feature vectors that occur with both labels in the
# training set: rows are converted to tuples so they can be stored in sets,
# and the intersection of the +1 and -1 sets holds the conflicting points.
positive_set = {tuple(point) for point in training_X[training_Y == 1].tolist()}
negative_set = {tuple(point) for point in training_X[training_Y == -1].tolist()}
len(positive_set.intersection(negative_set))