In [19]:
import numpy as np
import scipy
import pickle # to save data
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import pairwise_kernels
from scipy.optimize import minimize
from scipy.spatial.distance import hamming
from mySVM_class import *
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
%%time
# Load the first 10000 rows of the training CSV; the file is expected to
# provide a 'label' column and 28*28 pixel columns named pixel0 ... pixel783.
train = pd.read_csv('Data/train.csv', nrows=10000)
images = ["pixel%d" % pixel_no for pixel_no in range(28**2)]
# np.float was only an alias for the builtin float and has been removed from
# NumPy (1.24+); np.float64 yields the same dtype on every NumPy version.
# The raw pixel values are divided by 100 before being used as features.
train_images = np.array(train[images], dtype=np.float64) / 100
train_labels = np.array(train['label'])

Wall time: 964 ms


Test the ECOC algorithm:

In [30]:
%%time
# Use the first `no_train` samples for training and the following 400 for testing.
no_train = 800
test_range = slice(no_train, no_train + 400)
train, train_l = train_images[:no_train], train_labels[:no_train]
test, test_l = train_images[test_range], train_labels[test_range]

# Optimal lambdas found via cross validation (15 entries).
lambda_opt = 1. / (400 * np.array(
    [18, 18, 18, 18, 17, 18, 16, 16, 16, 18, 18, 20, 20, 18, 18]))
# Convert every optimal lambda into the corresponding penalty C = 1 / (2 * n * lambda).
C_list = 1. / (2 * len(train) * lambda_opt)
# Optimal sigmas found via cross validation (15 entries).
sigma_list = [
    0.005, 0.005, 0.005, 0.005, 0.005,
    0.005, 0.005, 0.005, 0.005, 0.0025,
    0.005, 0.005, 0.005, 0.0025, 0.005,
]

# Train the ECOC classifier with a Gaussian kernel and the tuned parameters.
(ecoc_labels, list_supp_ind, list_alpha, list_b,
 list_kernel, code_words, barycenters) = ecoc(
    train,
    train_l,
    penalty_list=C_list,
    kernel=gaussian_kernel,
    list_sigma=sigma_list,
)

Wall time: 0 ns


In [11]:
# Predict labels for the test points with the trained ECOC model and cast them to integers.
final_labels = predict_ecoc(
    test, train, ecoc_labels, list_supp_ind,
    list_alpha, list_b, list_kernel, code_words, barycenters,
).astype(np.int64)

# Fraction of test points whose predicted label equals the true label.
n_hits = sum(truth == guess for truth, guess in zip(test_l, final_labels))
score = n_hits / float(len(test_l))
score


Attention, data point could not be uniquely classified, index: 47, possible classification: [2, 8]
Attention, data point could not be uniquely classified, index: 219, possible classification: [2, 7]
Attention, data point could not be uniquely classified, index: 307, possible classification: [5, 9]
Attention, data point could not be uniquely classified, index: 348, possible classification: [4, 8]


0.89000000000000001

Observations: More training points result in fewer points that cannot be uniquely classified, and they also improve the accuracy (400 training points $\rightarrow$ 83%, 800 training points $\rightarrow$ 89%).

### Train ECOC classifier

In [61]:
# Number of leading samples that form the training subset for the ECOC classifier.
number_of_im = 200
train_images_test, train_labels_test = (
    train_images[:number_of_im],
    train_labels[:number_of_im],
)
# Display the number of selected training images.
train_images_test.shape[0]

200

In [51]:
# Train the ECOC classifier on the training subset. This needs the cell that
# defines train_images_test / train_labels_test / number_of_im to have been
# run first (the recorded NameError comes from running this cell before it).
(ecoc_labels, list_supp_ind, list_alpha, list_b,
 list_kernel, list_sigma, code_words, barycenters) = ecoc(train_images_test, train_labels_test)

# Load the stored classifier with pickle.load; this rebinds every name assigned
# by the training call above.
# NOTE(review): pickle.load can execute arbitrary code -- only load .dat files
# that were produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open("trained_ecoc_" + str(number_of_im) + ".dat", "rb") as ecoc_file:
    (ecoc_labels, list_supp_ind, list_alpha, list_b,
     list_kernel, list_sigma, code_words, barycenters) = pickle.load(ecoc_file)



NameError: name 'train_images_test' is not defined

### Checking the Output of ECOC predict

In [63]:
# "unknown" data to be classified
number_of_test_data = 300
# The test samples are the images that directly follow the training subset.
heldout_slice = slice(number_of_im, number_of_im + number_of_test_data)
test_data = train_images[heldout_slice]

# Predict labels for the held-out samples; the training images are passed
# along as the second argument of predict_ecoc.
predicted_labels_test = predict_ecoc(
    test_data, train_images[:number_of_im], ecoc_labels, list_supp_ind, list_alpha,
    list_b, list_kernel, code_words, barycenters,
).astype(float)

# True labels of the held-out samples.
# Note: this rebinds train_labels_test, which from here on holds the test labels.
train_labels_test = train_labels[heldout_slice].astype(float)

# Compare predictions and true labels as integers and report the number of matches.
n_correct = sum(predicted_labels_test.astype(int) == train_labels_test.astype(int))
print("number of correctly classified labels: ", n_correct,
      " out of ", number_of_test_data)

# TODO: check whether a few of the classifiers make more mistakes than the
# others; if so, some of their parameters could be changed.


300
Attention, data point could not be uniquely classified, index 45 possible classification [2, 8]
Attention, data point could not be uniquely classified, index 53 possible classification [0, 3, 6]
Attention, data point could not be uniquely classified, index 82 possible classification [1, 2]
Attention, data point could not be uniquely classified, index 93 possible classification [2, 7]
Attention, data point could not be uniquely classified, index 113 possible classification [1, 7]
Attention, data point could not be uniquely classified, index 120 possible classification [3, 9]
Attention, data point could not be uniquely classified, index 147 possible classification [0, 5]
Attention, data point could not be uniquely classified, index 153 possible classification [0, 3, 5, 6]
Attention, data point could not be uniquely classified, index 156 possible classification [5, 9]
Attention, data point could not be uniquely classified, index 169 possible classification [0, 5, 6]
Attention, data po