In [49]:
import numpy as np
import scipy

# to save data
import pickle

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import pairwise_kernels
from scipy.optimize import minimize
from scipy.spatial.distance import hamming
from mySVM_class import *

import matplotlib.pyplot as plt
%matplotlib inline

In [68]:
train = pd.read_csv('Data/train.csv', nrows = 5000)

In [69]:
images = ["%s%s" %("pixel",pixel_no) for pixel_no in range(0,28**2)]
train_images = np.array(train[images], dtype=np.float)/100
train_labels = np.array(train['label'])
label0 = np.array([1 if i==0 else -1 for i in train_labels])

'''#standardize data
data_mean = train_images.mean().astype(np.float32)
data_std = train_images.std().astype(np.float32)

def standardize(x): 
    return (x-data_mean)/data_std

train_images = standardize(train_images)'''

'#standardize data\ndata_mean = train_images.mean().astype(np.float32)\ndata_std = train_images.std().astype(np.float32)\n\ndef standardize(x): \n    return (x-data_mean)/data_std\n\ntrain_images = standardize(train_images)'

In [70]:
def ecoc(labeled_data, labels, kernel=scalar_product, penalty=1, list_sigma=[0.1]*15):

    labels=labels.astype(int);
    l=np.shape(labeled_data)[0];
    n=np.shape(labeled_data)[1]
    num_classifiers=15;
    ecoc_labels=np.zeros((l,15));
    
    # compute barycenters of the points of each label
    barycenters = np.zeros((10,n));
        
    for i in range(10):
        ind = labels == i
        barycenters[i] = np.mean(labeled_data[ind], axis=0)
    
    # define code_word matrix, the ith row corresponds to the number i
    # each column corresponds to a classifier that will have to be trained
    code_words=np.array([
        [ 1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1],
        [-1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1],
        [ 1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1, -1,  1],
        [-1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1],
        [ 1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1],
        [-1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1, -1,  1],
        [ 1, -1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1],
        [-1, -1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1],
        [ 1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1],
        [-1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,  1]]);
    
    # up until now training data has labels from 0 to 9
    # now these are replaced by the 15 digit string given by code_words
    for j in range(l):
        ecoc_labels[j]=code_words[labels[j]];
    
    list_supp_ind = [];
    list_alpha =[];
    list_b =[];
    list_kernel=[];
    
    # class an svm object for each classifier
    # here would be the possibility to parallelize
    for classifier in range(15):
        svm=mySVM(kernel=kernel, penalty=penalty, sigma=list_sigma[classifier]);
        svm.fit(labeled_data, ecoc_labels[:,classifier]);
        list_supp_ind.append(svm.supp_indices);
        list_alpha.append(svm.alpha);
        list_b.append(svm.b);
        list_kernel.append(svm.kernel);
        
    # pickle dump to save and call saved objects    
    #pickle.dump((ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, list_sigma, code_words, barycenters), 
     #           open( "trained_ecoc_"+str(l)+".dat", "wb" ))
    
    # now I need to call a binary classifier for each column of ecoc_labels
    # from decision functions we get seperating hyperplanes, margin, ... 
    # return those
    
    
    return ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words, barycenters;

In [56]:
# suppose we have an unlabeled data point
def predict_ecoc(unlabeled_data, labeled_data, ecoc_labels, list_supp_ind, 
                 list_alpha, list_b, list_kernel, code_words, barycenters):
    # every row is one data point
    # number of rows = # of data points
    l=np.shape(unlabeled_data)[0];
    new_labels=np.zeros((l,15));
    temp_label_ind=[];
    final_labels = np.array(['inf']*l);
    
    for classifier in range(15):
        a_supp = list_alpha[classifier][list_supp_ind[classifier]];
        ecoc_labels_supp = ecoc_labels[list_supp_ind[classifier],classifier]
        a_times_labels=np.multiply(a_supp, ecoc_labels_supp)
        
        for i in range(l):
            # i_th row of kernel matrix k
            k=np.array([list_kernel[classifier](unlabeled_data[i],y) for y in labeled_data[list_supp_ind[classifier]]])
            
            # list of lists with 15 entries, one per classifier
            new_labels[i][classifier]=np.sign(np.dot(a_times_labels,k)+list_b[classifier]);
    
        
    for i in range(l):
        ham_dist = [hamming(new_labels[i], code_words[j]) for j in range(10)]
        temp_label_ind = [j for j in range(len(ham_dist)) if ham_dist[j] == min(ham_dist)]
        if len(temp_label_ind)!=1:
            print("Attention, data point could not be uniquely classified, index " 
                  + str(i) + " possible classification " + str(temp_label_ind));
            
            # ask which barycenter is closest out of temp_label_ind
            final_labels[i] = np.min(np.argmin([np.linalg.norm(unlabeled_data[i]-barycenters[k]) for k in temp_label_ind]))
            
            
        else:
            final_labels[i] = ham_dist.index(min(ham_dist));
        
   
    return final_labels;
    

In [73]:
%%time
no_train = 400
train = train_images[:no_train]
train_l = train_labels[:no_train]
test = train_images[no_train:no_train+400]
test_l = train_labels[no_train:no_train+400]
lambda_opt = 1/(36*200.)
C_opt = 1./(2*lambda_opt*len(train))
sigma_opt = 0.005
ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words, barycenters = ecoc(
    train, train_l, penalty=C_opt, 
    kernel=gaussian_kernel, list_sigma=[sigma_opt]*15)

Wall time: 12min 31s


In [74]:
final_labels = predict_ecoc(test, train, ecoc_labels, list_supp_ind, 
                 list_alpha, list_b, list_kernel, code_words, barycenters)
final_labels = final_labels.astype(np.int64)
score = sum([test_l[i] == final_labels[i] for i in range(len(final_labels))])/float(len(test_l))
score


Attention, data point could not be uniquely classified, index 53 possible classification [3, 5]
Attention, data point could not be uniquely classified, index 57 possible classification [0, 9]
Attention, data point could not be uniquely classified, index 103 possible classification [0, 3, 5]
Attention, data point could not be uniquely classified, index 120 possible classification [4, 7]
Attention, data point could not be uniquely classified, index 139 possible classification [3, 5, 9]
Attention, data point could not be uniquely classified, index 148 possible classification [0, 5]
Attention, data point could not be uniquely classified, index 241 possible classification [2, 4]
Attention, data point could not be uniquely classified, index 286 possible classification [5, 9]
Attention, data point could not be uniquely classified, index 334 possible classification [3, 5]
Attention, data point could not be uniquely classified, index 367 possible classification [0, 5, 6]


0.82999999999999996

### Train ECOC classifier

In [61]:
number_of_im = 200;
train_images_test = train_images[:number_of_im]
train_labels_test = train_labels[:number_of_im]
np.shape(train_images_test)[0]

200

In [51]:
# call classifier with pickle.load
ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, list_sigma, code_words, barycenters=ecoc(train_images_test, train_labels_test)
ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, list_sigma, code_words, barycenters=pickle.load(open ("trained_ecoc_"+str(number_of_im)+".dat", "rb"))



NameError: name 'train_images_test' is not defined

### Checking the Output of ECOC predict

In [63]:
# "unknown" data to be classified
number_of_test_data = 300
test_data = train_images[number_of_im:number_of_im+number_of_test_data]

# call predict function
predicted_labels_test=predict_ecoc(train_images[number_of_im:number_of_im+number_of_test_data], 
                                   train_images[:number_of_im], ecoc_labels, list_supp_ind, list_alpha, 
                                   list_b, list_kernel, code_words, barycenters)
predicted_labels_test = predicted_labels_test.astype(float)
# print("predicted labels: ", predicted_labels_test)

# get actual labels
train_labels_test=train_labels[number_of_im:number_of_im+number_of_test_data]
train_labels_test = train_labels_test.astype(float)
# print("actual labels: ", train_labels_test)

print("number of correctly classified labels: ", sum(predicted_labels_test.astype(int)==train_labels_test.astype(int)), 
      " out of ", number_of_test_data);

# check if there is are a couple of classifiers who make more mistakes than others
# then we could possibly change some of the parameters


300
Attention, data point could not be uniquely classified, index 45 possible classification [2, 8]
Attention, data point could not be uniquely classified, index 53 possible classification [0, 3, 6]
Attention, data point could not be uniquely classified, index 82 possible classification [1, 2]
Attention, data point could not be uniquely classified, index 93 possible classification [2, 7]
Attention, data point could not be uniquely classified, index 113 possible classification [1, 7]
Attention, data point could not be uniquely classified, index 120 possible classification [3, 9]
Attention, data point could not be uniquely classified, index 147 possible classification [0, 5]
Attention, data point could not be uniquely classified, index 153 possible classification [0, 3, 5, 6]
Attention, data point could not be uniquely classified, index 156 possible classification [5, 9]
Attention, data point could not be uniquely classified, index 169 possible classification [0, 5, 6]
Attention, data po