In [145]:
import numpy as np
import scipy

# to save data
import pickle

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import pairwise_kernels
from scipy.optimize import minimize
from scipy.spatial.distance import hamming

import matplotlib.pyplot as plt
%matplotlib inline

In [146]:
train = pd.read_csv('Data/train.csv')

In [147]:
images = ["%s%s" %("pixel",pixel_no) for pixel_no in range(0,28**2)]
train_images = np.array(train[images], dtype=np.float)/100
#dimensionserhoehung, damit am ende b = 0
train_images = np.concatenate((train_images, np.ones((len(train_images),1))), axis = 1)

train_labels = np.array(train['label'])
label0 = np.array([1 if i==0 else -1 for i in train_labels])
len(train_images)

42000

In [148]:
from mySVM_class import *


In [149]:
def ecoc(labeled_data, labels, kernel=scalar_product, penalty=1, list_sigma=[0.1]*15):
    # 
    labels=labels.astype(int);
    l=np.shape(labeled_data)[0];
    num_classifiers=15;
    ecoc_labels=np.zeros((l,15));
    
    # define code_word matrix, the ith row corresponds to the number i
    # each column corresponds to a classifier that will have to be trained
    code_words=np.array([
        [ 1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1],
        [-1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1],
        [ 1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1, -1,  1],
        [-1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1],
        [ 1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1],
        [-1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1, -1,  1],
        [ 1, -1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1],
        [-1, -1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1],
        [ 1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1],
        [-1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,  1]]);
    
    # up until now training data has labels from 0 to 9
    # now these are replaced by the 15 digit string given by code_words
    for j in range(l):
        ecoc_labels[j]=code_words[labels[j]];
    
    list_supp_ind = [];
    list_alpha =[];
    list_b =[];
    list_kernel=[];
    
    # class an svm object for each classifier
    # here would be the possibility to parallelize
    for classifier in range(15):
        svm=mySVM(kernel=kernel, penalty=penalty, sigma=list_sigma[classifier]);
        svm.fit(labeled_data, ecoc_labels[:,classifier]);
        list_supp_ind.append(svm.supp_indices);
        list_alpha.append(svm.alpha);
        list_b.append(svm.b);
        list_kernel.append(svm.kernel);
        
    # pickle dump to save and call saved objects    
    pickle.dump((ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words), open( "trained_ecoc_"+str(number_of_im)+".dat", "wb" ))
    
    # now I need to call a binary classifier for each column of ecoc_labels
    # from decision functions we get seperating hyperplanes, margin, ... 
    # return those
    
    
    return ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words;

In [150]:
# suppose we have an unlabeled data point
def predict_ecoc(unlabeled_data, labeled_data, ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words):
    # every row is one data point
    # number of rows = # of data points
    l=np.shape(unlabeled_data)[0];
    print(l)
    new_labels=np.zeros((l,15));
    
    temp_label_ind=[];
    final_labels = np.array(['NaN']*l);
    
    for classifier in range(15):
        a_supp = list_alpha[classifier][list_supp_ind[classifier]];
        ecoc_labels_supp = ecoc_labels[list_supp_ind[classifier],classifier]
        a_times_labels=np.multiply(a_supp, ecoc_labels_supp)
        
        for i in range(l):
            # i_th row of kernel matrix k
            k=np.array([list_kernel[classifier](unlabeled_data[i],y) for y in labeled_data[list_supp_ind[classifier]]])
            
            # list of lists with 15 entries, one per classifier
            new_labels[i][classifier]=np.sign(np.dot(a_times_labels,k)+list_b[classifier]);
    
        
    for i in range(l):
        ham_dist = [hamming(new_labels[i], code_words[j]) for j in range(10)]
        temp_label_ind = [j for j in range(len(ham_dist)) if ham_dist[j] == min(ham_dist)]
        if len(temp_label_ind)!=1:
            print("Attention, data point could not be uniquely classified, index " 
                  + str(i) + "possible classification" + str(temp_label_ind));
        else:
            final_labels[i] = ham_dist.index(min(ham_dist));
        
   
    return final_labels;
    

### Train ECOC classifier

In [151]:
number_of_im = 200;
train_images_test = train_images[:number_of_im]
train_labels_test = train_labels[:number_of_im]
np.shape(train_images_test)[0]

20

In [152]:
# call classifier with pickle.load
ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words=ecoc(train_images_test, train_labels_test)
ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words=pickle.load(open ("trained_ecoc_"+str(number_of_im)+".dat", "rb"))



### Checking the Output of ECOC predict

In [153]:
# "unknown" data to be classified
number_of_test_data = 50
test_data = train_images[number_of_im:number_of_im+number_of_test_data]

# call predict function
predicted_labels_test=predict_ecoc(train_images[number_of_im:number_of_im+20], train_images[:number_of_im], ecoc_labels, list_supp_ind, list_alpha, list_b, list_kernel, code_words)
predicted_labels_test = predicted_labels_test.astype(int)
# print("predicted labels: ", predicted_labels_test)

# get actual labels
train_labels_test=train_labels[number_of_im:number_of_im+20]
train_labels_test = train_labels_test.astype(int)
# print("actual labels: ", train_labels_test)

print("number of correctly classified labels: ", sum(predicted_labels_test.astype(int)==train_labels_test.astype(int)), 
      " out of ", number_of_test_data);

# check if there is are a couple of classifiers who make more mistakes than others
# then we could possibly change some of the parameters


20
Attention, data point could not be uniquely classified, index 12possible classification[0, 5]


ValueError: invalid literal for int() with base 10: 'NaN'