In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import pandas as pd
from kernel_svm import *
from kernel_functions import *
from scoring import *

# Import Data

In [2]:
# train = pd.read_csv('../data/Xtr.csv', sep = ",", header = None)
# test = pd.read_csv('../data/Xte.csv', sep = ",", header = None)

trainOutput = pd.read_csv('../data/Ytr.csv', sep = ",")

train = pd.read_csv('../pickles/HOG_train.csv', sep = ",", header = None)
test = pd.read_csv('../pickles/HOG_test.csv', sep = ",", header = None)

In [3]:
train.drop(train.columns[len(train.columns)-1], axis=1, inplace=True)
test.drop(test.columns[len(test.columns)-1], axis=1, inplace=True)

In [4]:
combined = train.append(test)
combined.reset_index(inplace=True)
combined.drop('index',inplace=True,axis=1)

# Define multiclass SVM (combination of 1v1 SVMs)

In [5]:
class multiclassSVM:
    
    def __init__(self, K=None, kernel_fun=None, C=300, **kwargs):
        self.kernel_fun = kernel_fun
        self.parameters = kwargs
        self.C = C
        self.K = K
        self.models = {}
        self.classes = None

    def fit(self, X_train, y_train):
        
        classes = np.unique(y_train)
        self.classes = classes
        for l1 in range(len(classes)):
            for l2 in range(l1+1,len(classes)):
                X_train_local = X_train[(y_train == classes[l1]) + (y_train == classes[l2])]
                y_train_local = y_train[(y_train == classes[l1]) + (y_train == classes[l2])]
                y_train_local = 2*(y_train_local == classes[l1]) - 1
                self.models[classes[l1],classes[l2]] = KernelSVM(K=self.K, kernel_fun=self.kernel_fun, C=self.C)
                self.models[classes[l1],classes[l2]].fit(X_train_local, y_train_local, K_train_index = y_train_local.index)
        
        
    def predict(self, X_test, X_test_index = None):
        
        ensemble_predictions = np.empty([len(X_test), 0])
        for k in self.models.keys():
            local_prediction = self.models[k].predict(X_test, X_test_index)
            ensemble_predictions = np.hstack((ensemble_predictions, np.array([k[0]*(local_prediction == 1) + k[1]*(local_prediction == -1)]).T))
            
        sum_array = np.array([np.sum(ensemble_predictions == c, axis = 1) for c in self.classes]).T
        final_predictions = self.classes[np.argmax(sum_array, axis=1)]
            
        return final_predictions, ensemble_predictions, sum_array

# Import Gaussian Kernel

In [6]:
K_poly = pickle.load(open("../pickles/Gaussian_HOG_Kernel.pickle", 'rb'))

# Make predictions

## For validation score:

In [7]:
length = 2000

ttrain, tval, tsorter = get_train_val(train[:length])
ttrainOutput = trainOutput[:length][tsorter == 0]['Prediction']
ttest = tval

K = K_poly[:length].T[:length]

## For final result:

In [8]:
# ttrain = combined[:5000]
# ttest = combined[5000:]
# ttrainOutput = trainOutput['Prediction']

# K = K_poly

## Use multiclass SVM

In [9]:
mSVM = multiclassSVM(K = K)

In [10]:
mSVM.fit(ttrain, ttrainOutput)

  unsupported[op_str]))


     pcost       dcost       gap    pres   dres
 0:  1.5853e+06 -7.6751e+06  9e+06  4e-17  2e-13
 1:  4.4428e+05 -7.9437e+05  1e+06  2e-16  1e-13
 2:  6.6678e+04 -8.4512e+04  2e+05  2e-16  5e-14
 3:  8.9479e+03 -1.2519e+04  2e+04  2e-16  2e-14
 4:  9.8038e+02 -2.0517e+03  3e+03  2e-16  7e-15
 5: -1.5635e+01 -4.3177e+02  4e+02  2e-16  3e-15
 6: -1.0884e+02 -1.5394e+02  5e+01  2e-16  1e-15
 7: -1.1245e+02 -1.1433e+02  2e+00  2e-16  5e-16
 8: -1.1257e+02 -1.1264e+02  7e-02  2e-16  5e-16
 9: -1.1258e+02 -1.1259e+02  2e-03  2e-16  5e-16
10: -1.1258e+02 -1.1258e+02  4e-05  2e-16  5e-16
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0:  1.4608e+06 -6.1536e+06  8e+06  3e-17  2e-13
 1:  4.2475e+05 -8.7394e+05  1e+06  2e-16  1e-13
 2:  6.6640e+04 -9.2910e+04  2e+05  2e-16  6e-14
 3:  8.9907e+03 -1.2667e+04  2e+04  2e-16  2e-14
 4:  9.8219e+02 -2.0799e+03  3e+03  2e-16  8e-15
 5: -1.9045e+01 -4.3829e+02  4e+02  2e-16  3e-15
 6: -1.1242e+02 -1.5765e+02  5e+01  2e-16  1e-1

In [11]:
final_predictions, ensemble_predictions, sum_array = mSVM.predict(ttest, ttest.index)

In [12]:
print(len(final_predictions))
pd.Series(final_predictions).value_counts()

500


6    72
8    59
3    59
1    59
4    55
2    49
7    46
0    39
9    33
5    29
dtype: int64

## Get "validation" score

In [13]:
get_score(final_predictions, trainOutput[:length][tsorter == 1]['Prediction'])

0.496

## Dump to CSV

In [23]:
IdTest = np.array([i for i in range(1, 1 + len(test))])
output = [int(x) for x in final_predictions]
df_output = pd.DataFrame()
df_output['Id'] = IdTest
df_output['Prediction'] = output
df_output[['Id','Prediction']].to_csv('../predictions/test_multiclass_svm_HOG_gauss.csv', sep = ",", index=False)

ValueError: Length of values does not match length of index