In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import NuSVC
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


################################################################
def read_train_and_test_data():
    train_data = convert_train_and_test_data_into_a_list("digitTrain.txt")
    test_data = convert_train_and_test_data_into_a_list("digitTest.txt")
    
    actual_numbers_train, X_train, Y_train = split_the_data_as_train_and_labels(train_data)
    actual_numbers_test, X_test, Y_test = split_the_data_as_train_and_labels(test_data)
    
    return [[actual_numbers_train, X_train, Y_train], [actual_numbers_test, X_test, Y_test]]

    
################################################################
def convert_train_and_test_data_into_a_list(directory):
    data_list = []
    with open(directory) as file:
        for every_line in file.readlines():
            data_list.append(list(map(float, every_line.rstrip('\n').split())))
    
    return np.array(data_list)



################################################################
'''
This function reads the train_data (X) and test_data (Y) to read the digits 1 and 5 only. Also creates a new labels
list with +1 (for 1) and -1 (for 5).

One-Versus-One

In this case, 1 Vs 5
'''
def get_the_data_with_labels_1_and_5_for_OvO(actual_numbers, train_data, labels, class1, class2):
    
    '''find the indices of label1 (1) and label2 (5) in test_data'''
    class1_indices_in_test_data = np.where(actual_numbers == class1)[0]
    class2_indices_in_test_data = np.where(actual_numbers == class2)[0]
    
    '''sort the indices of label1 and label2 and concatenate'''
    sorted_indices_of_input_classes = np.sort(np.concatenate((class1_indices_in_test_data, class2_indices_in_test_data)))
    
    '''Form new train and labels data by copying the elements at indices specified in sorted_indoces list'''
    new_class_data = np.take(actual_numbers, sorted_indices_of_input_classes)
    new_train_data = np.take(train_data, sorted_indices_of_input_classes)
    new_label_data = np.take(labels, sorted_indices_of_input_classes)
    
    '''In test data, change the corresponding values of 1 -> 1 and 5 -> (-1)'''
    np.put(new_label_data, np.where(new_class_data == 1)[0], [1])
    np.put(new_label_data, np.where(new_class_data == 5)[0], [-1])
    
    return new_train_data, new_label_data



################################################################
def split_the_data_as_train_and_labels(inputData):
    actual_numbers = inputData[:, 0] #'''First column in the dataset'''
    X_symmetry_data = inputData[:, 2:] #'''third column in the dataset, symmetry'''
    Y_intensity_data =inputData[:, 1] #'''second column in the dataset, intensity'''
    
    return actual_numbers, X_symmetry_data, Y_intensity_data




################################################################
'''
This function reads the actual training and test datasets and returns the new training and test datasets
corresponding to the numbers 1 and 5 to facilitate OneVsOne approach
'''
def SVM_OVO_1Vs5_linear_get_training_and_test_data():
    train_test_data = read_train_and_test_data()
    actual_numbers_train, X_train, Y_train = train_test_data[0]
    actual_numbers_test, X_test, Y_test = train_test_data[1]
    
    
    train_X, train_Y = get_the_data_with_labels_1_and_5_for_OvO(actual_numbers_train, X_train, Y_train, 1, 5)
    test_X, test_Y = get_the_data_with_labels_1_and_5_for_OvO(actual_numbers_test, X_test, Y_test, 1, 5)
    
    return [[train_X, train_Y], [test_X, test_Y]]




################################################################
'''
Depending upon the number of training samples to be considered, this function reads the traing samples
accordingly, set the "nu" value for the classifier and then calls the classifier function
'''
def SVM_OVO_1Vs5_on_limited_samples(no_of_samples, train_X, train_Y, test_X, test_Y):
    
    no_of_support_vectors, accuracy_score_ = 0.0, 0.0
    nu_in_NuSVC = 0.5
    
    if no_of_samples != 0:
        portion_of_train_X = train_X[:no_of_samples,]
        portion_of_train_Y = train_Y[:no_of_samples,]
        
        if no_of_samples == 50:
            nu_in_NuSVC = 0.1
        
        elif no_of_samples == 100:
            nu_in_NuSVC = 0.2
        
        elif no_of_samples == 200:
            nu_in_NuSVC = 0.3
            
        elif no_of_samples == 800:
            nu_in_NuSVC = 0.4
        
        
        
        no_of_support_vectors, accuracy_score_ = classify_SVM_OvO_linear(portion_of_train_X, portion_of_train_Y, test_X, test_Y, nu_in_NuSVC)
    
    else:
        no_of_support_vectors, accuracy_score_ = classify_SVM_OvO_linear(train_X, train_Y, test_X, test_Y, nu_in_NuSVC)
    
    return no_of_support_vectors, accuracy_score_



################################################################

def classify_SVM_OvO_linear(train_X, train_Y, test_X, test_Y, nu_in_NuSVC = 0.5):
    
    classifier = NuSVC(nu = nu_in_NuSVC, kernel = 'linear', decision_function_shape = 'ovo')
    _, _, number_of_support_vectors, accuracy_score_ = classifier_generic(classifier, train_X, train_Y, test_X, test_Y)    
    return number_of_support_vectors, accuracy_score_


#################################################################

def classify_SVM_Polynomial(train_X, train_Y, test_X, test_Y, Q, C_value):
    
    classifier = SVC(C = C_value, kernel = 'poly', degree = Q, gamma = 0.4, decision_function_shape = 'ovo')    
    training_error, test_error, no_of_support_vectors, accuracy_score_ = classifier_generic(classifier, train_X, train_Y, test_X, test_Y)    
    return training_error, test_error, no_of_support_vectors, accuracy_score_


#################################################################

def classify_SVM_RBF_kernel(train_X, train_Y, test_X, test_Y, C_value, gamma_value):
    
    classifier = SVC(C = C_value, kernel = 'rbf', gamma = gamma_value)
    training_error, test_error, _, _ = classifier_generic(classifier, train_X, train_Y, test_X, test_Y)    
    return training_error, test_error
    

#################################################################

def classifier_generic(classifier, train_X, train_Y, test_X, test_Y):
    classifier.fit(train_X.reshape(-1, 1), train_Y)
    Y_predicted = classifier.predict(test_X.reshape(-1, 1))
    
    
    training_error = classifier.score(train_X.reshape(-1, 1), train_Y)
    test_error = classifier.score(test_X.reshape(-1, 1), test_Y)
    no_of_support_vectors = np.shape(classifier.support_)
    accuracy_score_ = accuracy_score(test_Y, Y_predicted)
    
    return training_error, test_error, no_of_support_vectors, accuracy_score_




In [20]:
'''Question 4.a & 4.b'''

no_of_samples_list = [0, 50, 100, 200, 800]

train_test_data = SVM_OVO_1Vs5_linear_get_training_and_test_data()
train_X, train_Y = train_test_data[0]
test_X, test_Y = train_test_data[1]

for number in no_of_samples_list:
    no_of_support_vectors, accuracy_score_ = SVM_OVO_1Vs5_on_limited_samples(number, train_X, train_Y, test_X, test_Y)
    

samples - 0 no of SVs - (782,) accuracy - 0.9834905660377359
samples - 50 no of SVs - (6,) accuracy - 0.9811320754716981
samples - 100 no of SVs - (20,) accuracy - 0.9811320754716981
samples - 200 no of SVs - (60,) accuracy - 0.9811320754716981
samples - 800 no of SVs - (320,) accuracy - 0.9811320754716981


In [22]:
'''Question 4.c'''
Q = [2, 5]
for degree in Q:
    for exponent in range(0, 5):
        C_value = 1 * pow(10, -exponent)
        training_error, test_error, no_of_support_vectors, accuracy_score_ = classify_SVM_Polynomial(train_X, train_Y, test_X, test_Y, degree, C_value)


-------  Q is 2 C is 1
TrErr - 0.9955156950672646 TSErr - 0.9811320754716981 no_SVs - (24,) accuracy score - 0.9811320754716981
-------  Q is 2 C is 0.1
TrErr - 0.9955156950672646 TSErr - 0.9811320754716981 no_SVs - (30,) accuracy score - 0.9811320754716981
-------  Q is 2 C is 0.01
TrErr - 0.9955156950672646 TSErr - 0.9834905660377359 no_SVs - (64,) accuracy score - 0.9834905660377359
-------  Q is 2 C is 0.001
TrErr - 0.9916720051249199 TSErr - 0.9787735849056604 no_SVs - (190,) accuracy score - 0.9787735849056604
-------  Q is 2 C is 0.0001
TrErr - 0.9705317104420244 TSErr - 0.9551886792452831 no_SVs - (642,) accuracy score - 0.9551886792452831
-------  Q is 5 C is 1
TrErr - 0.9955156950672646 TSErr - 0.9834905660377359 no_SVs - (26,) accuracy score - 0.9834905660377359
-------  Q is 5 C is 0.1
TrErr - 0.9955156950672646 TSErr - 0.9834905660377359 no_SVs - (26,) accuracy score - 0.9834905660377359
-------  Q is 5 C is 0.01
TrErr - 0.9955156950672646 TSErr - 0.9834905660377359 no_SVs

In [24]:
'''Question 4.d'''
for i in range(-2, 8, 2):
    C_value = 1 * pow(10, i)
    gamma_value = 0
    if C_value >= 10:
        gamma_value = 0.01
    else:
        gamma_value = 10
    training_error, test_error = classify_SVM_RBF_kernel(train_X, train_Y, test_X, test_Y, C_value, gamma_value)
    

C - 0.01 TrErr - 0.9916720051249199 TsErr - 0.9551886792452831
C - 1 TrErr - 0.9955156950672646 TsErr - 0.9787735849056604
C - 100 TrErr - 0.9955156950672646 TsErr - 0.9787735849056604
C - 10000 TrErr - 0.9955156950672646 TsErr - 0.9811320754716981
C - 1000000 TrErr - 0.9955156950672646 TsErr - 0.9787735849056604
