In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVC
from typing import List, Tuple
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data.csv')
data.head(5)

Unnamed: 0,weight,height,closing,open_00_02,open_02_04,open_04_06,open_06_08,open_08_10,open_10_12,open_12_14,...,cuisine_y_Regional,cuisine_y_Seafood,cuisine_y_Vietnamese,parking_lot_none,parking_lot_public,parking_lot_valet parking,parking_lot_yes,smoking,age,target
0,69,1.77,12,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,32,2
1,69,1.77,24,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,32,0
2,69,1.77,20,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,32,1
3,69,1.77,23,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,32,2
4,69,1.77,23,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,32,2


In [3]:
def split_data(data_x: np.ndarray, data_y: np.ndarray):
    
    ordering = np.arange(data_x.shape[0])
    np.random.shuffle(ordering)
    data_x = data_x[ordering]
    data_y = data_y[ordering]
    
    valid_start = int(len(data_x) * 0.7)
    test_start = int(len(data_x) * 0.9)
    
    train_set = (data_x[:valid_start], data_y[:valid_start])
    valid_set = (data_x[valid_start:test_start], data_y[valid_start:test_start])
    test_set = (data_x[test_start:], data_y[test_start:])
    
    return train_set, valid_set, test_set

In [4]:
y = np.array(data['target'])
x = np.array(data.drop(['target'], axis=1))

train_set, valid_set, test_set = split_data(x,y)

x_train = train_set[0]
y_train = train_set[1]

x_val = valid_set[0]
y_val = valid_set[1]

x_test = test_set[0]
y_test = test_set[1]

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(2055, 134)
(587, 134)
(294, 134)


In [5]:
x_train = np.concatenate( (x_train, x_val),axis= 0)
y_train = np.concatenate( (y_train, y_val),axis= 0)

# Grid search

In [6]:
from typing import List, Tuple

def cross_validation(X: np.ndarray, y: np.ndarray, k: int) -> List[Tuple[np.ndarray, np.ndarray, 
                                                                         np.ndarray, np.ndarray]]:  
    n = int(len(X)/k )+1   # number of elements in each group
    subsets_X = []
    subsets_y = []
    
    for i in range(k-1):
        subsets_X.append(X[i*n: (i+1)*n])
        subsets_y.append(y[i*n: (i+1)*n])
    
    subsets_X.append(X [(k-1)*n:])
    subsets_y.append(y [(k-1)*n:])
    tuples_to_return = []
    
    for i in range(k):
        indexes = np.arange(k)
        train_indexes = (np.delete(indexes, i)).astype(int)
        train_data =  np.vstack([subsets_X[j] for j in train_indexes ])
        train_labels =  np.hstack([ subsets_y[j] for j in train_indexes] )     
        test_data = np.array(subsets_X[i])
        test_labels = np.array(subsets_y[i])                
        tuples_to_return.append( (train_data, train_labels, test_data, test_labels)  )
        
    return tuples_to_return

In [7]:
def find_best_params_Poly(datasets):
    
    C_list = list(map(lambda x : 10**(x), [-3,-2,-1,0,1,2]))
    degree_list = list(map(lambda x : x, [1,2]))
    coef_list = list(map(lambda x : x, [-10, -1, 0, 0.1, 1, 10]))
    gamma_list = list(map(lambda x : 10**(-x), [0,1,2,3]))

    scores = []
    
    for C in C_list:
        for gamma in gamma_list:
            for coef in coef_list:
                for degree in degree_list:
                    
                    val_accuracy = []
                    for dataset in datasets:
                        svm = SVC(kernel = 'poly', C = C, degree = degree)
                        svm.fit(dataset[0], dataset[1])
                        val_acc = svm.score(dataset[2], dataset[3])
                        val_accuracy.append(val_acc)

                    scores.append( (C, gamma, np.mean(val_accuracy) ) )
                    print("C = ",C, "degree:", degree, "coef ", coef, "gamma ", gamma, "accuracy:", np.mean(val_accuracy), len(val_accuracy))
       
    argmax= np.argmax(scores, axis = 0)[2]
    print("\n\nPoly: accuracy: {:} for C = {:}, gamma  = {:}".format( scores[argmax][2],scores[argmax][0],scores[argmax][1])  )
    

In [8]:
def find_best_params_rbf(datasets):
    
    C_list = list(map(lambda x : 10**(x), [-3,-2,-1,0,1,2]))
    gamma_list = list(map(lambda x : 10**(-x), [0,1,2,3]))
    coef_list = list(map(lambda x : x, [-10, -1, 0, 0.1, 1, 10]))

    scores = []
    
    for coef in coef_list:
        for C in C_list:
            for gamma in gamma_list:
                val_accuracy = []

                for dataset in datasets:
                    svm = SVC(kernel = 'rbf', C = C, gamma=gamma, coef0 = coef)
                    svm.fit(dataset[0], dataset[1])
                    val_acc = svm.score(dataset[2], dataset[3])
                    val_accuracy.append(val_acc)

                scores.append( (C, gamma, coef, np.mean(val_accuracy) ) )
                #print("C = ", C, "gamma", gamma, "mean accuracy:", np.mean(val_accuracy), len(val_accuracy))

    argmax= np.argmax(scores, axis = 0)[3]
    print("rbf: accuracy: {:} for C = {:}, gamma  = {:}, coef = {:}".format( scores[argmax][3],scores[argmax][0],scores[argmax][1], scores[argmax][2])  )
 

In [9]:
def find_best_params_linear(datasets):

    C_list = list(map(lambda x : 10**(x), [-3,-2,-1,0,1,2,3]))
    gamma_list = list(map(lambda x : 10**(-x), [0,1,2,3]))
    coef_list = list(map(lambda x : x, [-10, -1, 0, 0.1, 1, 10]))
    scores = []
    
    for coef in coef_list:
        for C in C_list:  
            for gamma in gamma_list:

                val_accuracy = []
                for dataset in datasets:
                    svm = SVC(kernel = 'linear', C = C, gamma=gamma,  coef0 = coef)
                    svm.fit(dataset[0], dataset[1])
                    val_acc = svm.score(dataset[2], dataset[3])
                    val_accuracy.append(val_acc)

                scores.append( (C,gamma,coef,np.mean(val_accuracy) ) )
                #print("C = ", C, "gamma", gamma, "mean accuracy:", np.mean(val_accuracy), len(val_accuracy))
    
    argmax= np.argmax(scores, axis = 0)[3]
    print("linear: accuracy: {:} for C = {:}, gamma= {:} coef = {:}".format( scores[argmax][3],scores[argmax][0], scores[argmax][1],  scores[argmax][2],   )  )

In [10]:
def find_best_params_sigmoid(datasets):
       
    C_list = list(map(lambda x : 10**(x), [-6,-5,-4,-3,-2,-1,0,1,2]))
    scores = []
    
    for C in C_list:
        val_accuracy = []
        for dataset in datasets:
            svm = SVC(kernel ='sigmoid', C = C)
            svm.fit(dataset[0], dataset[1])
            val_acc = svm.score(dataset[2], dataset[3])
            val_accuracy.append(val_acc)

            scores.append( (C, np.mean(val_accuracy) ) )
            #print("C = ", C, "mean accuracy:", np.mean(val_accuracy), "len", len(val_accuracy))

    argmax= np.argmax(scores, axis = 0)[1]
    print("sigmoid: accuracy: {:} for C = {:}".format( scores[argmax][1], scores[argmax][0],   )  )

# Grid search using different kernels:

In [11]:
datasets = cross_validation(x_train, y_train, k= 3)
#for dataset in datasets:       
#    datasets_val = cross_validation(dataset[0], dataset[1], k= 3)
#    find_best_params_sigmoid(datasets_val)

In [12]:
#for dataset in datasets:       
#    datasets_val = cross_validation(dataset[0], dataset[1], k= 3)
#    find_best_params_Poly(datasets_val)

In [13]:
#for dataset in datasets:       
#    datasets_val = cross_validation(dataset[0], dataset[1], k= 3)
#    find_best_params_rbf(datasets_val) 

In [14]:
#for dataset in datasets:       
#    datasets_val = cross_validation(dataset[0], dataset[1], k= 3)
#    find_best_params_linear(datasets_val)

## Evaluate the model with best params:

In [15]:
model = SVC(kernel ='linear', C = 100,  gamma =  0.01)
model.fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
print("Linear kernel, accuracy on test set = {:}".format(accuracy))

Linear kernel, accuracy on test set = 0.6598639455782312


In [16]:
model = SVC(kernel ='rbf', C = 10, gamma =  0.1, coef0 = -10)
model.fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
print("RBF kernel, accuracy on test set = {:}".format(accuracy))  ## updated

RBF kernel, accuracy on test set = 0.9523809523809523


In [17]:
model = SVC(kernel ='poly', C = 100, degree  = 2, gamma = 1, coef0 = -1)
model.fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
print("poly kernel, accuracy on test set = {:}".format(accuracy)) 

poly kernel, accuracy on test set = 0.8231292517006803


In [18]:
model = SVC(kernel ='sigmoid', C = 10)
model.fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
print("Sigmoid kernel, accuracy on test set = {:}".format(accuracy))

Sigmoid kernel, accuracy on test set = 0.48639455782312924


# Best model: final evaluation

In [19]:
model = SVC(kernel ='rbf', C = 10, gamma =  0.1, coef0 = -10)
model.fit(x_train, y_train)


output = model.predict(x_train)
target = y_train
accuracy = sum(output == target)/ len(target)
print("\nAccuracy on train set: {:.2f} %".format(accuracy*100))

output = model.predict(x_val)
target = y_val
accuracy = sum(output == target)/ len(target)
print("\nAccuracy on validation set: {:.2f} %".format(accuracy*100))


output = model.predict(x_test)
target = y_test
accuracy = sum(output == target)/ len(target)
print("\nAccuracy on testing set: {:.2f} %".format(accuracy*100))


Accuracy on train set: 99.70 %

Accuracy on validation set: 99.66 %

Accuracy on testing set: 95.24 %
