# SVM Parameter Tuning

### Setting up variables, libraries and data

In [2]:
import pandas as pd
import numpy as np

## Data path handlers
# All paths are relative, so they resolve against the directory the notebook is run from.
# Folder holding the pre-split X_train / y_train / X_test / y_test CSV files read below.
TRAIN_TEST_SET_PATH = 'dataset/'
# NOTE(review): META_DATA_PATH, AUG_SET_PATH and IMAGES_PATH are not referenced by any
# cell visible in this notebook -- presumably kept for parity with sibling notebooks.
META_DATA_PATH = '../../Data/meta/'
AUG_SET_PATH = '../../Data/aug_data_all/'
# Folder where the best hyperparameters (CSV) and the fitted model (pickle) are written.
MODEL_PATH = 'models/'
IMAGES_PATH = '../../Images/'

## Selecting the desired features
# Substrings matched against column names to pick the feature columns:
# parameters[0] selects the columns divided by 30, parameters[1] those divided by 3.
parameters = ['fz','mz']

In [3]:
### Getting data
def _load_split(filename):
    """Read one CSV from TRAIN_TEST_SET_PATH, using its first column as the index."""
    return pd.read_csv(TRAIN_TEST_SET_PATH+filename, index_col=0)

# Files are read in the same order as before: train features/labels, then test features/labels.
X_train, y_train = _load_split('X_train.csv'), _load_split('y_train.csv')
X_test, y_test = _load_split('X_test.csv'), _load_split('y_test.csv')

In [4]:
### Scaling the data
# Each feature group is divided by a fixed constant (this is constant scaling, not
# mean/variance standardization).
# NOTE(review): 30 and 3 are presumably the nominal ranges of the two signals -- TODO confirm.
# (column-name substring, divisor) pairs, applied in this order.
FEATURE_SCALES = [(parameters[0], 30), (parameters[1], 3)]


def scale_features(features, scales=FEATURE_SCALES):
    """Return a new frame holding only the selected feature columns, each scaled.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose column names are matched against the substrings in `scales`.
    scales : sequence of (str, number)
        For every pair, the columns whose name contains the substring are selected
        and divided by the number.

    Returns
    -------
    pandas.DataFrame
        The scaled column groups concatenated side by side, in the order of `scales`.
        Columns matching none of the substrings are dropped.
    """
    scaled_groups = [
        features.iloc[:, features.columns.str.contains(substring)] / divisor
        for substring, divisor in scales
    ]
    return pd.concat(scaled_groups, axis=1)


# WARNING: this overwrites X_train / X_test, so the cell must run exactly once after
# the data is loaded; running it a second time would divide the features again.
X_train = scale_features(X_train)
X_test = scale_features(X_test)

In [5]:
### Creating the validation set
from sklearn.model_selection import StratifiedShuffleSplit

# Temporarily attach the targets so that rows and labels are partitioned together.
X_train['labels'] = y_train.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=27)
# n_splits=1, so the splitter yields exactly one (train, validation) pair of positions.
train, val = next(split.split(X_train, X_train['labels']))
X_train_vl = X_train.iloc[train].copy()
X_val = X_train.iloc[val].copy()

# Targets of each partition as NumPy arrays.
y_train_vl = np.array(X_train_vl['labels'].copy())
y_val = np.array(X_val['labels'].copy())


def _without_label_columns(frame):
    """Return `frame` without any column whose name contains 'labels'."""
    return frame.iloc[:, ~frame.columns.str.contains('labels')]


# Strip the helper column again; the full train/test feature sets become arrays.
X_train_vl = _without_label_columns(X_train_vl)
X_val = _without_label_columns(X_val)
X_train = np.array(_without_label_columns(X_train))
X_test = np.array(X_test)

y_test = np.array(y_test)
y_train = np.array(y_train)

### Setting up the SVM model

In [6]:
### Model sketch
def build_model(C=1, gamma=0.01, kernel='rbf'):
    """Create an unfitted support-vector classifier.

    Parameters
    ----------
    C : float, default 1
        Regularization parameter forwarded to ``SVC``.
    gamma : float or str, default 0.01
        Kernel coefficient forwarded to ``SVC``.
    kernel : str, default 'rbf'
        Kernel type forwarded to ``SVC``.

    Returns
    -------
    sklearn.svm.SVC
        Classifier configured with the given hyperparameters, with probability
        estimates and verbose LibSVM output switched on.
    """
    from sklearn.svm import SVC
    # Forward the arguments instead of hard-coding them, so callers can actually
    # choose the hyperparameters; the defaults reproduce the previous fixed model.
    model = SVC(C=C, kernel=kernel, probability=True, gamma=gamma, verbose=True)
    return model

# Base estimator handed to the grid search below, which sets C, gamma and kernel
# for every candidate it evaluates.
svm_classifier = build_model()

### Hyperparameter optimization

In [7]:
### Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 7 values of C x 7 values of gamma x 4 kernels = 196 candidates.
# NOTE(review): gamma does not affect the 'linear' kernel, so the linear candidates
# are repeated once per gamma value.
params = dict(
    C=[0.1, 0.5, 1, 5, 10, 50, 100],
    gamma=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    kernel=['rbf', 'poly', 'sigmoid', 'linear'],
)

# Despite the name, rnd_search is a grid search (3-fold CV, all cores), not a randomized one.
rnd_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=3,
)

rnd_search.fit(X_train_vl, y_train_vl)



Fitting 3 folds for each of 196 candidates, totalling 588 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 588 out of 588 | elapsed:  2.6min finished


[LibSVM]

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3, gamma=0.01,
                           kernel='rbf', max_iter=-1, probability=True,
                           random_state=None, shrinking=True, tol=0.001,
                           verbose=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 1, 5, 10, 50, 100],
                         'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [8]:
### Best validation score
# Mean 3-fold cross-validated score of the best parameter combination, computed on
# X_train_vl only. No `scoring` was passed, so the estimator's default score is used.
# NOTE(review): the held-out X_val / y_val split is not consulted here or in any
# other visible cell.
rnd_search.best_score_

0.8388338833883388

In [10]:
### Best set of hyperparameters
# Winning combination found by the search.
best_params = rnd_search.best_params_

# Persist it as a one-row table (row label 'values') in the model folder.
best_parameters = pd.DataFrame(best_params, index=['values'])
best_parameters.to_csv(MODEL_PATH+'svm_original_data.csv')

best_params

{'C': 0.5, 'gamma': 0.05, 'kernel': 'rbf'}

### Training the reference model

In [20]:
### Selecting and retraining the best model
# `best` is the very estimator object held by the search (not a copy), so the fit
# below retrains it in place -- rnd_search.best_estimator_ changes with it.
best = rnd_search.best_estimator_
# Retrain on the full training set (X_train still contains the rows that were split
# off as validation). y_train was built from a DataFrame, so it is 2-D; ravel()
# flattens it to the 1-D label vector that fit expects.
best.fit(X_train, y_train.ravel())

[LibSVM]

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=True)

In [22]:
### Model evaluation in accuracy terms
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# joblib must be imported as its own package: `sklearn.externals.joblib` was
# deprecated in scikit-learn 0.21 and removed in 0.23, where the old import fails.
import joblib

# Predict on the held-out test set with the retrained best model.
y_predict = best.predict(X_test)

cm = confusion_matrix(y_test, y_predict)
print('\n\nConfusion Matrix: \n', cm)
acc = accuracy_score(y_test, y_predict)
print('Accuracy: ', acc)
# NOTE(review): in the recorded run the middle column of the confusion matrix is all
# zeros, i.e. the second class is never predicted; accuracy alone hides this, so
# consider reporting per-class metrics as well.

#Saving model
# Last expression of the cell: joblib.dump returns the list of written file names.
joblib.dump(best,MODEL_PATH+"svm_original_data.pkl")



Confusion Matrix: 
 [[60  0  1]
 [12  0  0]
 [ 4  0 19]]
Accuracy:  0.8229166666666666


['models/svm_original_data.pkl']