# SVC and Logistic Regression with SIFT and SURF as features

In [1]:
# import bagoffeatures functions
from bagoffeatures import bag_of_features,detect_and_compute, stack_descriptors,create_vocabulary
# import external libraries
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np
import cv2

In [2]:
# Define paths
root = Path('./') # define root path
data_path = Path('data') # define data path
much_face_path = data_path.joinpath('much_face')
val_path = much_face_path.joinpath('val')
train_path = much_face_path.joinpath('train')
model_path = root.joinpath('models')

## SVC + SURF
SVC = SVM Type 1

In [7]:
# Set parameters:
featuretype = 'SURF'
vocabulary_size = 500

In [8]:
# Create (image,features) tuples and their class
descriptions, image_classes = detect_and_compute(train_path,featuretype)
# Stack descriptions into one large array
stacked_desc = stack_descriptors(descriptions,featuretype)
# Create vocabulary 
voc = create_vocabulary(stacked_desc,vocabulary_size)
# Create bag of features 
im_bof = bag_of_features(descriptions,voc,vocabulary_size)
# standardize 
stdSlr = StandardScaler().fit(im_bof)
im_bof = stdSlr.transform(im_bof)

In [9]:
# Define Grid Search Parameter for SVM
param_grid = [
  {'C': [1, 10, 100, 1000,10000], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
   , 'kernel': ['rbf']},
 ]

In [10]:
# initiate SVC model and fit 
svc = SVC()
svc_grid = GridSearchCV(svc,param_grid)
svc_grid.fit(im_bof, np.array(image_classes)) # fit bag of features

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000, 10000], 'kernel': ['rbf'], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# display best score and parameters 
print(svc_grid.best_params_)
print(svc_grid.best_score_)

{'C': 10, 'kernel': 'rbf', 'gamma': 0.0005}
0.8177159590043924


In [21]:
# store class_names
class_names = list(set(image_classes))
# save best model to disk 
svcbest = svc_grid.best_estimator_
joblib.dump((svcbest, class_names, stdSlr, vocabulary_size, voc), 'models/SVC_SURF_gs_20180309.pkl', compress = 3)

['models/SVC_SURF_gs_20180309.pkl']

### Performance on Validation Set

In [18]:
# create (image,features) tuples and their class
descriptions_val, image_classes_val = detect_and_compute(val_path,featuretype)
# create bag of features and standard scaler
im_bof_val = bag_of_features(descriptions_val,voc,vocabulary_size)
im_bof_val = stdSlr.transform(im_bof_val)

In [19]:
pred_val = svcbest.predict(im_bof_val)

In [20]:
# calculate accuracy 
accuracy_score(image_classes_val,list(pred_val))

0.8434504792332268

## SVC + SIFT

In [3]:
# Set parameters:
featuretype = 'SIFT'
vocabulary_size = 500

In [4]:
# Create (image,features) tuples and their class 
descriptions, image_classes = detect_and_compute(train_path,featuretype)
# Stack descriptions into one large array
stacked_desc = stack_descriptors(descriptions,featuretype)
# Create vocabulary 
voc = create_vocabulary(stacked_desc,vocabulary_size)
# Create bag of features 
im_bof = bag_of_features(descriptions,voc,vocabulary_size)
# standardize 
stdSlr = StandardScaler().fit(im_bof)
im_bof = stdSlr.transform(im_bof)

In [5]:
# Define Grid Search Parameter for SVM
param_grid = [
  {'C': [1, 10, 100, 1000,10000], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
   , 'kernel': ['rbf']},
 ]

In [6]:
# initiate SVC model and fit 
svc = SVC()
svc_grid = GridSearchCV(svc,param_grid)
svc_grid.fit(im_bof, np.array(image_classes)) # fit bag of features

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], 'C': [1, 10, 100, 1000, 10000], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# display best score and parameters 
print(svc_grid.best_params_)
print(svc_grid.best_score_)

{'gamma': 0.0005, 'C': 10, 'kernel': 'rbf'}
0.8118594436310396


In [8]:
# store class_names
class_names = list(set(image_classes))
# save best model to disk 
svcbest = svc_grid.best_estimator_
joblib.dump((svcbest, class_names, stdSlr, vocabulary_size, voc), 'models/SVC_SIFT_gs_20180310.pkl', compress = 3)

['models/SVC_SIFT_gs_20180310.pkl']

### Performance on Validation Set

In [9]:
# create (image,features) tuples and their class 
descriptions_val, image_classes_val = detect_and_compute(val_path,featuretype)
# create bag of features and standard scaler
im_bof_val = bag_of_features(descriptions_val,voc,vocabulary_size)
im_bof_val = stdSlr.transform(im_bof_val)

In [10]:
pred_val = svcbest.predict(im_bof_val)

In [11]:
# calculate accuracy 
accuracy_score(image_classes_val,list(pred_val))

0.8274760383386581

## Logistic Regression + SURF

In [57]:
# Set parameters:
featuretype = 'SURF'
vocabulary_size = 500

In [58]:
# Create (image,features) tuples and their class 
descriptions, image_classes = detect_and_compute(train_path,featuretype)
# Stack descriptions into one large array
stacked_desc = stack_descriptors(descriptions,featuretype)
# Create vocabulary 
voc = create_vocabulary(stacked_desc,vocabulary_size)
# Create bag of features 
im_bof = bag_of_features(descriptions,voc,vocabulary_size)
# standardize 
stdSlr = StandardScaler().fit(im_bof)
im_bof = stdSlr.transform(im_bof)

In [52]:
# Define Grid Search Parameter 
param_grid = [
  {'C': [ 0.1, 1, 10, 100, 1000]},
 ]

In [74]:
# initiate logistic regression model and fit 
logreg = LogisticRegression()
#logreg.fit(im_bof, np.array(image_classes))
# initiate SVC model and fit 
logreg_grid = GridSearchCV(svc,param_grid)
logreg_grid.fit(im_bof, np.array(image_classes)) # fit bag of features

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.1, 1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [75]:
# display best score and parameters 
print(logreg_grid.best_params_)
print(logreg_grid.best_score_)

{'C': 10}
0.773792093704246


In [64]:
# train on full dataset
logreg = LogisticRegression(C=10)
logreg.fit(im_bof, np.array(image_classes))

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
# store class_names
class_names = list(set(image_classes))
# save best model to disk 
logregbest = logreg_grid.best_estimator_
joblib.dump((logregbest, class_names, stdSlr, vocabulary_size, voc), 'models/LOGREG_SURF_gs_20180310.pkl', compress = 3)

['models/LOGREG_SURF_gs_20180310.pkl']

### Performance on Validation Set

In [81]:
# create (image,features) tuples and their class (e.g. 004)
descriptions_val, image_classes_val = detect_and_compute(val_path,featuretype)
# create bag of features and standard scaler
im_bof_val = bag_of_features(descriptions_val,voc,vocabulary_size)
im_bof_val = stdSlr.transform(im_bof_val)

In [82]:
pred_val = logregbest.predict(im_bof_val)

In [83]:
# calculate accuracy 
accuracy_score(image_classes_val,list(pred_val))

0.8338658146964856

## Logistic Regression + SIFT 

In [84]:
# Set parameters:
featuretype = 'SIFT'
vocabulary_size = 500

In [85]:
# Create (image,features) tuples and their class (e.g. 004)
descriptions, image_classes = detect_and_compute(train_path,featuretype)
# Stack descriptions into one large array
stacked_desc = stack_descriptors(descriptions,featuretype)
# Create vocabulary 
voc = create_vocabulary(stacked_desc,vocabulary_size)
# Create bag of features 
im_bof = bag_of_features(descriptions,voc,vocabulary_size)
# standardize 
stdSlr = StandardScaler().fit(im_bof)
im_bof = stdSlr.transform(im_bof)

In [86]:
# Define Grid Search Parameter 
param_grid = [
  {'C': [ 0.1, 1, 10, 100, 1000]},
 ]

In [87]:
# initiate logistic regression model and fit 
logreg = LogisticRegression()
#logreg.fit(im_bof, np.array(image_classes))
# initiate SVC model and fit 
logreg_grid = GridSearchCV(svc,param_grid)
logreg_grid.fit(im_bof, np.array(image_classes)) # fit bag of features

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.1, 1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [88]:
# display best score and parameters 
print(logreg_grid.best_params_)
print(logreg_grid.best_score_)

{'C': 10}
0.7313323572474377


In [89]:
# train on full dataset
logreg = LogisticRegression(C=10)
logreg.fit(im_bof, np.array(image_classes))

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
# store class_names
class_names = list(set(image_classes))
# save best model to disk 
logregbest = logreg_grid.best_estimator_
joblib.dump((logregbest, class_names, stdSlr, vocabulary_size, voc), 'models/LOGREG_SIFT_gs_20180310.pkl', compress = 3)

['models/LOGREG_SIFT_gs_20180310.pkl']

### Performance on Validation Set

In [91]:
# create (image,features) tuples and their class (e.g. 004)
descriptions_val, image_classes_val = detect_and_compute(val_path,featuretype)
# create bag of features and standard scaler
im_bof_val = bag_of_features(descriptions_val,voc,vocabulary_size)
im_bof_val = stdSlr.transform(im_bof_val)

In [92]:
pred_val = logregbest.predict(im_bof_val)

In [93]:
# calculate accuracy 
accuracy_score(image_classes_val,list(pred_val))

0.7827476038338658