# Image Classification using `sklearn.svm`

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook
from sklearn import svm, metrics, datasets
from sklearn.metrics import auc, roc_curve
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split
import cv2
from sklearn.cluster import KMeans

from skimage.io import imread
from skimage.transform import resize

In [None]:
class SiftDetector():
    """Build and hold an OpenCV SIFT detector.

    ``__init__`` stores two attributes: ``detector`` (the object returned by
    ``cv2.SIFT_create``) and ``norm`` (the value passed in; nothing in this
    class reads it).
    """

    # Settings used for every key the caller does not supply.
    DEFAULT_PARAMS = {
        "n_features": 0,
        "n_octave_layers": 3,
        "contrast_threshold": 0.03,
        "edge_threshold": 10,
        "sigma": 1.6,
    }

    def __init__(self, norm="L2", params=None):
        self.detector = self.get_detector(params)
        self.norm = norm

    @classmethod
    def _resolve_params(cls, params):
        """Return a new dict: the defaults overlaid with the entries of ``params``."""
        resolved = dict(cls.DEFAULT_PARAMS)
        if params is not None:
            resolved.update(params)
        return resolved

    def get_detector(self, params):
        """Create a SIFT detector from ``params``.

        Parameters
        ----------
        params : dict or None
            May contain any of the keys in ``DEFAULT_PARAMS``. ``None`` or
            missing keys fall back to the defaults, so a partial dict no
            longer raises ``KeyError``.

        Returns
        -------
        The detector object created by ``cv2.SIFT_create``.
        """
        settings = self._resolve_params(params)
        return cv2.SIFT_create(
            nfeatures=settings["n_features"],
            nOctaveLayers=settings["n_octave_layers"],
            contrastThreshold=settings["contrast_threshold"],
            edgeThreshold=settings["edge_threshold"],
            sigma=settings["sigma"],
        )
    


In [None]:
def preprocessing(img_path, sift_detector=None):
    """Detect SIFT keypoints and descriptors in the image at ``img_path``.

    Parameters
    ----------
    img_path : str
        Path handed to ``cv2.imread``.
    sift_detector : SiftDetector, optional
        Wrapper whose ``detector`` attribute performs the detection. A
        default-configured ``SiftDetector`` is created when omitted.

    Returns
    -------
    tuple
        ``(keypoints, descriptors)`` as returned by ``detectAndCompute``.

    Raises
    ------
    IOError
        If ``cv2.imread`` returns ``None`` for ``img_path``.
    """
    sift = SiftDetector() if sift_detector is None else sift_detector

    image = cv2.imread(img_path)
    if image is None:
        # Fail here with the path in the message instead of letting cvtColor
        # raise an opaque error on a None input.
        raise IOError("cv2.imread could not read image: {}".format(img_path))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Use the detector the wrapper already holds, so custom parameters passed
    # to SiftDetector are honoured instead of being replaced by defaults.
    kp, dp = sift.detector.detectAndCompute(gray, None)
    return kp, dp

In [None]:
def calculate_clustered_features(dp):
    """Summarise each row of ``dp`` with five statistics taken along axis 1.

    Parameters
    ----------
    dp : numpy.ndarray
        Array whose rows are summarised (the callers in this notebook pass
        KMeans cluster centres).

    Returns
    -------
    numpy.ndarray
        For 2-D input, one row per input row with the columns
        (mean, minimum, maximum, skewness, kurtosis).
    """
    # Imported here so the function works even when the cell that imports
    # scipy.stats at module level has not been executed yet.
    from scipy.stats import kurtosis, skew

    skewness = skew(dp, axis=1, bias=True).reshape(-1, 1)
    kurt = kurtosis(dp, axis=1, bias=True).reshape(-1, 1)
    minimum = np.amin(dp, axis=1).reshape(-1, 1)
    maximum = np.amax(dp, axis=1).reshape(-1, 1)
    mean = dp.mean(axis=1).reshape(-1, 1)
    return np.concatenate((mean, minimum, maximum, skewness, kurt), axis=1)

### Load images from a structured directory, like a scikit-learn sample dataset

In [None]:
def load_image_files(container_path, dimension=(64, 64), n_clusters=7):
    """
    Build a scikit-learn style dataset from per-category subfolders.

    Every file that does not end in ``kmeans=800.txt`` is passed through
    ``preprocessing``. Its SIFT descriptors are clustered with KMeans and the
    cluster centres are summarised by ``calculate_clustered_features``.

    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category
    dimension : tuple
        Kept for backward compatibility; the current feature pipeline does
        not use it.
    n_clusters : int
        Number of KMeans clusters fitted per image (default 7, the value that
        was previously hard-coded).

    Returns
    -------
    Bunch
        ``data`` (flattened features), ``target`` (category index),
        ``target_names`` (subfolder names), ``images`` (unflattened features)
        and ``DESCR``.
    """
    import warnings

    image_dir = Path(container_path)
    # iterdir() order depends on the filesystem; sorting keeps the mapping
    # from label index to category name stable between runs and machines.
    folders = sorted(d for d in image_dir.iterdir() if d.is_dir())
    categories = [fo.name for fo in folders]

    descr = "A image classification dataset"
    images = []
    flat_data = []
    target = []
    for label, direc in enumerate(folders):
        for file in sorted(direc.iterdir()):
            if str(file).endswith("kmeans=800.txt"):
                continue
            _, descriptors = preprocessing(str(file))
            # KMeans cannot be fitted without descriptors or with fewer
            # samples than clusters; skip such images instead of crashing.
            if descriptors is None or len(descriptors) < n_clusters:
                warnings.warn(
                    "Skipping {}: fewer than {} SIFT descriptors".format(
                        file, n_clusters))
                continue
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans.fit(descriptors)
            value = calculate_clustered_features(kmeans.cluster_centers_)
            flat_data.append(value.flatten())
            images.append(value)
            target.append(label)

    return Bunch(data=np.array(flat_data),
                 target=np.array(target),
                 target_names=categories,
                 images=np.array(images),
                 DESCR=descr)

In [None]:
## load data from numpy format binary file
def load_data(file):
    """Return the contents of ``file`` as read by ``np.load``."""
    return np.load(file)

In [None]:
from scipy.stats import skew
from scipy.stats import kurtosis
import numpy as np


# kp,dp = preprocessing('dataset/train/_transform/17963/0adda285d98860cb.jpg')

### input : dp - desc
def calculate_clustered_features(dp):
    """Describe each row of ``dp`` by five statistics computed along axis 1.

    For 2-D input the result has one row per input row and the columns
    (mean, minimum, maximum, skewness, kurtosis), in that order.
    """
    row_stats = (
        dp.mean(axis=1),
        np.amin(dp, axis=1),
        np.amax(dp, axis=1),
        skew(dp, axis=1, bias=True),
        kurtosis(dp, axis=1, bias=True),
    )
    # Turn every statistic into a column vector, then place them side by side.
    return np.hstack([stat.reshape(-1, 1) for stat in row_stats])
        

In [None]:
import pandas as pd

# The first line of this file is a data value, not a column title (with the
# default header inference it ended up as the column name "5.00").
# header=None keeps that first value as a regular row.
pd.read_table("_transform/17963/0adda285d98860cb_hist=800.txt", header=None)





Unnamed: 0,5.00
0,5.0
1,11.0
2,14.0
3,19.0
4,10.0
...,...
794,2.0
795,5.0
796,1.0
797,1.0


In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.data,
    image_dataset.target,
    test_size=0.3,
    random_state=109,
)

## Different sets of training data

### 1. when cluster number is set to 4

In [None]:
# K=4 feature set: load the train and test splits, flattening every sample
# into one row (the row counts are hard-coded in the reshape calls).
# The former `from json import load` was dropped: nothing here uses it and it
# rebinds the global name `load` that load_model() expects to be joblib's.
X_train = load_data('data/kmeans=4_train.npy').reshape((720,-1))
# NOTE(review): other cells read 'data/Y_train.npy' (capital Y) - confirm
# which spelling exists on disk; it matters on case-sensitive filesystems.
y_train = load_data('data/y_train.npy')
X_test = load_data('data/kmeans=4_test.npy').reshape((240,-1))
y_test = load_data('data/Y_test.npy')

# Append the validation split to the training data.
X_val = load_data('data/kmeans=4_val.npy').reshape((240,-1))
X_train = np.concatenate((X_train, X_val), 0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate((y_train, y_val), 0)

model_name = 'svc_histogram_K=4_model'

### 2. when cluster number is set to 5

In [None]:
# K=5 feature set: one flattened row per sample.
X_train = load_data('data/kmeans=5_train.npy').reshape(720, -1)
y_train = load_data('data/Y_train.npy')
X_test = load_data('data/kmeans=5_test.npy').reshape(240, -1)
y_test = load_data('data/Y_test.npy')

# Training data becomes the train split followed by the validation split.
X_val = load_data('data/kmeans=5_val.npy').reshape(240, -1)
X_train = np.concatenate([X_train, X_val], axis=0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate([y_train, y_val], axis=0)

model_name = 'svc_histogram_K=5_model'

### 3. when cluster number is set to 20

In [None]:
# K=20 feature set: one flattened row per sample.
X_train = load_data('data/kmeans=20_train.npy').reshape(720, -1)
y_train = load_data('data/Y_train.npy')
X_test = load_data('data/kmeans=20_test.npy').reshape(240, -1)
y_test = load_data('data/Y_test.npy')

model_name = 'svc_histogram_K=20_model'

In [None]:
# Append the validation split to the K=20 training data loaded in the
# previous cell. The validation features must come from the same K=20 file
# set; the K=5 file read here before has a different feature layout.
X_val = load_data('data/kmeans=20_val.npy').reshape((240,-1))
X_train = np.concatenate((X_train, X_val), 0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate((y_train, y_val), 0)


### 4. when cluster number is set to 7

In [None]:
# K=7 feature set: one flattened row per sample.
# NOTE(review): unlike the other configurations, no validation split is
# appended to the training data here - confirm whether that is intended.
X_train = load_data('data/kmeans=7_train.npy').reshape((720,-1))
y_train = load_data('data/Y_train.npy')
X_test = load_data('data/kmeans=7_test.npy').reshape((240,-1))
y_test = load_data('data/Y_test.npy')

# Set the name for this configuration so a model trained on K=7 features is
# not saved under the name left over from a previously executed cell.
model_name = 'svc_histogram_K=7_model'

### 5. when cluster number is set to 2

In [None]:
# K=2 feature set: one flattened row per sample.
X_train = load_data('data/kmeans=2_train.npy').reshape(720, -1)
y_train = load_data('data/Y_train.npy')
X_test = load_data('data/kmeans=2_test.npy').reshape(240, -1)
y_test = load_data('data/Y_test.npy')

# Training data becomes the train split followed by the validation split.
X_val = load_data('data/kmeans=2_val.npy').reshape(240, -1)
X_train = np.concatenate([X_train, X_val], axis=0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate([y_train, y_val], axis=0)

model_name = 'svc_histogram_K=2_model'

### 6. when cluster number is set to 1

In [None]:
# K=1 feature set: one flattened row per sample.
X_train = load_data('data/kmeans=1_train.npy').reshape(720, -1)
y_train = load_data('data/Y_train.npy')
X_test = load_data('data/kmeans=1_test.npy').reshape(240, -1)
y_test = load_data('data/Y_test.npy')

# Training data becomes the train split followed by the validation split.
X_val = load_data('data/kmeans=1_val.npy').reshape(240, -1)
X_train = np.concatenate([X_train, X_val], axis=0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate([y_train, y_val], axis=0)

model_name = 'svc_histogram_K=1_model'

### Train data with parameter optimization

In [None]:
## Hyperparameter tuning 
def train_and_predict(X_train, y_train, X_test):
    """Grid-search an SVC on the training data and predict the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test
        Features passed to ``predict`` on the fitted search.

    Returns
    -------
    tuple
        ``(clf_1, y_pred)``: the fitted ``GridSearchCV`` object (its
        ``cv_results_`` holds the per-setting scores) and the predictions
        for ``X_test``. Previously nothing was returned, so both results
        were lost when the function ended.
    """
    # C alone for the linear kernel; every C/gamma combination for RBF.
    param_grid = [
        {'C': [0.001, 0.1,1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [0.001, 0.1,1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    ]
    svc = svm.SVC()
    clf_1 = GridSearchCV(svc, param_grid)
    clf_1.fit(X_train, y_train)
    y_pred = clf_1.predict(X_test)
    return clf_1, y_pred


{'mean_fit_time': array([0.06803446, 0.07021046, 0.06904874, 0.06972165, 0.10033922,
        0.07199745, 0.10063853, 0.09146914, 0.09807591, 0.09048657,
        0.0996201 , 0.09017715]),
 'std_fit_time': array([0.00217346, 0.00435268, 0.00268807, 0.00355212, 0.00214941,
        0.0019632 , 0.00156084, 0.00772654, 0.00272445, 0.00211562,
        0.00248929, 0.00234566]),
 'mean_score_time': array([0.0089695 , 0.00919747, 0.00916166, 0.00877028, 0.02893066,
        0.02854662, 0.02982264, 0.02854056, 0.02895327, 0.02773275,
        0.02932382, 0.02751198]),
 'std_score_time': array([8.94222150e-04, 3.89967124e-04, 4.05987888e-04, 3.95791111e-04,
        1.23794126e-05, 2.33226459e-03, 1.90924974e-03, 1.84408646e-03,
        6.34864911e-04, 3.86203735e-04, 7.97400795e-04, 4.91948267e-04]),
 'param_C': masked_array(data=[1, 10, 100, 1000, 1, 1, 10, 10, 100, 100, 1000, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False,

In [None]:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot mean cross-validation scores of a two-parameter grid search.

    Parameters
    ----------
    cv_results : mapping
        Must provide ``'mean_test_score'`` with
        ``len(grid_param_1) * len(grid_param_2)`` entries.
    grid_param_1 : sequence
        Values drawn on the x-axis.
    grid_param_2 : sequence
        Values drawn as one curve each.
    name_param_1, name_param_2 : str
        Axis label for parameter 1 and legend prefix for parameter 2.
    """
    # Rows correspond to grid_param_2 values and columns to grid_param_1.
    # Assumes the scores are ordered with grid_param_1 varying fastest -
    # TODO confirm against how the grid was built.
    scores_mean = np.array(cv_results['mean_test_score']).reshape(
        len(grid_param_2), len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx, :], '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    # grid() expects a boolean; the string 'on' only worked because it is truthy.
    ax.grid(True)
    

### Plot the mean test score against each parameter setting and confusion matrix

In [None]:
# K=4 feature set: load the splits and append the validation split to the
# training data. The former `from json import load` was dropped: nothing
# here uses it and it rebinds the global name `load` that load_model()
# expects to be joblib's.
X_train = load_data('data/kmeans=4_train.npy').reshape((720,-1))
y_train = load_data('data/y_train.npy')
X_test = load_data('data/kmeans=4_test.npy').reshape((240,-1))
y_test = load_data('data/Y_test.npy')

X_val = load_data('data/kmeans=4_val.npy').reshape((240,-1))
X_train = np.concatenate((X_train, X_val), 0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate((y_train, y_val), 0)

model_name = 'svc_histogram_K=4_model'

# C alone for the linear kernel; every C/gamma combination for RBF.
param_grid = [
  {'C': [0.001, 0.1,1, 10, 100, 1000], 'kernel': ['linear']},
   {'C': [0.001, 0.1,1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
 ]
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Mean cross-validation score per parameter setting, keyed by a label built
# from kernel, C and gamma.
data = {}
for C, param_kernel, param_gamma, mean_score in zip(
        clf.cv_results_['param_C'],
        clf.cv_results_['param_kernel'],
        clf.cv_results_['param_gamma'],
        clf.cv_results_['mean_test_score']):
    # gamma is masked for the linear-kernel rows; test the mask explicitly
    # instead of comparing the element with the string '--'.
    if np.ma.is_masked(param_gamma):
        param_gamma = 'Nan'
    data['kernel=' + param_kernel + '-' + 'C=' + str(C) + 'Gamma='+str(param_gamma)] = mean_score

import matplotlib.pyplot as plt

try:
    # Available from scikit-learn 1.0 on.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_cm = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    # Older scikit-learn releases only ship the function-style helper.
    from sklearn.metrics import plot_confusion_matrix as plot_cm

plot_cm(clf, X_test, y_test)
print("Classification report for - \n{}:\n{}\n".format(
    clf, metrics.classification_report(y_test, y_pred)))
# Save before show(): on backends where show() blocks or releases the
# figure, saving afterwards writes an empty canvas.
plt.savefig("K=4_CM.png")
plt.show()



<IPython.core.display.Javascript object>

Classification report for - 
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [0.001, 0.1, 1, 10, 100, 1000],
                          'kernel': ['linear']},
                         {'C': [0.001, 0.1, 1, 10, 100, 1000],
                          'gamma': [0.1, 0.01, 0.001, 0.0001],
                          'kernel': ['rbf']}]):
              precision    recall  f1-score   support

           0       0.26      0.44      0.33        16
           1       0.33      0.36      0.35        25
           2       0.68      0.50      0.58        26
           3       0.53      0.41      0.46        22
           4       0.36      0.30      0.33        33
           5       0.79      0.71      0.75        21
           6       0.52      0.52      0.52        29
           7       0.71      0.81      0.76        21
           8       0.39      0.61      0.48        18
           9       0.64      0.48      0.55        29

    accuracy                           0.50       240
   ma

### Save model

In [None]:
from joblib import dump, load

def save_model(clf, model_name):
    """Write ``clf`` to the file ``model_name`` using joblib's ``dump``."""
    dump(clf, filename=model_name)

### load model

In [None]:
def load_model(model_name):
    """Load and return the object stored in the joblib file ``model_name``.

    NOTE: joblib files are pickle-based, so loading one can execute
    arbitrary code. Only load files from a trusted source.
    """
    # Import locally so this function does not depend on whatever the global
    # name `load` happens to be bound to when it is called.
    from joblib import load as joblib_load

    return joblib_load(model_name)

### Predict

In [None]:
def predict_(clf, test_set):
    """Return ``clf.predict(test_set)``.

    Parameters
    ----------
    clf
        Any object exposing a ``predict`` method.
    test_set
        Passed unchanged to ``clf.predict``.

    Returns
    -------
    The predictions produced by ``clf``. Previously they were computed but
    never returned, so callers always received ``None``.
    """
    y_pred = clf.predict(test_set)
    return y_pred

### Report

In [None]:
# Per-class precision/recall/F1 of the current predictions on the test split,
# preceded by the repr of the fitted search object.
print(
    "Classification report for - \n{}:\n{}\n".format(
        clf,
        metrics.classification_report(y_test, y_pred),
    )
)

Classification report for - 
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}]):
              precision    recall  f1-score   support

           0       0.26      0.44      0.33        16
           1       0.33      0.36      0.35        25
           2       0.68      0.50      0.58        26
           3       0.53      0.41      0.46        22
           4       0.36      0.30      0.33        33
           5       0.79      0.71      0.75        21
           6       0.52      0.52      0.52        29
           7       0.71      0.81      0.76        21
           8       0.39      0.61      0.48        18
           9       0.64      0.48      0.55        29

    accuracy                           0.50       240
   macro avg       0.52      0.51      0.51       240
weighted avg       0.52      0.50     

### Confusion Matrix And Classification Report

In [None]:

# Search space: C alone for the linear kernel, every C/gamma pair for RBF.
param_grid = [
    {
        'C': [0.001, 0.1, 1, 10, 100, 1000],
        'kernel': ['linear'],
    },
    {
        'C': [0.001, 0.1, 1, 10, 100, 1000],
        'gamma': [0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    },
]
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid)
# fit() returns the search object itself, so prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)


In [None]:
from collections import Counter

# Confusion matrix and per-class report for the current test predictions.
print(metrics.confusion_matrix(y_test, y_pred))
print("Classification report for - \n{}:\n{}\n".format(
    clf, metrics.classification_report(y_test, y_pred)))

# Number of test samples per label. dict() keeps the plain-dict printout and
# the first-seen key order of the former hand-written counting loop.
check = dict(Counter(y_test))
print(check)

[[ 7  3  0  0  2  1  0  0  2  1]
 [ 4  9  0  0  6  0  1  3  1  1]
 [ 1  1 13  0  2  0  4  0  5  0]
 [ 2  1  1  9  3  1  0  0  3  2]
 [ 8  3  0  1 10  0  6  3  0  2]
 [ 2  1  1  0  0 15  2  0  0  0]
 [ 1  6  1  1  0  1 15  0  3  1]
 [ 1  0  0  1  2  0  0 17  0  0]
 [ 0  1  1  3  0  1  0  0 11  1]
 [ 1  2  2  2  3  0  1  1  3 14]]
Classification report for - 
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}]):
              precision    recall  f1-score   support

           0       0.26      0.44      0.33        16
           1       0.33      0.36      0.35        25
           2       0.68      0.50      0.58        26
           3       0.53      0.41      0.46        22
           4       0.36      0.30      0.33        33
           5       0.79      0.71      0.75        21
           6       0.52      0.5

### Check whether the accuracy on the training examples is 100%

In [None]:
# Predict the training data itself to see how closely the model fits it.
y_pred_train = clf.predict(X_train)
print(
    "Classification report for - \n{}:\n{}\n".format(
        clf,
        metrics.classification_report(y_train, y_pred_train),
    )
)
print(metrics.confusion_matrix(y_train, y_pred_train))


Classification report for - 
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}]):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        80
           1       1.00      1.00      1.00        69
           2       1.00      1.00      1.00        66
           3       1.00      1.00      1.00        80
           4       1.00      1.00      1.00        63
           5       1.00      1.00      1.00        79
           6       1.00      1.00      1.00        67
           7       1.00      1.00      1.00        72
           8       1.00      1.00      1.00        71
           9       1.00      1.00      1.00        73

    accuracy                           1.00       720
   macro avg       1.00      1.00      1.00       720
weighted avg       1.00      1.00     

### Retrain the model with binarized y labels
#### And plot the ROC curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# K=4 feature set: load the splits and append the validation split to the
# training data. The former `from json import load` was dropped: nothing
# here uses it and it rebinds the global name `load` that load_model()
# expects to be joblib's.
X_train = load_data('data/kmeans=4_train.npy').reshape((720,-1))
y_train = load_data('data/y_train.npy')
X_test = load_data('data/kmeans=4_test.npy').reshape((240,-1))
y_test = load_data('data/Y_test.npy')

X_val = load_data('data/kmeans=4_val.npy').reshape((240,-1))
X_train = np.concatenate((X_train, X_val), 0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate((y_train, y_val), 0)

model_name = 'svc_histogram_K=4_model'
n_classes = 10

# One indicator column per class; the class list is derived from n_classes
# so the two cannot drift apart.
y_train = label_binarize(y_train, classes=list(range(n_classes)))
y_test = label_binarize(y_test, classes=list(range(n_classes)))


# ROC curves need a continuous score per class. The one-vs-rest linear SVMs
# supply it through decision_function (these are scores, not probabilities).
classifier = OneVsRestClassifier(
    svm.SVC(kernel="linear")
)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
y_pred = classifier.predict(X_test)



# # Learn to predict each class against the other
# classifier = OneVsRestClassifier(
#     svm.SVC(kernel="linear", probability=True, random_state=random_state)
# )
# y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Per-class ROC: column i of the binarized labels against column i of the
# decision scores, plus the area under each curve.
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: all label/score pairs are pooled into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Example figure: the ROC curve stored under key 2, with the diagonal as
# the chance-level reference.
lw = 2
plt.figure()
plt.plot(fpr[2], tpr[2], color="darkorange", lw=lw,
         label="ROC curve (area = %0.2f)" % roc_auc[2])
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()
# Macro-average ROC: evaluate every class curve on the union of all
# false-positive rates, then take the plain mean over the classes.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr = mean_tpr + np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr = mean_tpr / n_classes

fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
# The micro-average curve is computed above but deliberately left out here:
# plt.plot(
#     fpr["micro"],
#     tpr["micro"],
#     label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
#     color="deeppink",
#     linestyle=":",
#     linewidth=4,
# )

macro_label = "macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"])
plt.plot(
    fpr["macro"],
    tpr["macro"],
    label=macro_label,
    color="darkorange",
    linestyle=":",
    linewidth=4,
)

def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.

    Uses ``plt.get_cmap`` because ``plt.cm.get_cmap`` is deprecated and no
    longer available in recent Matplotlib releases.
    '''
    return plt.get_cmap(name, n)

# One colour per class; sized by n_classes so it follows the loop below.
cmap = get_cmap(n_classes)
for i in range(n_classes):
    plt.plot(
        fpr[i],
        tpr[i],
        color=cmap(i),
        lw=lw,
        label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),
    )

# Diagonal = chance level.
plt.plot([0, 1], [0, 1], "k--", lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic to multiclass")
plt.legend(loc="lower right")
# Save before show(): on backends where show() blocks or releases the
# figure, saving afterwards writes an empty canvas.
plt.savefig('ROC_SVM')
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from collections import Counter

# Reload the K=4 feature set with the original (non-binarized) labels.
X_train = load_data('data/kmeans=4_train.npy').reshape((720,-1))
y_train = load_data('data/y_train.npy')
X_test = load_data('data/kmeans=4_test.npy').reshape((240,-1))
y_test = load_data('data/Y_test.npy')

X_val = load_data('data/kmeans=4_val.npy').reshape((240,-1))
X_train = np.concatenate((X_train, X_val), 0)
y_val = load_data('data/y_val.npy')
y_train = np.concatenate((y_train, y_val), 0)

model_name = 'svc_histogram_K=4_model'
# check y_test label: number of test samples per label. dict() keeps the
# plain-dict printout and the first-seen key order of the former loop.
check = dict(Counter(y_test))
print(check)

{9: 29, 8: 18, 0: 16, 3: 22, 2: 26, 1: 25, 6: 29, 4: 33, 7: 21, 5: 21}
