In [16]:
import cv2
import numpy as np
import glob
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
from time import time
from sklearn import svm, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score
from keras.utils.np_utils import to_categorical
import sys,os
sys.path.append(os.path.expanduser('ProjectFunctions'))
from ProjectFunctions import LoadShapes as LS
from ProjectFunctions import GridReport as GR
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#TODO load the dataset:
start = time()

X, y = LS.getShapes()

t = time()-start
print(f"OK, Load time={t:0.1f}")

OK, Load time=50.1


## Preprocessing and split

In [3]:
X = np.array(X)
X_shape = X.shape
print("The shape of X: ", X_shape)

The shape of X:  (14970, 200, 200)


In [4]:
#split
X_r = X.reshape(14970, 200*200)

X_rShape = X_r.shape
print("The new shape of the reshaped X: ", X_rShape)

input_X_rShape = X_rShape[1:]
print("The shape of X ready to be inputed in the CNN: ", input_X_rShape)

The new shape of the reshaped X:  (14970, 40000)
The shape of X ready to be inputed in the CNN:  (40000,)


In [5]:
X_neuralReady=np.array(X_r)/255

print("Done preparing data for neural networks.")

Done preparing data for neural networks.


In [6]:
from sklearn.model_selection import train_test_split

test_size=0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)

print("Done splitting train and test data.")

Done splitting train and test data.


In [7]:

Xnr_train, Xnr_test, ynr_train, ynr_test = train_test_split(X_neuralReady, y, test_size=test_size, random_state=42, shuffle=True)

print("Done splitting train and test data ready for neural networks.")

Done splitting train and test data ready for neural networks.


In [9]:
Xnr_train, Xnr_test, ynr_train, ynr_test = Xnr_train[:1000], Xnr_test[:1000], ynr_train[:1000], ynr_test[:1000]
print("Done splitting train into length of 1000.")

In [10]:
# Setup search parameters
model = svm.SVC(gamma=0.001) # NOTE: gamma="scale" does not work in older Scikit-learn frameworks, 
                             # FIX:  replace with model = svm.SVC(gamma=0.001)


tuning_parameters = {
    'kernel':('linear', 'rbf'), 
    'C':[0.1, 1, 10]
}


CV=5
VERBOSE=0
# Run GridSearchCV for the model
start = time()
grid_tuned = GridSearchCV(model, tuning_parameters, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1, iid=True)
grid_tuned.fit(Xnr_train, ynr_train)
t = time()-start



In [11]:
# Report result
b0, m0= GR.FullReport(grid_tuned , Xnr_test, ynr_test, t)
print('OK')

SEARCH TIME: 379.60 sec

Best model set found on train set:

	best parameters={'C': 1, 'kernel': 'rbf'}
	best 'f1_micro' score=1.0
	best index=3

Best estimator CTOR:
	SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.998 (+/-0.005) for {'C': 0.1, 'kernel': 'linear'}
	[ 1]: 0.997 (+/-0.012) for {'C': 0.1, 'kernel': 'rbf'}
	[ 2]: 0.998 (+/-0.005) for {'C': 1, 'kernel': 'linear'}
	[ 3]: 1.000 (+/-0.000) for {'C': 1, 'kernel': 'rbf'}
	[ 4]: 0.998 (+/-0.005) for {'C': 10, 'kernel': 'linear'}
	[ 5]: 1.000 (+/-0.000) for {'C': 10, 'kernel': 'rbf'}

Detailed classification report:
	The model is trained on the full development set.
	The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

      Circle       1.00   

In [13]:
# Setup search parameters
model = svm.SVC(gamma=0.001) # NOTE: gamma="scale" does not work in older Scikit-learn frameworks, 
                             # FIX:  replace with model = svm.SVC(gamma=0.001)

tuning_parameters = {
    'kernel':('linear', 'rbf'), 
    'C':[0.1, 1, 10]
}

CV=5
VERBOSE=0
# Run RandomizedSearchCV for the model
start = time()
random_tuned = RandomizedSearchCV(model, tuning_parameters, random_state=42, n_iter=4, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1, iid=True)
random_tuned.fit(Xnr_train, ynr_train)
t = time()-start



In [14]:
# Report result
b0, m0= GR.FullReport(random_tuned , Xnr_test, ynr_test, t)
print('OK')

SEARCH TIME: 275.86 sec

Best model set found on train set:

	best parameters={'kernel': 'rbf', 'C': 10}
	best 'f1_micro' score=1.0
	best index=2

Best estimator CTOR:
	SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.998 (+/-0.005) for {'kernel': 'linear', 'C': 0.1}
	[ 1]: 0.997 (+/-0.012) for {'kernel': 'rbf', 'C': 0.1}
	[ 2]: 1.000 (+/-0.000) for {'kernel': 'rbf', 'C': 10}
	[ 3]: 0.998 (+/-0.005) for {'kernel': 'linear', 'C': 1}

Detailed classification report:
	The model is trained on the full development set.
	The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

      Circle       1.00      1.00      1.00       218
      Square       1.00      1.00      1.00       273
        Star       1.00     

In [19]:
model = SGDClassifier(eta0=0.1) 
tuning_parameters = {
    'alpha' : [0.001, 0.01, 0.1, 1],
    'max_iter': [1, 10, 100, 1000],
    'learning_rate': ('constant','optimal','invscaling','adaptive'),
    'n_iter_no_change': [5, 10, 15],
    'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'epsilon_insensitive'),
    
}

CV=5
VERBOSE=0
# Run RandomizedSearchCV for the model
start = time()
SGDRandom_tuned = RandomizedSearchCV(model, tuning_parameters, random_state=42, n_iter=8, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1, iid=True)
SGDRandom_tuned.fit(Xnr_train, ynr_train)
t = time()-start



In [21]:
# Report result
b0, m0= GR.FullReport(SGDRandom_tuned , Xnr_test, ynr_test, t)
print('OK')

SEARCH TIME: 1184.06 sec

Best model set found on train set:

	best parameters={'n_iter_no_change': 15, 'max_iter': 500, 'loss': 'huber', 'learning_rate': 'adaptive', 'alpha': 0.01}
	best 'f1_micro' score=0.999
	best index=4

Best estimator CTOR:
	SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.1, fit_intercept=True,
              l1_ratio=0.15, learning_rate='adaptive', loss='huber',
              max_iter=500, n_iter_no_change=15, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.732 (+/-0.425) for {'n_iter_no_change': 10, 'max_iter': 100, 'loss': 'huber', 'learning_rate': 'adaptive', 'alpha': 0.01}
	[ 1]: 0.874 (+/-0.058) for {'n_iter_no_change': 10, 'max_iter': 1, 'loss': 'log', 'learning_rate': 'invscaling', 'alpha': 0.1}
	[ 2]: 0.294 (+/-0.100) 

In [17]:
# Setup search parameters
model = KNeighborsClassifier()

tuning_parameters = {
    'n_neighbors' : [3, 5, 8],
    'weights' : ['uniform', 'distance'],
    'metric' : ['minkowski', 'euclidean', 'manhattan'],
    'leaf_size' : [10, 30, 50]
}
    
CV=5
VERBOSE=0
# Run RandomizedSearchCV for the model
start = time()
KNRandom_tuned = RandomizedSearchCV(model, tuning_parameters, random_state=42, n_iter=6, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1, iid=True)
KNRandom_tuned.fit(Xnr_train, ynr_train)
t = time()-start

In [18]:
# Report result
b0, m0= GR.FullReport(random_tuned , Xnr_test, ynr_test, t)
print('OK')

SEARCH TIME: 169.08 sec

Best model set found on train set:

	best parameters={'weights': 'distance', 'n_neighbors': 3, 'metric': 'minkowski', 'leaf_size': 30}
	best 'f1_micro' score=1.0
	best index=0

Best estimator CTOR:
	KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

Grid scores ('f1_micro') on development set:
	[ 0]: 1.000 (+/-0.000) for {'weights': 'distance', 'n_neighbors': 3, 'metric': 'minkowski', 'leaf_size': 30}
	[ 1]: 1.000 (+/-0.000) for {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan', 'leaf_size': 50}
	[ 2]: 1.000 (+/-0.000) for {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan', 'leaf_size': 50}
	[ 3]: 1.000 (+/-0.000) for {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan', 'leaf_size': 10}
	[ 4]: 1.000 (+/-0.000) for {'weights': 'uniform', 'n_neighbors': 5, 'metric': 'euclidean', 'leaf_size':

### Conclusion on optimization

For optimizing of chosen dataset, GridSearchCV and RandomSearchCV was used, in search of hyperparameters. Both optimization method originates from scikit-learn. To find the best optimization, the search methods was used with different combinations of tuning parameters, implemented in a parameter_grid. the search methods will try to optimize the models using the parameters given, which then results in scores to given parameters. The scores indicates which tuning parameters are best for optimizing. The best scoring parameters is known as the best parameters for the given dataset.

When comparing the GridSearch and RandomSearch, the RandomSearch is slightly faster, which is expected as it doesn't try with all combinations. As seen in the results above, the search found the best estimator for optimizing by trying combinations with the given tuning parameters. The various combinations saw scores of 0.997 and 0.998 for SVM model.
With the best estimator for optimization a score of 1, which is a perfect score,was achieved. The tuning parameter for the best estimator is found to be: Kernel=rbf and C-gamma=1.
The SGDClassifier gave various results with for optimization. As seen in the results, the score could swing 0.294 up to 0.999. If given more iterations, it could be assumed the hyperparameter tuning would find the best estimator for given dataset. 
As for KNeighbor model, each parameter gave a score of 1, which can be very questionable, whether it is a valid tuning model for these types of data.

No further optimization method was used as the grid and random search achieved a perfect score. Other methods could used to speed up the optimization, such as different types of regression, but may cost effectiveness on the score.
Regression may not be very suitable for this type of data, as it consists of images. Regression is typically used for data consisting of values, used to predict for example house value. 

But with a score of 1, it could be questionable whether the system is overfitted.

