In [1]:
import cv2
import numpy as np
import glob
import os
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
from time import time
from sklearn import svm, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score
from keras.utils.np_utils import to_categorical
import tensorflow as tf

Using TensorFlow backend.


In [2]:
currmode="N/A" # GLOBAL var! Label of the current data mode; SearchReport only reads it.

def SearchReport(model):
    """Print a summary of a fitted hyper-parameter search and return its best result.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``best_index_``,
        ``best_estimator_``, ``cv_results_``, ``scoring`` and ``estimator``.

    Returns
    -------
    tuple
        ``(summary, best_estimator)`` where ``summary`` is a one-line string
        containing the global ``currmode``, the best score and a
        constructor-like description of the best parameters, and
        ``best_estimator`` is ``model.best_estimator_``.

    Raises
    ------
    AssertionError
        If the search was not run with ``scoring='f1_micro'``; summaries are
        only comparable across searches when the scoring is identical.
    """

    def GetBestModelCTOR(model, best_params):
        """Return a constructor-like string such as ``SVC(C=1,kernel='rbf')``."""

        def GetParams(best_params):
            parts = []
            for key in sorted(best_params):
                value = best_params[key]
                # isinstance also quotes str subclasses (e.g. numpy.str_),
                # which a comparison against "<class 'str'>" would miss.
                quote = "'" if isinstance(value, str) else ""
                parts.append(f"{key}={quote}{value}{quote}")
            return ",".join(parts)

        try:
            return type(model).__name__ + '(' + GetParams(best_params) + ')'
        except Exception:
            # Best effort: the description is cosmetic, so fall back to a
            # placeholder, but let KeyboardInterrupt/SystemExit propagate.
            return "N/A(1)"

    print("\nBest model set found on train set:")
    print()
    print(f"\tbest parameters={model.best_params_}")
    print(f"\tbest '{model.scoring}' score={model.best_score_}")
    print(f"\tbest index={model.best_index_}")
    print()
    print("Best estimator CTOR:")
    print(f"\t{model.best_estimator_}")
    print()
    try:
        print(f"Grid scores ('{model.scoring}') on development set:")
        means = model.cv_results_['mean_test_score']
        stds  = model.cv_results_['std_test_score']
        candidates = zip(means, stds, model.cv_results_['params'])
        for i, (mean, std, params) in enumerate(candidates):
            # The spread is reported as two standard deviations.
            print("\t[%2d]: %0.3f (+/-%0.03f) for %r" % (i, mean, std * 2, params))
    except Exception:
        # Per-candidate scores are optional output; report and carry on.
        print("WARNING: the random search do not provide means/stds")

    assert "f1_micro"==str(model.scoring), f"come on, we need to fix the scoring to be able to compare model-fits! Your scoreing={str(model.scoring)}...remember to add scoring='f1_micro' to the search"   
    return f"best: dat={currmode}, score={model.best_score_:0.5f}, model={GetBestModelCTOR(model.estimator,model.best_params_)}", model.best_estimator_ 

def ClassificationReport(model, X_test, y_test, target_names=None):
    """Print scikit-learn's classification report for ``model`` on a test set.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X_test : array-like, one row per sample
    y_test : array-like of true labels, one entry per row of ``X_test``
    target_names : optional list of display names for the classes

    Raises
    ------
    AssertionError
        If ``X_test`` and ``y_test`` do not contain the same number of samples.
    """
    # np.shape reads the existing .shape when there is one, so large arrays
    # are not copied just to compare their lengths.
    n_samples, n_labels = np.shape(X_test)[0], np.shape(y_test)[0]
    assert n_samples == n_labels, f"X_test has {n_samples} samples but y_test has {n_labels}"
    print("\nDetailed classification report:")
    print("\tThe model is trained on the full development set.")
    print("\tThe scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, model.predict(X_test)
    # target_names must be passed by keyword: the third positional parameter
    # of classification_report is `labels`, not `target_names`.
    print(classification_report(y_true, y_pred, target_names=target_names))
    print()
    

def FullReport(model, X_test, y_test, t):
    """Print the complete report for a finished search and return its best result.

    Prints the elapsed search time ``t`` (seconds), then the output of
    ``SearchReport`` and ``ClassificationReport``, and finally the best
    estimator and the one-line summary.

    Returns
    -------
    tuple
        ``(summary, best_estimator)`` exactly as returned by ``SearchReport``.
    """
    print(f"SEARCH TIME: {t:0.2f} sec")

    summary_line, best_estimator = SearchReport(model)
    ClassificationReport(model, X_test, y_test)

    print(f"CTOR for best model: {best_estimator}\n")
    print(f"{summary_line}\n")

    return (summary_line, best_estimator)


In [3]:
# Load the dataset through the project-local LoadShapes helper and time the load.
import sys,os
from time import time
# Add the ProjectFunctions directory to the module search path; keep this
# ahead of the project import below.
# NOTE(review): expanduser only rewrites a leading "~", so for this relative
# path it returns the string unchanged.
sys.path.append(os.path.expanduser('ProjectFunctions'))
from ProjectFunctions import LoadShapes as LS


start = time()

# X holds the samples and y the labels (see the shape printed in the next cell).
X, y = LS.getShapes()

# Elapsed wall-clock seconds for the load.
t = time()-start
print(f"OK, Load time={t:0.1f}")

OK, Load time=15.2


## Preprocessing and split

In [4]:
# Convert the loaded samples into a single NumPy array and record its shape
# (samples first, then the per-sample dimensions).
X = np.array(X)
X_shape = X.shape
print("The shape of X: ", X_shape)

The shape of X:  (14970, 200, 200)


In [5]:
# Flatten every sample into a 1-D feature vector (one row per sample).
# The sample count comes from X itself and -1 lets NumPy infer the row length,
# so this cell keeps working if the dataset size or the image size changes.
X_r = X.reshape(X.shape[0], -1)

X_rShape = X_r.shape
print("The new shape of the reshaped X: ", X_rShape)

# Per-sample feature shape: everything after the leading sample axis.
input_X_rShape = X_rShape[1:]
print("The shape of X ready to be inputed in the CNN: ", input_X_rShape)

The new shape of the reshaped X:  (14970, 40000)
The shape of X ready to be inputed in the CNN:  (40000,)


In [6]:
# Rescale the flattened samples by 1/255; true division yields a floating-point array.
# NOTE(review): assumes the pixel values lie in 0..255 so the result falls in [0, 1] — TODO confirm.
X_neuralReady=np.array(X_r)/255

print("Done preparing data for neural networks.")

Done preparing data for neural networks.


In [7]:
# Hold out a fixed fraction of the raw (not flattened, not rescaled) samples
# for testing. The options are collected once and unpacked into the call.
from sklearn.model_selection import train_test_split

test_size = 0.3
split_options = dict(test_size=test_size, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Done splitting train and test data.")

Done splitting train and test data.


In [8]:

# Split the rescaled, flattened samples with the same test_size and
# random_state that were used for the raw split.
(Xnr_train, Xnr_test,
 ynr_train, ynr_test) = train_test_split(
    X_neuralReady,
    y,
    test_size=test_size,
    random_state=42,
    shuffle=True,
)

print("Done splitting train and test data ready for neural networks.")

Done splitting train and test data ready for neural networks.


In [9]:
# Base estimator for the hyper-parameter searches: an SVC with gamma fixed at 0.001.
model = svm.SVC(gamma=0.001) # NOTE: gamma="scale" does not work in older Scikit-learn frameworks,
                             #       so the explicit value gamma=0.001 is used instead.



In [10]:
# Keep only the first MAX_SAMPLES rows of each split (presumably to keep the
# SVC searches below affordable — TODO confirm). This overwrites the full
# splits; re-run the splitting cell to get them back.
MAX_SAMPLES = 1000
Xnr_train, ynr_train = Xnr_train[:MAX_SAMPLES], ynr_train[:MAX_SAMPLES]
Xnr_test, ynr_test = Xnr_test[:MAX_SAMPLES], ynr_test[:MAX_SAMPLES]

In [11]:
# Hyper-parameter grid: 2 kernels x 3 values of C = 6 candidates.
tuning_parameters = {
    'kernel':('linear', 'rbf'), 
    'C':[0.1, 1, 10]
}


CV=5
VERBOSE=0
# Run GridSearchCV for the model and time the whole search.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where passing it raises a TypeError.
start = time()
grid_tuned = GridSearchCV(model, tuning_parameters, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1)
grid_tuned.fit(Xnr_train, ynr_train)
t = time()-start



In [12]:
# Report result: print the grid-search summary and the classification report
# on the test subset. FullReport returns the one-line summary (b0) and the
# best estimator (m0).
b0, m0= FullReport(grid_tuned , Xnr_test, ynr_test, t)
print('OK')

SEARCH TIME: 256.93 sec

Best model set found on train set:

	best parameters={'C': 1, 'kernel': 'rbf'}
	best 'f1_micro' score=1.0
	best index=3

Best estimator CTOR:
	SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.998 (+/-0.005) for {'C': 0.1, 'kernel': 'linear'}
	[ 1]: 0.997 (+/-0.012) for {'C': 0.1, 'kernel': 'rbf'}
	[ 2]: 0.998 (+/-0.005) for {'C': 1, 'kernel': 'linear'}
	[ 3]: 1.000 (+/-0.000) for {'C': 1, 'kernel': 'rbf'}
	[ 4]: 0.998 (+/-0.005) for {'C': 10, 'kernel': 'linear'}
	[ 5]: 1.000 (+/-0.000) for {'C': 10, 'kernel': 'rbf'}

Detailed classification report:
	The model is trained on the full development set.
	The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

      Circle       1.00   

In [13]:
# Setup search parameters
model = svm.SVC(gamma=0.001) # NOTE: gamma="scale" does not work in older Scikit-learn frameworks,
                             #       so the explicit value gamma=0.001 is used instead.

tuning_parameters = {
    'kernel':('linear', 'rbf'), 
    'C':[0.1, 1, 10]
}

CV=5
VERBOSE=0
N_ITER=20
# Every entry of tuning_parameters is a finite list, so the search space is a
# grid of n_candidates combinations. Asking for more iterations than that
# cannot sample anything new, so the request is capped at the grid size.
# With the grid above this evaluates all 6 combinations, i.e. the search is
# exhaustive; lower N_ITER below 6 to get an actual random subset.
n_candidates = int(np.prod([len(values) for values in tuning_parameters.values()]))
n_iter = min(N_ITER, n_candidates)

# Run RandomizedSearchCV for the model and time the whole search.
# The `iid` argument is no longer passed: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where passing it raises a TypeError.
start = time()
random_tuned = RandomizedSearchCV(model, tuning_parameters, random_state=42, n_iter=n_iter, cv=CV, scoring='f1_micro', verbose=VERBOSE, n_jobs=-1)
random_tuned.fit(Xnr_train, ynr_train)
t = time()-start

# Report result
b0, m0= FullReport(random_tuned , Xnr_test, ynr_test, t)
print('OK')



SEARCH TIME: 238.46 sec

Best model set found on train set:

	best parameters={'kernel': 'rbf', 'C': 1}
	best 'f1_micro' score=1.0
	best index=3

Best estimator CTOR:
	SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.998 (+/-0.005) for {'kernel': 'linear', 'C': 0.1}
	[ 1]: 0.997 (+/-0.012) for {'kernel': 'rbf', 'C': 0.1}
	[ 2]: 0.998 (+/-0.005) for {'kernel': 'linear', 'C': 1}
	[ 3]: 1.000 (+/-0.000) for {'kernel': 'rbf', 'C': 1}
	[ 4]: 0.998 (+/-0.005) for {'kernel': 'linear', 'C': 10}
	[ 5]: 1.000 (+/-0.000) for {'kernel': 'rbf', 'C': 10}

Detailed classification report:
	The model is trained on the full development set.
	The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

      Circle       1.00   

### Conclusion on optimization

To optimize the model for the chosen dataset, GridSearchCV and RandomizedSearchCV were used. Both optimization methods come from scikit-learn. Each search method was given a set of tuning parameters in a parameter grid; it fits the model with combinations of those parameters and reports a cross-validated score for each combination. The scores indicate which tuning parameters work best, and the highest-scoring combination is reported as the best parameters for the given dataset.

When comparing the two searches, RandomizedSearchCV was slightly faster (238 s versus 257 s). A randomized search is normally expected to be faster because it does not try all combinations; here, however, `n_iter=20` exceeds the six combinations in the grid, so the randomized search evaluated all six as well (see the grid scores above), and the time difference is more likely run-to-run variation. As seen in the results above, each search found the best estimator by trying combinations of the given tuning parameters. Most of the combinations scored 0.997 or 0.998.
With the best estimator a score of 1, which is a perfect score, was achieved. The tuning parameters of the best estimator were found to be kernel=rbf and C=1 (gamma was fixed at 0.001).

However, with a score of 1 it is questionable whether the model is overfitted.

