In [1]:
import numpy as np
import pandas as pd
import cv2 as cv
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
import sys
import os
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pickle as pkl
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import time

In [2]:
def getimg(image):
    """Crop the largest prominent contour of ``image`` and return its features.

    The image is converted to grayscale, blurred and adaptively thresholded,
    then its contours are scanned for the one with the largest area above a
    minimum of 3500. The bounding rectangle of that contour is cropped and
    handed to ``preprocess``.

    Parameters
    ----------
    image : numpy.ndarray
        BGR image (it is converted with ``cv.COLOR_BGR2GRAY``). The array is
        modified in place: a green rectangle is drawn around every contour
        that becomes the new best candidate.

    Returns
    -------
    Whatever ``preprocess`` returns for the cropped region.

    Raises
    ------
    ValueError
        If ``image`` is None (for example when ``cv.imread`` could not read
        the file).
    """
    if image is None:
        raise ValueError("getimg() received None instead of an image")

    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    blur = cv.GaussianBlur(gray, (5, 5), 0)
    # Named constants for the values previously passed as the literals 1, 1.
    thresh = cv.adaptiveThreshold(
        blur, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2
    )
    contours, _ = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)

    max_area = 3500
    # Local (not global) so a crop from a previous call can never be reused.
    # If no contour exceeds the minimum area, the whole image is used instead.
    img_cut = image
    for contour in contours:
        area = cv.contourArea(contour)
        if area > max_area:
            max_area = area
            x, y, w, h = cv.boundingRect(contour)
            # NOTE(review): the rectangle is drawn before cropping, so its
            # border ends up inside the crop. Kept as-is because it affects
            # the extracted features - confirm whether this is intended.
            cv.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 6)
            img_cut = image[y:y + h, x:x + w]

    return preprocess(img_cut)  # call the preprocess to get the data

In [4]:
def preprocess(image):
    """Turn an image into a flat 900-element edge-feature vector.

    Steps: Canny edge detection, horizontal and vertical Sobel gradients of
    the edge map blended 50/50, adaptive Gaussian thresholding, resize to
    30x30, flatten, and finally set every value above 10 to 1.

    Parameters
    ----------
    image : numpy.ndarray
        Image to process (passed straight to ``cv.Canny``).

    Returns
    -------
    numpy.ndarray
        One-dimensional array with 30 * 30 = 900 entries.
    """
    edge_map = cv.Canny(image, 70, 70)

    # Sobel responses are computed in 16-bit signed form, then converted to
    # unsigned 8-bit absolute values.
    grad_x = cv.convertScaleAbs(cv.Sobel(edge_map, cv.CV_16S, 1, 0))
    grad_y = cv.convertScaleAbs(cv.Sobel(edge_map, cv.CV_16S, 0, 1))
    blended = cv.addWeighted(grad_x, 0.5, grad_y, 0.5, 0)

    binary = cv.adaptiveThreshold(
        blended, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 25, 3
    )

    # Shrink to a fixed 30x30 grid and flatten to a single row.
    features = cv.resize(binary, (30, 30)).ravel()
    features[features > 10] = 1

    return features

In [5]:
def svc_model(X, y):
    """Fit an SVC through a 10-fold, accuracy-scored grid search.

    The grid holds a single candidate (kernel='linear', C=0.01). According to
    the original notes this was the best result of an earlier search over
    kernel in ['rbf', 'linear'] and C in [0.001, 0.01, 0.1].

    Parameters
    ----------
    X : feature matrix
    y : labels

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` of the fitted ``GridSearchCV``.
    """
    search_space = {'kernel': ['linear'], 'C': [0.01]}

    search = GridSearchCV(SVC(), search_space, cv=10, scoring='accuracy')
    search.fit(X, y)

    return search.best_estimator_, search.best_params_

In [6]:
def random_forest_model(X, y):
    """Fit a 100-tree random forest through a 10-fold, accuracy-scored grid search.

    The grid holds a single candidate (max_features=30, max_depth=10).
    According to the original notes this was the best result of an earlier
    search over max_features in range(10, 31, 10) and max_depth in
    range(8, 13, 2).

    Parameters
    ----------
    X : feature matrix
    y : labels

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` of the fitted ``GridSearchCV``.
    """
    search_space = {'max_features': [30], 'max_depth': [10]}

    forest = RandomForestClassifier(n_estimators=100)
    search = GridSearchCV(forest, search_space, cv=10, scoring='accuracy')
    search.fit(X, y)

    return search.best_estimator_, search.best_params_

In [7]:
def gradient_boosting_model(X, y):
    """Fit a 100-stage gradient boosting classifier through a 10-fold grid search.

    The grid holds a single candidate (max_depth=5). According to the original
    notes this was the best result of an earlier search over
    max_depth in range(1, 8, 2). Candidates are scored by accuracy.

    Parameters
    ----------
    X : feature matrix
    y : labels

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` of the fitted ``GridSearchCV``.
    """
    search_space = {'max_depth': [5]}

    booster = GradientBoostingClassifier(n_estimators=100)
    search = GridSearchCV(booster, search_space, cv=10, scoring='accuracy')
    search.fit(X, y)

    return search.best_estimator_, search.best_params_

In [8]:
def model_train_and_evaluation(X_train, y_train, X_test, y_test, model):
    """Train the requested classifier, print timing and scores, and return it.

    Parameters
    ----------
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels used for evaluation
    model : str
        Which classifier to build:
        'svm' - bagging of 10 SVCs, using the estimator found by ``svc_model``
        'rf'  - random forest from ``random_forest_model``
        'gb'  - gradient boosting from ``gradient_boosting_model``
        'vc'  - hard-voting ensemble of the three models above

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    start1 = time.time()

    # model selection
    if model == 'svm':
        # bagging based on the grid-searched SVM
        base_svm, _ = svc_model(X_train, y_train)
        # The base estimator is passed positionally: the keyword was renamed
        # from ``base_estimator`` to ``estimator`` in newer scikit-learn
        # releases, while the first positional slot works in both.
        clf = BaggingClassifier(base_svm, n_estimators=10, random_state=0).fit(X_train, y_train)
        params = clf.get_params()
    elif model == 'rf':
        # random forest
        clf, params = random_forest_model(X_train, y_train)
    elif model == 'gb':
        # gradient boosting
        clf, params = gradient_boosting_model(X_train, y_train)
    elif model == 'vc':
        clf1 = BaggingClassifier(SVC(kernel='linear', C=0.01), n_estimators=10, random_state=0)
        clf2 = RandomForestClassifier(n_estimators=100, max_depth=10, max_features=30)
        clf3 = GradientBoostingClassifier(n_estimators=100, max_depth=5)
        # voting classifier
        clf = VotingClassifier(estimators=[('svm', clf1), ('rf', clf2), ('gb', clf3)], voting='hard').fit(X_train, y_train)
        params = clf.get_params()
    else:
        # Fail with a clear message instead of an UnboundLocalError on `clf`.
        raise ValueError(
            "Unknown model %r; expected one of 'svm', 'rf', 'gb', 'vc'" % (model,)
        )

    # record the training time
    end1 = time.time()
    print('\n')
    print(clf)
    print('Training Time: ', end1 - start1)

    start2 = time.time()

    y_pred = clf.predict(X_test)

    end2 = time.time()

    print('Predict Time: ', end2 - start2)

    # model evaluation
    print(params)
    print('Accuracy:\t', accuracy_score(y_test, y_pred))
    print('F1 score:\t', f1_score(y_test, y_pred, average='macro'))

    return clf


In [9]:


img_mat = []  # one row per image: 900 features followed by the class label

# Process all images under these paths; the folder position gives the label (1-4)
file_pathname = ["./TrainData/A", "./TrainData/B", "./TrainData/C", "./TrainData/D"]
for label, folder in enumerate(file_pathname, start=1):
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order stable between runs.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        img = cv.imread(path)
        if img is None:
            # cv.imread returns None instead of raising when a file cannot be
            # decoded (e.g. a stray non-image file in the folder).
            print('Skipping unreadable file:', path)
            continue
        # getimg already returns a flat vector; append the label as the last entry.
        img_mat.append(np.append(getimg(img), label))

In [10]:
# Stack the per-image rows into one table: each row of img_mat holds the
# flattened 30x30 features followed by the class label as the last entry.
data = pd.DataFrame(img_mat)

(1240, 901)

In [12]:
# Replace every entry greater than 10 with 1; all other entries are kept as they are.
data = data.mask(data > 10, 1)

In [14]:
# Columns 0-899 are the features; column 900 is the label (kept as a one-column frame).
X, y = data.iloc[:, :900], data.iloc[:, [900]]

In [15]:
# Flatten the labels to a 1-D array. np.ravel accepts both the one-column frame
# and an already-flattened array, so this cell can be re-run safely.
y = np.ravel(y)

In [16]:
# Hold out 25% of the samples for testing. The seed is fixed so the split, and
# therefore the reported scores, are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [17]:
classifiers = ['svm', 'rf', 'gb', 'vc']  # names accepted by model_train_and_evaluation

# Make sure the output folder exists before writing into it.
os.makedirs("./pkl", exist_ok=True)

# Train each classifier and export the fitted model
for clf in classifiers:
    model = model_train_and_evaluation(X_train, y_train, X_test, y_test, clf)

    # `with` closes the file even if pickling fails part-way.
    with open("./pkl/"+clf+'.pkl', 'wb') as f:
        pkl.dump(model, f)




BaggingClassifier(base_estimator=SVC(C=0.01, kernel='linear'), random_state=0)
Training Time:  8.344329595565796
Predict Time:  1.0412163734436035
{'base_estimator__C': 0.01, 'base_estimator__break_ties': False, 'base_estimator__cache_size': 200, 'base_estimator__class_weight': None, 'base_estimator__coef0': 0.0, 'base_estimator__decision_function_shape': 'ovr', 'base_estimator__degree': 3, 'base_estimator__gamma': 'scale', 'base_estimator__kernel': 'linear', 'base_estimator__max_iter': -1, 'base_estimator__probability': False, 'base_estimator__random_state': None, 'base_estimator__shrinking': True, 'base_estimator__tol': 0.001, 'base_estimator__verbose': False, 'base_estimator': SVC(C=0.01, kernel='linear'), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Accuracy:	 0.9419354838709677
F1 score:	 0.9393368559830951


RandomForestClassi