#Projeto 1
#Objetivo:

Expor a ideia e como serão estruturadas as funcionalidades.

In [None]:
#GLOBAIS

import os
import glob
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans

#parametros globais
train_folder = 'dataset/dataset_updated/training_set/'
test_folder = 'dataset/dataset_updated/validation_set/'
image_format = '.jpg'

In [None]:
#AUXILIARES

def getFolders(data_base):
    """Return the names of the immediate sub-directories of ``data_base``.

    The list is also printed, as a quick visual check of the classes found.

    Args:
        data_base: Path of the directory to inspect. It may or may not end
            with a path separator.

    Returns:
        A list with the name (not the full path) of every entry of
        ``data_base`` that is a directory, in ``os.listdir`` order.
    """
    # os.path.join instead of string concatenation: "base" + "name" only
    # yields a valid path when the caller remembered the trailing separator.
    data_folders = [
        name
        for name in os.listdir(data_base)
        if os.path.isdir(os.path.join(data_base, name))
    ]
    print(data_folders)

    return data_folders

def load_images(folder):
    """Read every entry of ``folder`` with ``cv2.imread`` and return the images.

    Entries for which ``cv2.imread`` returns ``None`` are left out of the
    result.

    Args:
        folder: Directory whose entries are read.

    Returns:
        A list with the successfully read images, in ``os.listdir`` order.
    """
    candidates = (
        cv2.imread(os.path.join(folder, entry)) for entry in os.listdir(folder)
    )
    return [picture for picture in candidates if picture is not None]

def resize_all_images(images):
    """Resize every image to the average width and height of the collection.

    Args:
        images: Sequence of image arrays whose first two shape entries are
            height and width (2-D grayscale and 3-D colour arrays both work).

    Returns:
        A new list with each image resized via ``cv2.resize`` to
        ``(mean_width, mean_height)``; the means are truncated to integers.
        An empty input yields an empty list.
    """
    # Nothing to average: avoid the division by zero below.
    if len(images) == 0:
        return []

    # Only the first two shape entries are needed, so images without a
    # channel axis do not break the unpacking.
    heights, widths = zip(*(im.shape[:2] for im in images))
    width = sum(widths) // len(images)
    height = sum(heights) // len(images)

    # cv2.resize takes the target size as (width, height).
    return [cv2.resize(im, (width, height)) for im in images]
        

In [None]:
#Tarefa 1: Pre-processamento

In [None]:
def choosePreProcess(im, param):
    """Apply the pre-processing step selected in ``param`` to ``im``.

    Args:
        im: Image handed to the selected pre-processing function.
        param: Parameter dictionary. ``param['pre_process']`` must be a dict
            whose ``'method'`` entry is one of ``'Clahe'``, ``'Eq_Hist'``,
            ``'Quant_Linear'``, ``'Median'``, ``'Gaussian'``, ``'Blur'``,
            ``'Bilateral'`` or ``'BrilhoContraste'``. Method-specific
            settings (``'K_Value'``, ``'brilho'``, ``'contraste'``) may live
            either at the top level of ``param`` or inside
            ``param['pre_process']``.

    Returns:
        Whatever the selected pre-processing function returns.

    Raises:
        ValueError: If the method name is not one of the names listed above.
    """
    settings = param['pre_process']
    method = settings['method']

    # The handlers read their settings from the top level of the dict they
    # receive, while the search space nests them under 'pre_process'.
    # Handing over a merged view satisfies both layouts.
    handler_params = {**param, **settings}

    if method == 'Clahe':
        return doClahe(im, handler_params)
    if method == 'Eq_Hist':
        return doEqualizazaoHistograma(im, handler_params)
    if method == 'Quant_Linear':
        return doQuantizacaoLinear(im, handler_params)
    if method == 'Median':
        return doMedian(im, handler_params)
    if method == 'Gaussian':
        return doGaussian(im, handler_params)
    if method == 'Blur':
        return doBlur(im, handler_params)
    if method == 'Bilateral':
        return doBilateral(im, handler_params)
    if method == 'BrilhoContraste':
        return doBrilhoContraste(im, handler_params)

    # Falling through used to return None, which only failed later and far
    # away from the actual mistake (e.g. a misspelled method name).
    raise ValueError("Unknown pre-processing method: {!r}".format(method))

        
def doBlur(im,params):
    """Return ``im`` smoothed by ``cv2.blur`` with a 5x5 kernel.

    ``params`` is not read by this function.
    """
    kernel_size = (5, 5)
    return cv2.blur(im, kernel_size)

def doMedian(im, params):
    """Return ``im`` filtered by ``cv2.medianBlur`` with aperture size 5.

    ``params`` is not read by this function.
    """
    aperture_size = 5
    return cv2.medianBlur(im, aperture_size)

def doGaussian(im, params):
    """Return ``im`` smoothed by ``cv2.GaussianBlur`` with a 5x5 kernel.

    The third argument (sigma along X) is passed as 0. ``params`` is not read
    by this function.
    """
    kernel_size = (5, 5)
    sigma_x = 0
    return cv2.GaussianBlur(im, kernel_size, sigma_x)

def doBilateral(im, params):
    """Return ``im`` filtered by ``cv2.bilateralFilter``.

    The filter is called with diameter 9 and both sigma arguments set to 75.
    ``params`` is not read by this function.
    """
    diameter = 9
    sigma_color = 75
    sigma_space = 75
    return cv2.bilateralFilter(im, diameter, sigma_color, sigma_space)

def doClahe(im, params):
    """Apply CLAHE to the V channel of ``im`` and return the result as BGR.

    The image is converted BGR -> HSV, the third channel is replaced by the
    output of a CLAHE object (``clipLimit=2.0``, ``tileGridSize=(7, 7)``) and
    the image is converted back HSV -> BGR.

    Args:
        im: Image passed to ``cv2.cvtColor`` with ``COLOR_BGR2HSV``.
        params: Not read by this function.

    Returns:
        The equalized image converted back with ``COLOR_HSV2BGR``.
    """
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(7, 7))
    hsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
    hsv[:, :, 2] = clahe.apply(hsv[:, :, 2])
    # Return the converted result; the previous version computed it and then
    # returned the unmodified input instead.
    return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

def doEqualizazaoHistograma(im, params):
    """Equalize the histogram of the V channel of ``im``.

    The image goes BGR -> HSV, the third channel is replaced by the output of
    ``cv2.equalizeHist`` and the image is converted back HSV -> BGR.
    ``params`` is not read by this function.
    """
    hsv_image = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
    value_channel = hsv_image[:, :, 2]
    hsv_image[:, :, 2] = cv2.equalizeHist(value_channel)
    return cv2.cvtColor(hsv_image, cv2.COLOR_HSV2BGR)

def doQuantizacaoLinear(im, params):
    """Quantize the colours of ``im`` to ``params['K_Value']`` clusters.

    Despite the name, the quantization is done by clustering: the image is
    converted BGR -> LAB, its pixels are clustered with ``MiniBatchKMeans``,
    every pixel is replaced by its cluster centre (cast to ``uint8``) and the
    result is converted back LAB -> BGR.

    Args:
        im: Image whose first two shape entries are height and width.
        params: Dictionary providing ``'K_Value'``, the number of clusters.

    Returns:
        The quantized image with the same height and width and 3 channels.
    """
    rows, cols = im.shape[:2]

    # One row per pixel, one column per LAB channel.
    lab_pixels = cv2.cvtColor(im, cv2.COLOR_BGR2LAB).reshape((rows * cols, 3))

    kmeans = MiniBatchKMeans(n_clusters=params['K_Value'])
    assignments = kmeans.fit_predict(lab_pixels)
    palette = kmeans.cluster_centers_.astype("uint8")

    quantized_lab = palette[assignments].reshape((rows, cols, 3))
    return cv2.cvtColor(quantized_lab, cv2.COLOR_LAB2BGR)

def doBrilhoContraste(im, params):
    """Adjust brightness and contrast of ``im`` with ``cv2.addWeighted``.

    Args:
        im: Input image; it is copied, not modified, when no brightness
            change is requested.
        params: Dictionary providing ``'brilho'`` (brightness) and
            ``'contraste'`` (contrast). A value of 0 skips that adjustment.

    Returns:
        The adjusted image.
    """
    brightness = params['brilho']
    if brightness == 0:
        adjusted = im.copy()
    else:
        # Positive values raise the lower bound, negative values lower the
        # upper bound of the output range.
        shadow = brightness if brightness > 0 else 0
        highlight = 255 if brightness > 0 else 255 + brightness
        scale = (highlight - shadow) / 255
        adjusted = cv2.addWeighted(im, scale, im, 0, shadow)

    contrast = params['contraste']
    if contrast != 0:
        factor = 131 * (contrast + 127) / (127 * (131 - contrast))
        offset = 127 * (1 - factor)
        adjusted = cv2.addWeighted(adjusted, factor, adjusted, 0, offset)

    return adjusted



In [None]:
#TESTE
# Smoke test: load the sample folder, bring every image to a common size,
# run one pre-processing method over all of them and display the first result.
images = load_images('dataset/dataset_updated/training_set/teste/')
images = resize_all_images(images)

# choosePreProcess reads params['pre_process']['method'], so 'pre_process'
# has to be a dict; a bare string here raises
# "TypeError: string indices must be integers".
params_teste = {
    'pre_process': {'method': 'Gaussian'},
    'K_Value': 10,
    'brilho': 64,
    'contraste': 64,
}

images_pre_processed = [choosePreProcess(image, params_teste) for image in images]

# Only open a window when there is something to show.
if images_pre_processed:
    cv2.imshow("test", images_pre_processed[0])
    cv2.waitKey(0)
    cv2.destroyAllWindows()


TypeError: string indices must be integers

In [None]:
#Tarefa 2: Extração de Característica 

In [None]:
from skimage.feature import local_binary_pattern
import cv2

def chooseFeats(im, params):
    """Run the feature extractor selected in ``params`` on ``im``.

    Two parameter layouts are accepted:

    * nested: ``params['feature_extractor']`` is a dict whose ``'method'``
      entry names the extractor and whose other entries are its settings
      (the layout of the hyper-parameter search space);
    * flat: ``params['feat']`` names the extractor and its settings live at
      the top level of ``params``.

    Recognized extractor names are ``'lbp'``, ``'color_hist'`` and ``'glcm'``.

    Args:
        im: Image handed to the selected extractor.
        params: Parameter dictionary in one of the layouts above.

    Returns:
        Whatever the selected extractor returns.

    Raises:
        ValueError: If the extractor name is not recognized.
    """
    nested = params.get("feature_extractor")
    if isinstance(nested, dict):
        method = nested["method"]
        # The extractors read their settings from the top level of the dict
        # they receive, so give them a merged view.
        handler_params = {**params, **nested}
    else:
        method = params["feat"]
        handler_params = params

    if method == "lbp":
        return doLBP(im, handler_params)
    if method == "color_hist":
        return doColorHistogram(im, handler_params)
    if method == "glcm":
        return doGLCM(im, handler_params)

    # Falling through used to return None and fail later, far from the cause.
    raise ValueError("Unknown feature extractor: {!r}".format(method))


def doLBP(im, params):
    """Return a 256-bin histogram of the local binary pattern map of ``im``.

    Args:
        im: Image array. A 3-dimensional array is converted to grayscale with
            ``COLOR_BGR2GRAY`` first (assumes BGR channel order, as produced
            by ``cv2.imread`` — confirm for other sources).
        params: Dictionary providing ``'n_points'``, ``'radius'`` and
            ``'methods'``, passed in that order to ``local_binary_pattern``.

    Returns:
        The flattened histogram computed by ``cv2.calcHist`` over the range
        [0, 256) with 256 bins.
    """
    # local_binary_pattern works on a single-channel image.
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) if im.ndim == 3 else im
    lbp = local_binary_pattern(
        gray, params["n_points"], params["radius"], params["methods"]
    )
    # calcHist takes images, channels, mask, histSize and ranges as separate
    # arguments and does not accept float64 input, hence the float32 cast.
    hist = cv2.calcHist([lbp.astype(np.float32)], [0], None, [256], [0, 256])
    return hist.ravel()

def doColorHistogram(im, params):
    """Return the concatenated per-channel histograms of ``im``.

    Args:
        im: Image in BGR channel order (the order used by the pre-processing
            functions of this file).
        params: Dictionary providing ``'color_space'``: one of ``'LAB'``,
            ``'YCrCb'``, ``'HSV'`` or ``'RGB'``.

    Returns:
        A flat array with one 256-bin histogram (range [0, 256)) per channel
        of the converted image, concatenated in channel order.

    Raises:
        ValueError: If ``params['color_space']`` is not one of the names
            listed above.
    """
    # The input is BGR, so the BGR2* conversion codes are the right ones;
    # the RGB2* codes would swap the red and blue channels before converting.
    conversions = {
        'LAB': cv2.COLOR_BGR2LAB,
        'YCrCb': cv2.COLOR_BGR2YCrCb,
        'HSV': cv2.COLOR_BGR2HSV,
        'RGB': cv2.COLOR_BGR2RGB,
    }
    color_space = params['color_space']
    if color_space not in conversions:
        raise ValueError("Unknown color space: {!r}".format(color_space))
    im = cv2.cvtColor(im, conversions[color_space])

    channel_hists = [
        cv2.calcHist([channel], [0], None, [256], [0, 256])
        for channel in cv2.split(im)
    ]
    return np.concatenate(channel_hists).ravel()

def doGLCM(im, params):
    """Return texture properties of the gray-level co-occurrence matrix.

    Args:
        im: Image array. A 3-dimensional array is converted to grayscale with
            ``COLOR_BGR2GRAY`` first (assumes BGR channel order, as produced
            by ``cv2.imread`` — confirm for other sources).
        params: Dictionary providing ``'distances'`` and ``'angles'``; each
            may be a single number or a sequence of numbers.

    Returns:
        A flat array with the contrast, dissimilarity, homogeneity, energy,
        correlation and ASM values, in that order, for every requested
        distance/angle combination.
    """
    # Imported here because the file never imports these names. scikit-image
    # renamed grey* to gray*; accept either spelling.
    try:
        from skimage.feature import graycomatrix, graycoprops
    except ImportError:
        from skimage.feature import greycomatrix as graycomatrix
        from skimage.feature import greycoprops as graycoprops

    # The co-occurrence matrix is computed on a single-channel image.
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) if im.ndim == 3 else im

    # Accept scalars as well as sequences for both settings.
    distances = np.atleast_1d(params["distances"])
    angles = np.atleast_1d(params["angles"])

    glcm = graycomatrix(gray, distances, angles)
    properties = (
        "contrast",
        "dissimilarity",
        "homogeneity",
        "energy",
        "correlation",
        "ASM",
    )
    features = [graycoprops(glcm, prop) for prop in properties]

    return np.array(features).ravel()


#Executa em conjunto as duas primeiras etapas

In [None]:
def getPreProcessAndExtractFeatures(path_folder, params):
    """Pre-process every image under ``path_folder`` and extract its features.

    Each sub-directory of ``path_folder`` (as returned by ``getFolders``) is
    treated as one class; files matching ``*`` + ``image_format`` inside it
    are read, passed through ``choosePreProcess`` and then ``chooseFeats``.

    Args:
        path_folder: Directory containing one sub-directory per class.
        params: Parameter dictionary forwarded to ``choosePreProcess`` and
            ``chooseFeats``.

    Returns:
        A tuple ``(data, labels)`` of arrays: the feature vectors and, for
        each of them, the name of the sub-directory it came from.
    """
    classes_folders = getFolders(path_folder)

    data = []
    labels = []
    for f in classes_folders:
        # os.path.join keeps the pattern valid whether or not path_folder
        # ends with a separator.
        pattern = os.path.join(path_folder, f, "*" + image_format)
        for arq in glob.glob(pattern):
            im = cv2.imread(arq)
            # cv2.imread returns None for unreadable files; skip them instead
            # of crashing inside the pre-processing step.
            if im is None:
                continue

            im = choosePreProcess(im, params)
            feats = chooseFeats(im, params)

            data.append(feats)
            labels.append(f)
    return np.asarray(data), np.asarray(labels)

# Build the feature matrix and the label vector, then print their shapes.
# NOTE(review): getBase is not defined anywhere in the visible part of this
# file; it may be an earlier name of getPreProcessAndExtractFeatures — confirm
# before running this cell.
X_base, y_base = getBase()	
print(X_base.shape)
print(y_base.shape)



['2', '1', '0']


#Tarefa 3: Seleção de Característica

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression


#nesse ponto, a base já deve ter passado pela etapa 1 e etapa 2
def chooseBestFeats(params):
    """Run the feature-selection technique named in ``params['selection']``.

    Only ``'rfe'`` is handled; it delegates to ``doRFE``.

    Args:
        params: Parameter dictionary providing ``'selection'``.

    Returns:
        The result of ``doRFE()`` when ``'rfe'`` is selected, otherwise
        ``None``.
    """
    # The previous version was missing the colon after the condition and
    # read the undefined name ``param``.
    if params['selection'] == 'rfe':
        return doRFE()
    return None

def evalBestFeats():
    """Score a candidate feature selection (not implemented yet).

    Planned behaviour:
      1. train with cross-validation;
      2. return the resulting score.

    Raises:
        NotImplementedError: Always, until the evaluation is written.
    """
    # A function body made only of comments is a syntax error; fail loudly
    # so that a call to this placeholder cannot go unnoticed.
    raise NotImplementedError("evalBestFeats is not implemented yet")

#problema aqui: o pre-processamento deve ser aplicado a toda a base
#mas, até escolher o melhor, ele não deve aplicar as alterações
#ou seja, a base original deve permanecer inalterada
def doRFE():
    """Fit recursive feature elimination with cross-validation (RFECV).

    A ``LogisticRegression`` (``liblinear`` solver, ``random_state=42``) is
    wrapped in ``RFECV`` with ``step=1``, ``cv=5``,
    ``min_features_to_select=100`` and accuracy scoring, and fitted on
    ``X_train`` / ``y_train``.

    Returns:
        The fitted ``RFECV`` selector. Call its ``transform`` to obtain the
        filtered feature vectors.
    """
    # TODO: define the search grid for RFECV (estimator, step,
    # min_features_to_select). Open questions from the original notes:
    #   - with the sklearn searchers we need an estimator that replaces the
    #     evaluation function;
    #   - alternatively write our own search, passing the evaluation function
    #     and the parameter dictionary;
    #   - RandomSearch or GridSearch?
    # Each grid combination is meant to be scored with evalBestFeats().

    lr = LogisticRegression(random_state=42, solver='liblinear')
    rfecv = RFECV(
        estimator=lr,
        step=1,
        cv=5,
        min_features_to_select=100,
        scoring='accuracy',
    )
    # NOTE(review): X_train and y_train are not defined in the visible part
    # of this file; they must exist as globals before this is called — confirm.
    rfecv.fit(X_train, y_train)

    # TODO: once the grid above exists, return the selector built with the
    # best parameters rather than this fixed configuration.
    return rfecv

#Tarefa 4: Seleção de classificador

In [None]:
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, cohen_kappa_score, confusion_matrix

import pandas as pd
import numpy as np
import cv2

import os, glob

from joblib import dump, load


def report_best_scores(results, n_top=3):
    """Print the candidates ranked 1 to ``n_top`` in a search result.

    Args:
        results: Mapping with the entries ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
            (the layout of a scikit-learn ``cv_results_`` dictionary).
        n_top: Highest rank to report.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Several candidates may share the same rank; report all of them.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

#nesse ponto, a base já deve ter passado pela etapa 1, etapa 2 e etapa 3
def chooseBestClassifier(params):
    """Dispatch to the classifier routine named in ``params['classifier']``.

    Only ``'randomforest'`` is handled; it calls ``doRandomForest()``. Nothing
    is returned in either case.
    """
    if params['classifier'] != 'randomforest':
        return None
    # NOTE(review): doRandomForest is not defined in the visible part of this
    # file — confirm where it lives.
    doRandomForest()


class XGBoost_Classifier:
    """Hyper-parameter search and single training run for XGBoost.

    Attributes are set in ``__init__``: ``num_class`` (number of classes) and
    ``best_scores`` (callback receiving ``cv_results_`` and a rank count).
    """

    def __init__(self, qtd_classes, best_scores):
        """Store the number of classes and the reporting callback."""
        self.num_class = qtd_classes
        self.best_scores = best_scores

    def search_model(self, X, y, grid, steps):
        """Run a randomized search over ``grid`` and report the best result.

        The search uses ``XGBClassifier`` with the ``multi:softprob``
        objective, 200 iterations and 3-fold cross-validation; its
        ``cv_results_`` are passed to ``self.best_scores`` with a count of 1.
        ``steps`` is not read by this method.
        """
        xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=42)

        search = RandomizedSearchCV(xgb_model, param_distributions=grid, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

        search.fit(X, y)

        self.best_scores(search.cv_results_, 1)

    def train_single(self, X_train, Y_train, X_test, Y_test, params, steps):
        """Train one booster with ``params`` and print its test metrics.

        Args:
            X_train, Y_train: Training features and labels.
            X_test, Y_test: Evaluation features and labels.
            params: Parameter dictionary handed to ``xgb.train``.
            steps: Number of boosting rounds.

        Returns:
            A dict with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` (both macro-averaged) and ``'kappa'``.
        """
        # xgb.train works on DMatrix objects; the previous version referenced
        # D_train / D_test (and ``param``) without ever defining them.
        D_train = xgb.DMatrix(X_train, label=Y_train)
        D_test = xgb.DMatrix(X_test, label=Y_test)

        model = xgb.train(params, D_train, steps)
        preds = model.predict(D_test)
        # One row of scores per sample: keep the index of the largest one.
        preds = np.asarray([np.argmax(line) for line in preds])

        acc = accuracy_score(Y_test, preds)
        prec = precision_score(Y_test, preds, average='macro')
        rec = recall_score(Y_test, preds, average='macro')
        kpp = cohen_kappa_score(Y_test, preds)
        print("Accuracy = {}".format(acc))
        print("Precision = {}".format(prec))
        print("Recall = {}".format(rec))
        print("Kappa = {}".format(kpp))

        return {'accuracy': acc, 'precision': prec, 'recall': rec, 'kappa': kpp}
        
class SVM_Classifier:
    """Hyper-parameter search and single training run for an SVC."""

    def __init__(self, best_scores):
        """Store the callback that receives ``cv_results_`` and a rank count."""
        self.best_scores = best_scores

    def search_model(self, X, y, grid):
        """Run a randomized search over ``grid`` and report the best result.

        The search uses ``SVC(probability=True)``, 2 iterations and 3-fold
        cross-validation; its ``cv_results_`` are passed to
        ``self.best_scores`` with a count of 1.
        """
        svc = SVC(probability=True)

        search = RandomizedSearchCV(svc, param_distributions=grid, random_state=42, n_iter=2, cv=3, verbose=3, n_jobs=1, return_train_score=True)

        search.fit(X, y)

        self.best_scores(search.cv_results_, 1)

    def train_single(self, X_train, Y_train, X_test, Y_test, params):
        """Train one SVC with ``params`` and print its test metrics.

        Args:
            X_train, Y_train: Training features and labels.
            X_test, Y_test: Evaluation features and labels.
            params: Dict of estimator parameters.

        Returns:
            A dict with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'kappa'``.
        """
        svc = SVC(probability=True)
        # set_params takes keyword arguments, so the dict must be unpacked.
        svc.set_params(**params)

        model = svc.fit(X_train, Y_train)
        y_predicted = model.predict(X_test)

        # The metric functions are imported by name at the top of this cell;
        # there is no ``sk`` module alias in the file.
        acc = accuracy_score(Y_test, y_predicted)
        # NOTE(review): index 1 reports the second class only — confirm this
        # is intended for a dataset with more than two classes.
        prec = precision_score(Y_test, y_predicted, average=None)[1]
        rec = recall_score(Y_test, y_predicted, average=None)[1]
        kpp = cohen_kappa_score(Y_test, y_predicted)
        print("Accuracy: {:.1%}".format(acc))
        print("Precision: {:.1%}".format(prec))
        print("Recall: {:.1%}".format(rec))
        print("Kappa: {:.1%}".format(kpp))

        return {'accuracy': acc, 'precision': prec, 'recall': rec, 'kappa': kpp}

class KNN_Classifier:
    """Hyper-parameter search and single training run for k-nearest neighbours."""

    def __init__(self, best_scores):
        """Store the callback that receives ``cv_results_`` and a rank count."""
        self.best_scores = best_scores

    def search_model(self, X, y, grid):
        """Run a randomized search over ``grid`` and report the best result.

        The search uses ``KNeighborsClassifier``, 200 iterations and 3-fold
        cross-validation; its ``cv_results_`` are passed to
        ``self.best_scores`` with a count of 1.
        """
        knn = KNeighborsClassifier()

        search = RandomizedSearchCV(knn, param_distributions=grid, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

        search.fit(X, y)

        self.best_scores(search.cv_results_, 1)

    def train_single(self, X_train, Y_train, X_test, Y_test, params):
        """Train one classifier with ``params`` and print its test metrics.

        Args:
            X_train, Y_train: Training features and labels.
            X_test, Y_test: Evaluation features and labels.
            params: Dict of estimator parameters.

        Returns:
            A dict with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'kappa'``.
        """
        knn = KNeighborsClassifier()
        # set_params takes keyword arguments, so the dict must be unpacked.
        knn.set_params(**params)

        model = knn.fit(X_train, Y_train)
        y_predicted = model.predict(X_test)

        # The metric functions are imported by name at the top of this cell;
        # there is no ``sk`` module alias in the file.
        acc = accuracy_score(Y_test, y_predicted)
        # NOTE(review): index 1 reports the second class only — confirm this
        # is intended for a dataset with more than two classes.
        prec = precision_score(Y_test, y_predicted, average=None)[1]
        rec = recall_score(Y_test, y_predicted, average=None)[1]
        kpp = cohen_kappa_score(Y_test, y_predicted)
        print("Accuracy: {:.1%}".format(acc))
        print("Precision: {:.1%}".format(prec))
        print("Recall: {:.1%}".format(rec))
        print("Kappa: {:.1%}".format(kpp))

        return {'accuracy': acc, 'precision': prec, 'recall': rec, 'kappa': kpp}
        
class RandomForest_Classifier:
    """Hyper-parameter search and single training run for a random forest."""

    def __init__(self, best_scores):
        """Store the callback that receives ``cv_results_`` and a rank count."""
        self.best_scores = best_scores

    def search_model(self, X, y, grid):
        """Run a randomized search over ``grid`` and report the best result.

        The search uses ``RandomForestClassifier``, 200 iterations and 3-fold
        cross-validation; its ``cv_results_`` are passed to
        ``self.best_scores`` with a count of 1.
        """
        rfc = RandomForestClassifier()

        search = RandomizedSearchCV(rfc, param_distributions=grid, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

        search.fit(X, y)

        self.best_scores(search.cv_results_, 1)

    def train_single(self, X_train, Y_train, X_test, Y_test, params):
        """Train one forest with ``params`` and print its test metrics.

        Args:
            X_train, Y_train: Training features and labels.
            X_test, Y_test: Evaluation features and labels.
            params: Dict of estimator parameters.

        Returns:
            A dict with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'kappa'``.
        """
        rfc = RandomForestClassifier()
        # set_params takes keyword arguments, so the dict must be unpacked.
        rfc.set_params(**params)

        model = rfc.fit(X_train, Y_train)
        y_predicted = model.predict(X_test)

        # The metric functions are imported by name at the top of this cell;
        # there is no ``sk`` module alias in the file.
        acc = accuracy_score(Y_test, y_predicted)
        # NOTE(review): index 1 reports the second class only — confirm this
        # is intended for a dataset with more than two classes.
        prec = precision_score(Y_test, y_predicted, average=None)[1]
        rec = recall_score(Y_test, y_predicted, average=None)[1]
        kpp = cohen_kappa_score(Y_test, y_predicted)
        print("Accuracy: {:.1%}".format(acc))
        print("Precision: {:.1%}".format(prec))
        print("Recall: {:.1%}".format(rec))
        print("Kappa: {:.1%}".format(kpp))

        return {'accuracy': acc, 'precision': prec, 'recall': rec, 'kappa': kpp}

#Em comum: otimizador

In [None]:
BASE_PATH_TRAIN = "dataset/dataset_updated/training_set/"
BASE_PATH_VALIDATION = "dataset/dataset_updated/validation_set"

def load_database(path: str, params: dict):
    """Load and pre-process the images stored one class per sub-directory.

    Every file matching ``*.*`` inside each sub-directory of ``path`` is read
    with ``cv2.imread`` and passed through ``choosePreProcess``. The label of
    an image is the index of its sub-directory in the alphabetically sorted
    list of sub-directories, so the numbering is the same for every dataset
    that uses the same class names.

    Args:
        path: Directory containing one sub-directory per class; a trailing
            separator is optional.
        params: Parameter dictionary forwarded to ``choosePreProcess``.

    Returns:
        A tuple ``(features, labels)`` of arrays. ``features`` is currently
        always empty because the feature-extraction step is not wired in yet.
    """
    # Sorted, directories only: os.listdir order is arbitrary and may include
    # stray files, which would make label indices differ between datasets.
    folders = sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )

    features = []
    labels = []
    error_images = []
    for class_index, folder in enumerate(folders):
        # os.path.join keeps the pattern valid without a trailing separator.
        for img in glob.glob(os.path.join(path, folder, "*.*")):
            image = cv2.imread(img)
            if image is None:
                error_images.append(img)
                continue

            # TODO: resize with zero padding to the target size (224x224).

            image = choosePreProcess(image, params)  # selected pre-processing
            # TODO: extract features with chooseFeats(image, params) and
            # append them to ``features``.

            labels.append(class_index)

    if error_images:
        print("load_database: skipped {} unreadable file(s) under {}".format(
            len(error_images), path))

    features = np.array(features)
    labels = np.array(labels)

    return features, labels

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import hp
import random

# Hyperopt search space: one pre-processing method and one feature extractor
# per trial, each carrying its own method-specific settings.
param_space = {
    'pre_process': hp.choice('preprocess', [
        {
            'method': 'Clahe'
        },
        {
            'method': 'Eq_Hist'
        },
        {
            'method': 'Quant_Linear',
            'K_Value': hp.choice('K_Value', [8, 16, 32, 64])
        },
        {
            'method': 'Median'
        },
        {
            'method': 'Gaussian'
        },
        {
            'method': 'Blur'
        },
        {
            'method': 'Bilateral'
        },
        {
            # Spelled exactly as choosePreProcess expects it.
            'method': 'BrilhoContraste',
            'brilho': hp.choice('brilho', [0, 32, 64, 127]),
            'contraste': hp.choice('contraste', [0, 32, 64, 127])
        }
    ]),
    'feature_extractor': hp.choice('feature_extractor', [
        {
            'method': 'lbp',
            'n_points': hp.choice('n_points', range(2, 9)),
            'radius': hp.choice('radius', range(2, 9)),
            # Stored under 'methods' (the key doLBP reads); a second 'method'
            # key would overwrite the extractor name above.
            'methods': hp.choice('method', ['default', 'ror', 'uniform', 'nri_uniform', 'var']),
        },
        {
            'method': 'glcm',
            # doGLCM reads 'distances' / 'angles'; the co-occurrence matrix
            # takes sequences, so every option is a one-element list.
            'distances': hp.choice('distance', [[d] for d in range(2, 9)]),
            'angles': hp.choice('angles', [[0], [np.pi/4], [np.pi/3], [np.pi/2], [(3*np.pi)/4]])
        },
        {
            'method': 'color_hist',
            'color_space': hp.choice('color_space', ['LAB', 'YCrCb', 'HSV', 'RGB'])
        }
    ])
}

def acc_model(params):
    """Print ``params`` and return a random value drawn from [0.8, 1.0].

    The value comes from ``random.uniform`` and does not depend on ``params``.
    """
    print(params)
    simulated_score = random.uniform(0.8, 1.0)
    return simulated_score

def hyperopt_fitness(params: dict):
    """Objective handed to hyperopt's ``fmin``; currently returns a constant.

    The sampled ``params`` are printed and both datasets are loaded with
    them, but the loaded data is not used yet and the result is always 0.85.

    Args:
        params: One sample of the search space.

    Returns:
        The constant ``0.85``.
    """
    print(params)
    train_set = load_database(BASE_PATH_TRAIN, params)
    validation_set = load_database(BASE_PATH_VALIDATION, params)
    features_train, labels_train = train_set
    features_test, labels_test = validation_set

    # TODO: planned pipeline, not wired in yet:
    #   1. fit a StandardScaler on the training features and transform both sets;
    #   2. fit the feature selector (chooseFeaturesSelection) and transform both sets;
    #   3. pick the classifier (chooseBestClassifier) and score it with train_single.
    # NOTE(review): fmin reports the returned value as a loss — confirm that
    # an accuracy-like score should not be negated before it is returned.
    placeholder_score = 0.85
    return placeholder_score

# Run 10 evaluations of the objective over the search space with the TPE
# algorithm, recording every evaluation in ``trials``.
trials = Trials()
best = fmin(
    fn=hyperopt_fitness,
    space=param_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

#Predict
#...

{'feature_extractor': {'method': 'methodname1', 'params ..': '...'}, 'pre_process': {'brilho': 0, 'contraste': 0, 'method': 'BrilhoContrate'}}
{'feature_extractor': {'method': 'methodname1', 'params ..': '...'}, 'pre_process': {'method': 'Bilateral'}}           
{'feature_extractor': {'method': 'methodname2', 'params ..': '...'}, 'pre_process': {'method': 'Clahe'}}               
 20%|████████████▍                                                 | 2/10 [04:47<17:25, 130.63s/trial, best loss: 0.85]