### 1. Import the essential libraries

In [7]:
import os
import sys
import cv2
import random
import joblib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

from skimage.color import rgb2lab, lab2rgb
from skimage.segmentation import slic
from skimage.segmentation import mark_boundaries
from skimage.util import img_as_float

from tqdm import tqdm_notebook

### 1.1 Define some helpful functions

In [8]:
def surf_feature_extraction_v2(image, indices, hessianThreshold=400, nOctaves=3, n_keypoints=10, surf_window=20):
    # construct a mask for the segment
    surf = cv2.xfeatures2d.SURF_create(hessianThreshold=hessianThreshold, nOctaves=nOctaves)
    key_points = [cv2.KeyPoint(y, x, surf_window) for (x,y) in indices]
    key_points = sorted(key_points, key=lambda x: -x.response)[:n_keypoints]
    key_points, description = surf.compute(image, key_points)
    
    if len(key_points) != 0:
        description = description.flatten()
    else:
        description = np.array([])
        
    fearute_vector_size = (n_keypoints * 64)
    if description.size < fearute_vector_size:
        description = np.concatenate([description, np.zeros(fearute_vector_size - description.size)])
    
    return (key_points, description)

In [9]:
def calc_energy_and_amplitude(filtered_img):
    '''
    Return a list of values:
      * Local Energy = summing up the squared value of each matrix value from a response matrix
      * Mean Amplitude = sum of absolute values of each matrix value from a response matrix
    '''
    local_energy = np.sum(np.square(filtered_img))
    mean_amplitude = np.sum(np.absolute(filtered_img))
    return [local_energy, mean_amplitude]

def gabor_feature_extraction_v2(image_lab_L, theta_range=[0, np.pi/6, np.pi/4, np.pi/3, np.pi/2, 2*np.pi/3, 3*np.pi/4, 5*np.pi/6], scale_range=[3, 6, 13, 28, 58]):
    sp_local_energy = []
    sp_mean_amplitude = []
    for scale in scale_range:
        for angle in theta_range:
            g_kernel = cv2.getGaborKernel(ksize=(20, 20), sigma=scale, theta=angle, lambd=7, gamma=0.9, psi=1.5, ktype=cv2.CV_32F)
            filtered_img = cv2.filter2D(image_lab_L, cv2.CV_8UC3, g_kernel)
            
            lcl_enrgy, mn_amplitude = calc_energy_and_amplitude(filtered_img)
            
            sp_local_energy.extend([lcl_enrgy])
            sp_mean_amplitude.extend([mn_amplitude])
            
    return (sp_local_energy, sp_mean_amplitude)

### 2. Set the paths for *train* and *test* data

In [10]:
train_images_path = os.path.join("/media/andretri7/WD/Datasets/flower_images/flower_images")
res_path          = os.path.join('.', 'res_all')

### 2.1. Get the Train Images & the Color Palette

In [11]:
train_images = joblib.load(os.path.join(res_path,'trainImages.joblib'))
colorPalette = joblib.load(os.path.join(res_path,'colorPalette.joblib'))
print (f'{type(train_images)}\n{type(colorPalette)}')

<class 'numpy.ndarray'>
<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>


In [12]:
np.sort(train_images)

array(['0024.png', '0090.png', '0103.png', '0142.png', '0191.png'],
      dtype='<U8')

### 3. Get Images and Perform the Preprocessing (cont.)
  * #### 3.1: Read Image
  * #### 3.2: Normalize Values from [0, 255] to [0, 1]
  * #### 3.3: Convert Image from RGB to LAB
  * #### 3.4: Get the Superpixels for an Image using the SLIC Segmentation Algorithm
  * #### 3.5: Extract the SURF Features from an Image per Superpixel
  * #### 3.6: Extract the Gabor Features from an Image per Superpixel
  * #### 3.7: Get the Dominant Color per Superpixel
  * #### 3.8: Associate each feature vector {SURF, Gabor} with the corresponding color class of the dominant color in the color palette

In [14]:
dataset = None

for img in tqdm_notebook(train_images):
    # 3.1: Read Image
    trainImage = cv2.imread(os.path.join(train_images_path, img))
    trainImage = cv2.cvtColor(trainImage, cv2.COLOR_BGR2RGB)
    # 3.2: Normalize Values from [0, 255] to [0, 1]
    trainImageScaled = trainImage/255
    # 3.3: Convert Image from RGB to LAB
    trainImageLAB = rgb2lab(trainImageScaled)
    # 3.4: Get the Superpixels for an Image using the SLIC Segmentation Algorithm
    segments = slic(trainImageLAB, convert2lab=False, n_segments=600, sigma=5) 
    # 3.5: Extract the SURF Features from an Image per Superpixel
    # 3.6: Extract the Gabor Features from an Image per Superpixel
    # 3.7: Get the Dominant Color per Superpixel
    for (i, segment) in enumerate(np.unique(segments)):
        # Construct a mask for the segment
        mask = np.zeros(trainImage.shape[:2], dtype = "uint8")
        mask[segments == segment] = 255
        superpixel = cv2.bitwise_and(trainImageLAB, trainImageLAB, mask=mask)
        
        superpixel_colors = pd.DataFrame(np.vstack(superpixel[:, :, 1:3]), columns=['a', 'b'])    
        
        maximal_color = KMeans(n_clusters=5, n_init=4, max_iter=100, n_jobs=-1)
        maximal_color.fit(superpixel_colors.values)
        superpixel_colors['y_color'] = maximal_color.labels_
        superpixel_dominant_color = superpixel_colors.loc[superpixel_colors['y_color'] == superpixel_colors['y_color']  
                                                     .value_counts().index[1]].apply(lambda x: x.median())[['a', 'b']]  

        superpixel_dominant_color_class = colorPalette.predict(superpixel_dominant_color.values.reshape(1,-1))
          
        surf_vector = surf_feature_extraction_v2(np.uint8(trainImageLAB[:,:,0]), np.argwhere(segments == segment), n_keypoints=7, surf_window=20)[1] 
        gabor_vector = np.hstack(gabor_feature_extraction_v2(np.uint8(superpixel[:,:,0])))
        color_vector = np.array(superpixel_dominant_color_class[0])
        feature_vector = np.hstack((surf_vector, gabor_vector, color_vector))
        
        if (dataset is None):
            dataset = np.array(feature_vector)
        else:
            dataset = np.row_stack((dataset, feature_vector))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

  segments_z = grid_z[slices]
  segments_y = grid_y[slices]
  segments_x = grid_x[slices]
  segments_z = grid_z[slices]
  segments_y = grid_y[slices]
  segments_x = grid_x[slices]
  return_n_iter=True)
  return_n_iter=True)
  segments_z = grid_z[slices]
  segments_y = grid_y[slices]
  segments_x = grid_x[slices]
  return_n_iter=True)
  segments_z = grid_z[slices]
  segments_y = grid_y[slices]
  segments_x = grid_x[slices]
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)


  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  return_n_iter=True)
  segments_z = grid_z[slices]
  segments_y = grid_y[slices]
  segments_x = grid_x[slices]





In [15]:
pdDataset = pd.DataFrame(dataset)
pdDataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,519,520,521,522,523,524,525,526,527,528
0,-0.001012,-0.000482,0.001262,0.001717,0.006265,-0.010716,0.041482,0.015858,-0.003335,0.060695,...,2202.0,4567.0,3304.0,8351.0,14883.0,17389.0,15243.0,7870.0,2322.0,12.0
1,0.000772,0.001499,0.000772,0.001499,-0.020149,-0.001801,0.028495,0.012724,0.012219,-0.020807,...,3164.0,5228.0,5018.0,7441.0,11197.0,10079.0,9224.0,6163.0,3273.0,12.0
2,0.002808,0.005985,0.002856,0.006100,-0.024481,0.021553,0.029481,0.022344,0.025054,0.008829,...,9915.0,15769.0,9368.0,7052.0,5135.0,2603.0,5930.0,8160.0,10263.0,3.0
3,0.003650,0.008153,0.004745,0.010317,-0.012625,0.053723,0.029653,0.055881,0.035501,0.022364,...,9272.0,15627.0,10352.0,7787.0,6353.0,3801.0,5598.0,6965.0,9633.0,3.0
4,0.000605,-0.000584,0.001735,0.005733,-0.001821,0.056498,0.025669,0.075816,-0.004531,0.017722,...,16763.0,25608.0,17387.0,14265.0,10209.0,548.0,10206.0,14137.0,17377.0,3.0
5,-0.001519,-0.008229,0.001617,0.008955,-0.008237,-0.030063,0.024122,0.036915,0.014272,-0.041966,...,19719.0,26993.0,20215.0,16370.0,10890.0,1582.0,11054.0,16710.0,20446.0,2.0
6,-0.011139,-0.010851,0.028236,0.026329,-0.027620,-0.139666,0.143627,0.182505,0.004287,-0.004671,...,15970.0,24093.0,16396.0,12661.0,8017.0,1320.0,8134.0,12844.0,16533.0,2.0
7,-0.025543,0.000101,0.032016,0.029305,-0.052878,-0.147229,0.101631,0.189707,0.031995,-0.045220,...,4914.0,7634.0,5530.0,4618.0,3336.0,115.0,3070.0,4238.0,5095.0,0.0
8,-0.005508,0.002500,0.005508,0.002525,0.009428,-0.004328,0.054499,0.019139,-0.004889,0.002986,...,3335.0,6316.0,3910.0,2832.0,2539.0,1224.0,2240.0,2363.0,3480.0,0.0
9,0.004934,-0.004679,0.005467,0.005213,-0.012408,0.009639,0.018832,0.017417,-0.003799,0.004467,...,10647.0,17745.0,11590.0,8578.0,5316.0,1429.0,5002.0,8087.0,11050.0,3.0


# CHECKPOINT
* ### Save the train image dataset for the following notebooks

In [16]:
if not os.path.exists(os.path.join('.', 'res_all')):
    os.makedirs(os.path.join('.', 'res_all'))
    
save_path = os.path.join('.', 'res_all')

In [17]:
pdDataset.to_pickle(os.path.join(save_path, 'pdDataset.pkl'))