# Libraries

In [25]:
import os
from pathlib import Path
import numpy as np
import pickle
import pandas as pd

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import scipy

In [26]:
# Directory layout, resolved from the directory the kernel is running in:
#   features_path  -> current working directory
#   notebooks_path -> its parent
#   repo_path      -> its grandparent (used as the root for every data path)
features_path = Path.cwd()
notebooks_path, repo_path = features_path.parent, features_path.parent.parent
os.chdir(str(features_path))
# Show the parent of the working directory (not the working directory itself)
print(notebooks_path)

/home/ricardino/Documents/MAIA/tercer_semestre/CAD/Projecte/Machine_Learning/notebooks


# Functions and classes

In [27]:
class path_label():
    """Access image paths, labels and FOV coordinates listed in the metadata csv.

    The metadata table is filtered by classification task and by data split,
    and the matching rows are exposed as attributes.

    Args:
        meta (pd.DataFrame, optional): metadata table containing the columns
            ``classif``, ``set``, ``path``, ``label``, ``FOV_x1``, ``FOV_x2``,
            ``FOV_y1`` and ``FOV_y2``. When None (default) the table is read
            from ``<repo_path>/data/meta_info.csv`` (tab separated) each time
            an instance is created.
        classif (str, optional): value of the ``classif`` column to keep.
            Defaults to 'binary'.
        set_name (str, optional): value of the ``set`` column to keep.
            Defaults to 'train'.

    Attributes:
        paths (list): values of the ``path`` column for the selected rows.
        labels (np.ndarray): values of the ``label`` column for the selected rows.
        FOV_x1, FOV_x2, FOV_y1, FOV_y2 (np.ndarray): FOV columns cast to int16.
    """
    def __init__(self, meta=None, classif='binary', set_name='train') -> None:
        # Load lazily: a ``pd.read_csv(...)`` default argument would be evaluated
        # once, when the class is defined, forcing file I/O at definition time
        # and sharing one (possibly stale) table between all instances.
        if meta is None:
            meta = pd.read_csv(str(repo_path) + '/data/meta_info.csv', sep='\t')
        meta = meta.loc[meta['classif'] == classif] #Filter by classif
        meta = meta.loc[meta['set'] == set_name] #Filter by set
        self.paths = list(meta.path)
        self.labels = np.array(meta.label)
        self.FOV_x1 = np.array(meta.FOV_x1, dtype=np.int16)
        self.FOV_x2 = np.array(meta.FOV_x2, dtype=np.int16)
        self.FOV_y1 = np.array(meta.FOV_y1, dtype=np.int16)
        self.FOV_y2 = np.array(meta.FOV_y2, dtype=np.int16)

In [28]:
def save_pickle(file, filename):
    """Serialize an object to disk with pickle.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is dumped with the highest pickle protocol available.

    Args:
        file (obj): object to serialize
        filename (str): destination path of the pickle file
    """
    with open(filename, mode='wb') as out_stream:
        pickle.dump(file, out_stream, protocol=pickle.HIGHEST_PROTOCOL)

def open_pickle(filename):
    """Load an object from a pickle file.

    Args:
        filename (str): path of the pickle file to read

    Returns:
        obj: object deserialized from the file
    """
    # NOTE: unpickling can execute arbitrary code; only open files you trust.
    with open(filename, mode='rb') as in_stream:
        loaded = pickle.load(in_stream)
    return loaded

In [38]:
# Sentinel meaning "no scaler argument supplied". None is not used for this
# because a Pipeline step set to None is a valid value (the step is skipped).
_DEFAULT_SCALER = object()


def classifier(method, grid_type='normal', scaler=_DEFAULT_SCALER, verbose=4):
    """Build the scaler + classifier pipeline and its hyper-parameter search.

    Args:
        method (str): classifier name, one of 'KNN', 'RF' or 'SVM'.
        grid_type (str, optional): search strategy, 'normal' (exhaustive grid)
            or 'random' (randomized search, 100 sampled candidates). Only
            consulted when method is 'SVM'; 'KNN' and 'RF' always use an
            exhaustive grid. Defaults to 'normal'.
        scaler (scikit object, optional): first step of the pipeline. When
            omitted, a new StandardScaler() is created for every call.
        verbose (int, optional): verbose level passed to the search object.
            Defaults to 4.

    Returns:
        tuple: ``(grid, pipe)`` where ``pipe`` is the unfitted Pipeline
        (steps 'scaler' and 'classifier') and ``grid`` is the unfitted
        GridSearchCV / RandomizedSearchCV wrapping that pipeline.

    Raises:
        ValueError: if ``method`` is not supported, or if ``method`` is 'SVM'
            and ``grid_type`` is neither 'normal' nor 'random'.
    """
    # Validate first: the original fell through and returned None, which only
    # failed later (and obscurely) when the caller unpacked the result.
    if method not in ('KNN', 'RF', 'SVM'):
        raise ValueError(f"Unsupported method {method!r}; expected 'KNN', 'RF' or 'SVM'.")
    if method == 'SVM' and grid_type not in ('normal', 'random'):
        raise ValueError(f"Unsupported grid_type {grid_type!r}; expected 'normal' or 'random'.")

    # A StandardScaler() default argument would be a single instance shared by
    # every call (and by every returned pipe); build a fresh one instead.
    if scaler is _DEFAULT_SCALER:
        scaler = StandardScaler()

    #ML Training setting
    if method == 'KNN':
        param_grid = {'classifier__n_neighbors': list(range(1, 40))}
        pipe = Pipeline([('scaler', scaler), ('classifier', KNeighborsClassifier())])
        grid = GridSearchCV(pipe, param_grid, verbose=verbose)
    elif method == 'RF':
        param_grid = {'classifier__n_estimators': [100, 200, 400]}
        pipe = Pipeline([('scaler', scaler), ('classifier', RandomForestClassifier())])
        grid = GridSearchCV(pipe, param_grid, verbose=verbose)
    elif grid_type == 'normal':
        # SVM, exhaustive grid over C and gamma
        param_grid = {'classifier__C': [0.1, 1, 10, 100],
                      'classifier__gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
                      'classifier__kernel': ['rbf'],
                      'classifier__class_weight': ['balanced']}
        pipe = Pipeline([('scaler', scaler), ('classifier', SVC())])
        grid = GridSearchCV(pipe, param_grid, scoring='accuracy', verbose=verbose)
    else:
        # SVM, randomized search: C and gamma are sampled from exponential
        # distributions instead of being enumerated.
        # Import the submodule explicitly; a bare `import scipy` is not
        # guaranteed to expose `scipy.stats` on every SciPy version.
        from scipy import stats
        parameters = {'classifier__C': stats.expon(scale=10),
                      'classifier__gamma': stats.expon(scale=.001),
                      'classifier__kernel': ['rbf'],
                      'classifier__class_weight': ['balanced']}
        pipe = Pipeline([('scaler', scaler), ('classifier', SVC())])
        grid = RandomizedSearchCV(pipe, parameters, n_iter=100, scoring='accuracy',
                                  verbose=verbose, return_train_score=False)
    return grid, pipe

# IMP

In [30]:
def color_data_load(classif, color_space, f_name):
    """Load the train and validation color feature matrices with their labels.

    Feature matrices are unpickled from
    ``<repo_path>/data/features/color/<color_space>/`` and labels are taken
    from ``<repo_path>/data/meta_info.csv``. The shapes of each pair are
    printed so they can be compared by eye.

    Args:
        classif (str): classification task, used in the file name and to filter labels
        color_space (str): color space sub-folder of the features
        f_name (str): feature name used in the file name

    Returns:
        tuple: X_train, X_val, y_train, y_val
    """
    meta = pd.read_csv(str(repo_path) + '/data/meta_info.csv', sep='\t') #For labels
    f_type = 'color'
    features_dir = str(repo_path) + f'/data/features/{f_type}/{color_space}/'

    # (set name, wording used in the printed message); the message text is
    # kept exactly as it was, spelling included.
    splits = (('train', 'training'), ('val', 'valdiation'))

    features, labels = {}, {}
    for set_name, wording in splits:
        features[set_name] = open_pickle(features_dir + f'{classif}_{set_name}_{f_type}_{f_name}_fv.p')
        labels[set_name] = path_label(meta, classif, set_name=set_name).labels
        #Print shapes to be sure that dimensions are the same
        print(f'The shape of the {wording} data is {features[set_name].shape} and its labels are {labels[set_name].shape}')

    return features['train'], features['val'], labels['train'], labels['val']

In [41]:
# Settings used to locate the feature matrices
classif = 'binary'
color_space = 'RGB'
f_name = 'ColorStats'

X_train, X_val, y_train, y_val = color_data_load(classif, color_space, f_name)

# grid_type is not consulted by classifier() for 'KNN': an exhaustive grid
# over n_neighbors is run regardless of the value passed here.
grid, pipe = classifier('KNN', grid_type='random', scaler=StandardScaler(), verbose=4)
model = grid.fit(X_train, y_train)

# The last pipeline step (the classifier object) is formatted into the file name
model_file = str(repo_path) + f'/data/models/{classif}_{pipe.steps[-1][-1]}_{f_name}_{color_space}.p'
save_pickle(model, model_file) #Save model

# Validation accuracy: fraction of predictions equal to the true label
y_pred = model.predict(X_val)
acc = np.mean(y_pred == y_val)
print(acc)

The shape of the training data is (15195, 54) and its labels are (15195,)
The shape of the valdiation data is (3796, 54) and its labels are (3796,)
Fitting 5 folds for each of 39 candidates, totalling 195 fits
[CV 1/5] END .........classifier__n_neighbors=1;, score=0.782 total time=   0.5s
[CV 2/5] END .........classifier__n_neighbors=1;, score=0.763 total time=   0.4s
[CV 3/5] END .........classifier__n_neighbors=1;, score=0.727 total time=   0.4s
[CV 4/5] END .........classifier__n_neighbors=1;, score=0.702 total time=   0.4s
[CV 5/5] END .........classifier__n_neighbors=1;, score=0.733 total time=   0.4s
[CV 1/5] END .........classifier__n_neighbors=2;, score=0.759 total time=   0.4s
[CV 2/5] END .........classifier__n_neighbors=2;, score=0.759 total time=   0.4s
[CV 3/5] END .........classifier__n_neighbors=2;, score=0.726 total time=   0.4s
[CV 4/5] END .........classifier__n_neighbors=2;, score=0.708 total time=   0.4s
[CV 5/5] END .........classifier__n_neighbors=2;, score=0.732

In [42]:
# Display the pipeline selected by the hyper-parameter search fitted above
model.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=19))])