# Features Classification

In [1]:
# Requirements
import os
from tqdm.notebook import tqdm
import requests
import zipfile
import pydub
import numpy as np
import IPython
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [12]:
from tqdm.keras import TqdmCallback
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load datasets previously created
features = pd.read_csv("features.csv", index_col=0)
more_features = pd.read_csv("more_features.csv", index_col=0)

In [28]:
class ClipsClassifier():
    """The purpose of this class is to collect all the necessary steps and functions to construct a classification
    model for our clips. 
    In particular, all the necessary steps to prepare the input dataset for the training process will 
    be implemented:
    * standardization
    * PCA
    * One Hot Encoding
    * train test split
    Then a k-fold cross validation can be made in order to test several combination of hyperparameters
    without constructing directly a validation set.
    In the end, the performances will be shown in term of accuracy/loss also over different macro-categories
    to finally quantify the quality of the model constructed.
    """
    
    def __init__(self, dataset):
        """Initialize some global parameters.
        Dataset is a pandas dataframe with several "features" columns and one "label" column, 
        that contains the data that we want to fit."""
        
        self.data = dataset        
        self.Setup_Classifier()
        
        self.setup_completed = False
        
        self.best_model = None
        self.hyperparameters_tuning_dict = {}
        self.nested_scores = []
        
        self.confusion_matrix = None
        
        
    def Setup_Classifier(self, pca_percentage=0.99, n_folds=5, n_jobs=-1, verbose=2,
                         scaler_method='standard', encoder_method='onehot'):
        """Change the value of some parameters/methods used during data pre-processing and training step."""
        
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.scaler = scaler_method
        self.encoder = encoder_method
        self.pca_percentage = pca_percentage
        self.n_folds = n_folds
        
        # The sequent analysis will be performed via nested cross validation
        self.inner_cv = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        self.outer_cv = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        return self
        

    def _Setup_Data(self):
        """Performs standardization, PCA and label encoding."""
        
        # Standardize data
        if self.scaler == 'standard':
            std_data = StandardScaler().fit_transform(self.data.drop(['label'], axis=1))
        elif self.scaler == 'minmax':
            std_data = MinMaxScaler(feature_range=(-1,1)).fit_transform(self.data.drop(['label'], axis=1))
        else:
            print('Invalid value of the scaler. Available: standard, minmax')
            return
        
        # Let's apply the PCA keeping just a percentage of the information
        if (self.pca_percentage>0) and (self.pca_percentage<1):
            self.pca = PCA().fit(std_data)
            self.cev = np.cumsum(self.pca.explained_variance_ratio_)
            pca_data = PCA(n_components=np.argmax(self.cev>self.pca_percentage)).fit_transform(std_data)
        else:
            pca_data = std_data

        # Encode the labels
        if self.encoder == 'onehot':
            labels = OneHotEncoder(sparse=False).fit_transform(self.data[['label']].to_numpy())
        elif self.encoder == 'label':
            labels = LabelEncoder().fit_transform(self.data['label'])
        else:
            print('Invalid value of the encoder. Available: onehot, label')
            return
        
        self.setup_completed = True
        self.X = pca_data
        self.Y = labels
        
        
    def PCA_Variance_Ratio(self):
        """When PCA option is active, it plot the comulative sum of the variance ratio, in order
        to represent the amount of information stored in the first principal components."""
        
        if not self.setup_completed: self._Setup_Data()
        
        plt.plot(self.cev, color='red', lw=3, label='cev')
        plt.axvline(np.argmax(cev>self.pca_percentage), ls='--', c='black', lw=1, 
                    label='cev = {}'.format(round(self.cev[np.argmax(cev>0.95)], 2)))
        plt.xlabel('Number of components')
        plt.ylabel('Cumulative explained variance')
        plt.title('Study on the number of principal components')
        plt.legend();

        
    def Run_Grid_Search(self, model, parameters):
        """Because of the small amount of data available, the analysis will be performed running a 
        nested cross validation over the clip set. For tuning the hyperparameters an "inner" K-fold
        splitting will be defined to check which combination works better over the dataset."""
       
        # Standardize, encode and eventually apply pca on the dataset
        if not self.setup_completed: self._Setup_Data()
        
        # Non_nested parameter search and scoring
        clf = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=self.n_jobs, 
                           verbose=self.verbose, cv=self.inner_cv)
        clf.fit(self.X, self.Y)
        self.hyperparameters_tuning_dict = clf.cv_results_
        self.best_model = clf.best_estimator_
        
        if self.verbose > 0:
            print("Optimal set of hyperparameters: ")
            print(clf.best_params_)
        
        
    
    def Run_Cross_Validation(self, model=None):
        """Because of the small amount of data available, the analysis will be performed running a 
        nested cross validation over the clip set. To estimate the performances of a model, an "outer" 
        K-fold splitting will be defined in order to compute the effective generalized accuracy as 
        the average of the validation values obtained among various folds.
        Because of the stochastic nature of the approach, it may be better to repeat several times the 
        run to check if the results are compatible between themselves."""
        
        # Standardize, encode and eventually apply pca on the dataset
        if not self.setup_completed: self._Setup_Data()
            
        # If not specified, run the validation for the best model found by the grid search
        if model is None: model = self.best_model
        
        # Nested CV cross validation
        self.nested_scores = cross_val_score(model, X=self.X, y=self.Y, n_jobs=self.n_jobs, 
                                             verbose=2, cv=self.outer_cv)
        best_accuracy = np.mean(self.nested_scores)
        
        if self.verbose > 0:
            print("Average final accuracy estimated: {}%".format(round(best_accuracy*100, 2)))  
            
    # Define another function macro accuracy, that computes ypred via cross_val_predict and you
    # convert them to macro
        
        
    def Compute_Confusion_Matrix(self, model=None):
        """Compute the confusion matrix according to the input model or the best one found by a 
        previous grid search."""
        
        # Standardize, encode and eventually apply pca on the dataset
        if not self.setup_completed: self._Setup_Data()
            
        # If not specified, run the validation for the best model found by the grid search
        if model is None: model = self.best_model
        
        if self.confusion_matrix is None:
            y_pred = cross_val_predict(model, self.X, self.Y, cv=self.outer_cv)
            self.confusion_matrix = confusion_matrix(self.Y, y_pred)
            
        return self.confusion_matrix
        

In [6]:
classifiers_performances = {'random_forest':{}, 'multi_layer_perceptron':{}, 'k_neighbors_classifier':{},
                           'support_vector_machine':{}}

### Random Forest

In [7]:
params_RF = {'n_estimators': [500, 1000],
             'bootstrap': [True, False],
             'max_samples' : [0.5, None],
             'max_features': ['sqrt']}

params_RF = {'n_estimators': [500],
             'bootstrap': [True],
             'max_samples' : [0.5],
             'max_features': ['sqrt']}

# RandomForest underperform with One Hot Encoding, so you need to change to LabelEncoder
rf_cc = ClipsClassifier(dataset = features)
rf_cc.Setup_Classifier(encoder_method='label', verbose=1)
rf_cc.Run_Grid_Search(model = RandomForestClassifier(), parameters = params_RF)
rf_cc.Run_Cross_Validation()

classifiers_performances['random_forest']['features'] = rf_cc.nested_scores

rf_cc = ClipsClassifier(dataset = more_features)
rf_cc.Setup_Classifier(encoder_method='label', verbose=1)
rf_cc.Run_Grid_Search(model = RandomForestClassifier(), parameters = params_RF)
rf_cc.Run_Cross_Validation()

classifiers_performances['random_forest']['more_features'] = rf_cc.nested_scores

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.2s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.6s finished


Optimal set of hyperparameters: 
{'bootstrap': True, 'max_features': 'sqrt', 'max_samples': 0.5, 'n_estimators': 500}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.2s remaining:    6.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Average final accuracy estimated: 36.55%
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.6s remaining:   11.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.4s finished


Optimal set of hyperparameters: 
{'bootstrap': True, 'max_features': 'sqrt', 'max_samples': 0.5, 'n_estimators': 500}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.7s remaining:   11.6s


Average final accuracy estimated: 52.95%


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.7s finished


### Multi-layer Perceptron

In [14]:
params_MLP = {'hidden_layer_sizes':[128, 256, 512],
             'activation':['logistic', 'relu'],
             'solver':['sgd', 'adam'],
              'learning_rate_init':[0.01, 0.001]}

params_MLP = {'hidden_layer_sizes':[512],
             'activation':['relu'],
             'solver':['adam'],
              'learning_rate_init':[0.01]}

mlp_cc = ClipsClassifier(dataset = features)
mlp_cc.Run_Grid_Search(model = MLPClassifier(), parameters = params_MLP)
mlp_cc.Run_Cross_Validation()

classifiers_performances['multi_layer_perceptron']['features'] = mlp_cc.nested_scores

mlp_cc = ClipsClassifier(dataset = more_features)
mlp_cc.Run_Grid_Search(model = MLPClassifier(), parameters = params_MLP)
mlp_cc.Run_Cross_Validation()

classifiers_performances['multi_layer_perceptron']['more_features'] = mlp_cc.nested_scores

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   10.2s remaining:   15.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.9s finished


Optimal set of hyperparameters: 
{'activation': 'relu', 'hidden_layer_sizes': 512, 'learning_rate_init': 0.01, 'solver': 'adam'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   11.6s remaining:   17.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Average final accuracy estimated: 23.75%
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   13.9s remaining:   20.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.6s finished


Optimal set of hyperparameters: 
{'activation': 'relu', 'hidden_layer_sizes': 512, 'learning_rate_init': 0.01, 'solver': 'adam'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.7s remaining:   19.1s


Average final accuracy estimated: 43.4%


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.0s finished


### K-Neighbors Classifier

In [15]:
params_KNC = {'n_neighbors':[2,5,8,10],
             'weights':['uniform', 'distance'],
             'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
             'leaf_size':[10, 30, 50, 100]}

params_KNC = {'n_neighbors':[2],
             'weights':['distance'],
             'algorithm':['auto'],
             'leaf_size':[10]}

# RandomForest underperform with One Hot Encoding, so you need to change to LabelEncoder
knc_cc = ClipsClassifier(dataset = features)
knc_cc.Run_Grid_Search(model = KNeighborsClassifier(), parameters = params_KNC)
knc_cc.Run_Cross_Validation()

classifiers_performances['k_neighbors_classifier']['features'] = knc_cc.nested_scores

knc_cc = ClipsClassifier(dataset = more_features)
knc_cc.Run_Grid_Search(model = KNeighborsClassifier(), parameters = params_KNC)
knc_cc.Run_Cross_Validation()

classifiers_performances['k_neighbors_classifier']['more_features'] = knc_cc.nested_scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Optimal set of hyperparameters: 
{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 2, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Average final accuracy estimated: 28.75%
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished


Optimal set of hyperparameters: 
{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 2, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Average final accuracy estimated: 46.85%


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished


### Support Vector Machine

In [16]:
params_SVM = {'C':[0.1, 0.5, 1],
             'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

# SVM doesn't work with One Hot Encoding, so you need to change to LabelEncoder
svm_cc = ClipsClassifier(dataset = features)
svm_cc.Setup_Classifier(encoder_method='label', verbose=1)
svm_cc.Run_Grid_Search(model = SVC(), parameters = params_SVM)
svm_cc.Run_Cross_Validation()

classifiers_performances['support_vector_machine']['features'] = svm_cc.nested_scores

svm_cc = ClipsClassifier(dataset = more_features)
svm_cc.Setup_Classifier(encoder_method='label', verbose=1)
svm_cc.Run_Grid_Search(model = SVC(), parameters = params_SVM)
svm_cc.Run_Cross_Validation()

classifiers_performances['support_vector_machine']['more_features'] = svm_cc.nested_scores

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.0s finished


Optimal set of hyperparameters: 
{'C': 1, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Average final accuracy estimated: 38.15%
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   10.4s finished


Optimal set of hyperparameters: 
{'C': 0.1, 'kernel': 'linear'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


Average final accuracy estimated: 61.25%


In [18]:
classifiers_performances

{'random_forest': {'features': array([0.375 , 0.34  , 0.365 , 0.37  , 0.3775]),
  'more_features': array([0.5175, 0.52  , 0.53  , 0.5325, 0.5475])},
 'multi_layer_perceptron': {'features': array([0.245 , 0.2075, 0.2525, 0.245 , 0.2375]),
  'more_features': array([0.425, 0.42 , 0.44 , 0.435, 0.45 ])},
 'k_neighbors_classifier': {'features': array([0.3025, 0.2825, 0.295 , 0.265 , 0.2925]),
  'more_features': array([0.4725, 0.4775, 0.4725, 0.4525, 0.4675])},
 'support_vector_machine': {'features': array([0.36  , 0.3975, 0.3625, 0.3875, 0.4   ]),
  'more_features': array([0.6025, 0.66  , 0.6125, 0.595 , 0.5925])}}

In [25]:
for i in classifiers_performances.values():print(i)

{'features': array([0.375 , 0.34  , 0.365 , 0.37  , 0.3775]), 'more_features': array([0.5175, 0.52  , 0.53  , 0.5325, 0.5475])}
{'features': array([0.245 , 0.2075, 0.2525, 0.245 , 0.2375]), 'more_features': array([0.425, 0.42 , 0.44 , 0.435, 0.45 ])}
{'features': array([0.3025, 0.2825, 0.295 , 0.265 , 0.2925]), 'more_features': array([0.4725, 0.4775, 0.4725, 0.4525, 0.4675])}
{'features': array([0.36  , 0.3975, 0.3625, 0.3875, 0.4   ]), 'more_features': array([0.6025, 0.66  , 0.6125, 0.595 , 0.5925])}


In [27]:
def flatten_dict(d):
    def items():
        for key, value in d.items():
            if isinstance(value, dict):
                for subkey, subvalue in flatten_dict(value).items():
                    yield key + "." + subkey, subvalue
            else:
                yield key, value

    return dict(items())
flatten(classifiers_performances)

TypeError: Undefined type for flatten: <class 'numpy.ndarray'>

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,6))
ax.set_ylim(0,1)

for method in classifiers_performances.values():
    sns.boxplot(y=method['features'])


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,6))
for label in classifiers_performances_df['index']:
    sns.boxplot(x=label, y=classifiers_performances_df[classifiers_performances_df['index']==label])


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,6))
sns.boxplot(x=1, y=classifiers_performances['random_forest']['more_features'], ax=ax)
sns.boxplot(x=2, y=classifiers_performances['multi_layer_perceptron']['more_features'], ax=ax)
sns.boxplot(y=classifiers_performances['k_neighbors_classifier']['more_features'], ax=ax)
sns.boxplot(y=classifiers_performances['support_vector_machine']['more_features'], ax=ax)

### Neural Network