In [5]:
# Librería de procesamiento de audio
import pyAudioAnalysis
# Matplotlib para gráficos
import matplotlib.pyplot as plt
# Librerías fundamentales
import numpy as np
import glob
import os
import pandas as pd
# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, VarianceThreshold, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, confusion_matrix
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [6]:
# Importa nombres de archivos y lee matriz de labels
path = "pyaudio/data"
filenames_ravdess = glob.glob(path+"/*.npy")
ravdess_labels = pd.read_csv('pyaudio/labels.csv',delimiter=',',names=['filename','label'])
ravdess_labels.head(5)

Unnamed: 0,filename,label
0,03-01-01-01-02-02-12_0,1
1,03-01-01-01-01-02-20_1,1
2,03-01-04-02-01-02-06_0,4
3,03-01-05-01-02-02-14_1,5
4,03-01-07-02-01-01-24_0,7


In [7]:
# Crea el dataset RAVDESS
dataset_ravdess = []
for filepath in filenames_ravdess:
    data = np.load(filepath).flatten()
    filename = filepath.split("/")[-1].split(".")[0]
    label = ravdess_labels[ravdess_labels['filename']==filename]['label'].to_numpy()[0]
    dataset_ravdess.append([data,label])  
dataset_ravdess = pd.DataFrame(dataset_ravdess,columns=['feats','label'])
dataset_ravdess.head(5)

Unnamed: 0,feats,label
0,"[0.04335139641517299, 0.0141725719049604, 0.01...",6
1,"[0.2275948311796582, 0.03793247186327636, 0.03...",4
2,"[0.0358482701125469, 0.0283451438099208, 0.011...",6
3,"[0.0425177157148812, 0.05168820341809087, 0.06...",5
4,"[0.023759899958315966, 0.07461442267611505, 0....",2


In [8]:
# Separación de dataset en conjuntos de entrenamiento, validación y test
X = dataset_ravdess['feats']
y = dataset_ravdess['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  random_state=42)

In [9]:
# Definición de experimentos a realizar 
def experiment_1():
    param_grid= {'svm__kernel':('linear', 'rbf'), 'svm__C':[0.1, 1, 10, 20]}
    pipe = Pipeline(steps=[('scaler',StandardScaler()),
                         ('svm',SVC(gamma='auto'))])
    clf = GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
    clf.fit(X_train.to_list(),y_train)
    print("Mejores parámetros encontrados: ",clf.best_params_)
    return clf
    
def experiment_2():
    param_grid= {'svm__kernel':('linear', 'rbf'), 'svm__C':[0.1, 1, 10, 20]}
    pipe = Pipeline(steps=[('svm',SVC(gamma='auto'))])
    clf = GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
    clf.fit(X_train.to_list(),y_train)
    print("Mejores parámetros encontrados: ", clf.best_params_)
    return clf

def experiment_3():
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X_train.to_list(),y_train)
    return clf

def experiment_4():
    clf = Pipeline([('feat_selection',VarianceThreshold(threshold=(.8 * (1 - .8)))),
                    ('svm',SVC(gamma='auto',C=10,kernel='rbf'))])
    clf.fit(X_train.to_list(),y_train)
    return clf

def experiment_5():
    clf = Pipeline([('feat_selection',SelectFromModel(RandomForestClassifier(n_estimators=500))),
                    ('svm',SVC(gamma='auto',C=10,kernel='rbf'))])
    clf.fit(X_train.to_list(),y_train)
    return clf
    
def experiment_6():
    param_grid= {'feat_selection__k':[100,500,1000]}
    pipe = Pipeline([('feat_selection',SelectKBest()),
                    ('svm',SVC(gamma='auto',C=10,kernel='rbf'))])
    clf = GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
    clf.fit(X_train.to_list(),y_train)
    print("Mejor Cantidad de Features: ",clf.best_params_)
    return clf

def experiment_7():
    param_grid= {'pca__n_components':[10,50,100,500]}
    pipe = Pipeline([('feat_selection',SelectFromModel(RandomForestClassifier(n_estimators=500))),
                     ('pca',PCA()),
                    ('svm',SVC(gamma='auto',C=10,kernel='rbf'))])
    clf = GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
    clf.fit(X_train.to_list(),y_train)
    print("Mejor Reducción de Dimensionalidad: ",clf.best_params_)
    return clf

def experiment_8():
    param_grid= {'pca__n_components':[10,50,100,500,1000]}
    pipe = Pipeline([('pca',PCA()),
                    ('svm',SVC(gamma='auto',C=10,kernel='rbf'))])
    clf = GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
    clf.fit(X_train.to_list(),y_train)
    print("Mejor Reducción de Dimensionalidad: ",clf.best_params_)
    return clf

In [10]:
# Ejecuta un experimento
clf = experiment_6()

  940  960  980 1000 1020 1040 1060 1080 1100 1120 1140 1160 1180 1200
 1220 1240 1260 1280 1300 1320 1340] are constant.
  f = msb / msw


Mejor Cantidad de Features:  {'feat_selection__k': 500}


In [11]:
# Evalúa sobre el conjunto de validación
pred = clf.predict(X_val.to_list())
print(classification_report(y_val, pred))
print("MATRIZ DE CONFUSIÓN:\n\n{}".format(confusion_matrix(y_val,pred)))

              precision    recall  f1-score   support

           1       0.44      0.48      0.46        25
           2       0.75      0.71      0.73        75
           3       0.33      0.40      0.36        53
           4       0.36      0.46      0.40        59
           5       0.59      0.68      0.63        72
           6       0.54      0.36      0.43        73
           7       0.57      0.65      0.61        72
           8       0.49      0.36      0.41        73

    accuracy                           0.52       502
   macro avg       0.51      0.51      0.50       502
weighted avg       0.53      0.52      0.52       502

MATRIZ DE CONFUSIÓN:

[[12  5  0  7  0  0  1  0]
 [ 6 53  4  7  0  1  3  1]
 [ 2  4 21  6  5  4  5  6]
 [ 3  3  6 27  6  3  9  2]
 [ 3  0  5  3 49  2  6  4]
 [ 0  4 10  8  7 26  7 11]
 [ 0  0  4  8  5  5 47  3]
 [ 1  2 13  9 11  7  4 26]]
