## Speech Recognition

In [None]:
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


Constantes

In [None]:
# Root folder of the dataset; each constant below points at one sub-folder.
# The location can be overridden with the SPEECH_DATASET_ROOT environment
# variable so the notebook is not tied to one machine. When the variable is
# not set, the original local path is used unchanged.
DATASET_ROOT = os.environ.get(
    'SPEECH_DATASET_ROOT',
    '/Users/ginovalverde/projects/deep_learning/data/raw/16000_pcm_speeches',
)

# NOTE(review): the folder names are joined verbatim, so they presumably match
# the on-disk spelling exactly -- verify before "correcting" any of them.
BENJAMIN_DATA = os.path.join(DATASET_ROOT, 'Benjamin_Netanyau')
JENS_DATA = os.path.join(DATASET_ROOT, 'Jens_Stoltenberg')
JULIA_DATA = os.path.join(DATASET_ROOT, 'Julia_Gillard')
MARGARET_DATA = os.path.join(DATASET_ROOT, 'Magaret_Tarcher')
NELSON_DATA = os.path.join(DATASET_ROOT, 'Nelson_Mandela')

In [None]:
# Fail fast, with a readable message, when the dataset folder is missing.
# The `assert` keyword is required: a bare `condition, message` expression
# only builds a tuple and never checks anything.
assert os.path.exists(DATASET_ROOT), f"Dataset root {DATASET_ROOT} does not exist"

Play audio from the speech files

In [None]:
# Embed an audio player for one sample clip; the widget is the cell's output.
sample_path = os.path.join(BENJAMIN_DATA, '22.wav')
ipd.Audio(sample_path)

In [None]:
# Load the sample clip with librosa's default target rate spelled out
# (22050 Hz), i.e. the audio is resampled on load.
wav, sr = librosa.load(
    os.path.join(BENJAMIN_DATA, '22.wav'),
    sr=22050,
)

In [None]:
# Print the waveform array returned by librosa.load for the sample clip.
print(wav)

In [None]:
# Print the sampling rate returned by librosa.load for the sample clip.
print(sr)

In [None]:
# Clip duration = number of samples divided by samples per second.
duration_seconds = len(wav) / sr
print('the audio long is', duration_seconds, "seconds")

Wav visualization

In [None]:
# Plot the whole clip: one point per sample, labelled so that the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(wav)
ax.set(title='Waveform', xlabel='Sample index', ylabel='Amplitude')
plt.show()

In [None]:
# Zoom into a short window of the clip. The x values are the real sample
# indices (1000..1199) rather than 0..199, so the detail can be located in
# the full waveform plot.
zoom_start, zoom_stop = 1000, 1200
zoom_segment = wav[zoom_start:zoom_stop]
fig, ax = plt.subplots()
# len(zoom_segment) keeps x and y the same length even for a very short clip.
ax.plot(np.arange(zoom_start, zoom_start + len(zoom_segment)), zoom_segment)
ax.set(title='Waveform detail', xlabel='Sample index', ylabel='Amplitude')
plt.show()

In [None]:
# Reload the same clip with sr=None so librosa keeps the file's own
# sampling rate instead of resampling it.
native_clip_path = os.path.join(BENJAMIN_DATA, '22.wav')
wav, sr = librosa.load(native_clip_path, sr=None)
print('Sampling rate: {} Hz'.format(sr))

## Dataset Preparation

In [None]:
def parse_dataset(dataset_paths):
    """
    Load every audio file of each dataset directory as a raw waveform.

    Parameters
    ----------
    dataset_paths : sequence of str
        Directories to read. The position of a directory in this sequence
        becomes the integer label of every file it contains.

    Returns
    -------
    tuple of (list, list)
        ``(X, y)`` where ``X`` holds the waveform returned by
        ``librosa.load(..., sr=None)`` for each file and ``y`` holds the
        matching integer labels, in the same order.
    """
    X = []
    y = []
    for index, dataset in enumerate(dataset_paths):
        print("[+] Prsing {} data ...".format(dataset))
        # os.listdir gives no ordering guarantee; sorting makes the sample
        # order (and therefore any seeded split downstream) reproducible.
        for fname in sorted(os.listdir(dataset)):
            path = os.path.join(dataset, fname)
            # Skip hidden entries (e.g. ".DS_Store") and sub-directories,
            # which are not audio files and would make the loader fail.
            if fname.startswith('.') or not os.path.isfile(path):
                continue
            wav, _ = librosa.load(path, sr=None)
            X.append(wav)
            y.append(index)
    return X, y
    

In [None]:
# Two-folder dataset: files from BENJAMIN_DATA get label 0, JENS_DATA label 1.
X, y = parse_dataset(
    [
        BENJAMIN_DATA,
        JENS_DATA,
    ]
)

In [None]:
# Number of samples and number of labels (the two values should match).
print("Dataframe shape:", *map(len, (X, y)))

## Dataset split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 5% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Report how many samples ended up in each partition.
for caption, subset in (
    ("Lenght of train set:", X_train),
    ("Lenght of test set:", X_test),
):
    print(caption, len(subset))

## Model

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline classifier on raw waveforms: one hidden layer of 10 logistic
# units trained with SGD. random_state pins the weight initialisation and
# the sample shuffling so that re-running the notebook gives the same model.
clf = MLPClassifier(
    activation='logistic',
    hidden_layer_sizes=(10,),
    solver='sgd',
    random_state=42,
)
clf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out samples.
y_pred = clf.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,f1_score

In [None]:
# Compute the three evaluation summaries first, then print them in order:
# per-class report, confusion matrix and the binary F1 score.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
binary_f1 = f1_score(y_test, y_pred, average='binary')

print(report_text)
print(confusion)
print(binary_f1)

## Part 2. Another sound representation: Spectrograms

In [None]:
#Fourier transformation
D= librosa.amplitude_to_db(np.abs(librosa.stft(wav)), ref=np.max)
librosa.display.specshow(D, y_axis='linear')
plt.show()

* **librosa.stft**: Calcula la transformada de Fourier de tiempo corto (STFT). Devuelve una matriz compleja en la que las filas (eje Y del espectrograma) son las frecuencias y las columnas (eje X) son los números de ventana.
* **np.abs**: Toma el valor absoluto de la STFT; para un número complejo devuelve su módulo (magnitud), $\sqrt{\mathrm{Re}^2 + \mathrm{Im}^2}$.
* **librosa.amplitude_to_db**: Convierte las amplitudes a decibelios.
* **librosa.display.specshow**: Muestra el espectrograma.

In [None]:
# Dimensions of the spectrogram matrix computed above.
np.shape(D)

## Data Prep

In [None]:
def parse_dataset(dataset_paths):
    """
    Load every audio file of each dataset directory as a dB spectrogram.

    Each file is loaded with ``librosa.load(..., sr=None)``, transformed with
    ``librosa.stft``, reduced to magnitudes with ``np.abs`` and converted with
    ``librosa.amplitude_to_db(..., ref=np.max)``.

    Parameters
    ----------
    dataset_paths : sequence of str
        Directories to read. The position of a directory in this sequence
        becomes the integer label of every file it contains.

    Returns
    -------
    tuple of (list, list)
        ``(X, y)`` where ``X`` holds one spectrogram array per file and ``y``
        holds the matching integer labels, in the same order.
    """
    X = []
    y = []
    for index, dataset in enumerate(dataset_paths):
        print("[+] Prsing {} data ...".format(dataset))
        # os.listdir gives no ordering guarantee; sorting makes the sample
        # order (and therefore any seeded split downstream) reproducible.
        for fname in sorted(os.listdir(dataset)):
            path = os.path.join(dataset, fname)
            # Skip hidden entries (e.g. ".DS_Store") and sub-directories,
            # which are not audio files and would make the loader fail.
            if fname.startswith('.') or not os.path.isfile(path):
                continue
            wav, _ = librosa.load(path, sr=None)
            magnitude = np.abs(librosa.stft(wav))
            X.append(librosa.amplitude_to_db(magnitude, ref=np.max))
            y.append(index)
    return X, y
    

In [None]:
# Build the dataset from all five folders; labels follow the list order (0-4).
X_prep, y_prep = parse_dataset(
    [
        BENJAMIN_DATA,
        JENS_DATA,
        JULIA_DATA,
        MARGARET_DATA,
        NELSON_DATA,
    ]
)

In [None]:
# Hold out 5% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_prep,
    y_prep,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Dimensions of the first training sample.
np.shape(X_train[0])

In [None]:
# Turn each 2-D sample into one flat feature row. Using -1 lets NumPy infer
# the row length from the data instead of hard-coding 1025*32, so the cell
# keeps working if the clip length or the STFT settings change. All samples
# must still share one shape for np.array to stack them.
# NOTE(review): the values come from amplitude_to_db, not from 8-bit pixels,
# so dividing by 255 only rescales them; the divisor is kept unchanged to
# preserve existing results -- TODO confirm the intended normalization.
X_train_prep = np.array(X_train).reshape((len(X_train), -1))
X_train_prep = X_train_prep.astype('float') / 255  # Normalization
y_train_prep = np.array(y_train)

X_test_prep = np.array(X_test).reshape((len(X_test), -1))
X_test_prep = X_test_prep.astype('float') / 255  # Normalization
y_test_prep = np.array(y_test)

In [None]:
# Same small MLP as before, now trained on the flattened feature rows.
# random_state pins the weight initialisation and the sample shuffling so
# that re-running the notebook gives the same model.
clf = MLPClassifier(
    activation='logistic',
    hidden_layer_sizes=(10,),
    solver='sgd',
    random_state=42,
)
clf.fit(X_train_prep, y_train_prep)

In [None]:
# Predicted labels for the held-out feature rows.
y_pred = clf.predict(X=X_test_prep)

In [None]:
# Support-weighted F1 across all classes; the value is the cell's output.
f1_score(
    y_true=y_test_prep,
    y_pred=y_pred,
    average='weighted',
)