In [11]:
from google.colab import drive
import numpy as np
import librosa as lr
import os,glob
import wave
import struct
import matplotlib.pyplot as plt

from keras.models import Model, load_model, Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, AveragePooling2D
from keras.layers import Dropout, Input, BatchNormalization
from keras.optimizers import Nadam, SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical   

# Mount Google Drive so that the dataset/model paths under /content/gdrive used below resolve.
drive.mount("/content/gdrive")


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Generación del dataset de melspectrograms
Generamos el dataset de entrenamiento a partir de los audios. Para esto calculamos el melspectrogram de cada audio y lo guardamos en un array junto con el idioma al que corresponde.

In [0]:
def get_wav_samples(wav):
    """Read the frames of an open 16-bit PCM WAV file as floats in [-1.0, 1.0).

    Adapted from
    https://stackoverflow.com/questions/7769981/how-to-convert-wav-file-to-float-amplitude

    Args:
        wav: An open ``wave.Wave_read`` object; frames are read from its
            current read position.

    Returns:
        list[float]: One value per sample, scaled by 1 / 2**15. Samples of
        multi-channel files are returned interleaved, as stored in the file.

    Raises:
        ValueError: If the file does not store 2-byte (16-bit) samples.
    """
    if wav.getsampwidth() != 2:
        raise ValueError(
            "get_wav_samples only supports 16-bit PCM, got %d-byte samples"
            % wav.getsampwidth()
        )
    raw = wav.readframes(wav.getnframes())
    # Count samples from the bytes actually read: a truncated file may hold
    # fewer frames than its header announces.
    n_samples = len(raw) // 2
    # WAV PCM data is little-endian regardless of the host byte order.
    samples = struct.unpack("<%dh" % n_samples, raw[:n_samples * 2])
    return [value / 32768.0 for value in samples]

src_dir = "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Dataset 5 secs/"
window_time = 0.0085                  # Hop between frames, in seconds (8.5 ms; the original note suggested ~20 ms windows with overlap)
rate = 16000                          # Sampling rate in Hz
window_size = int(window_time * rate) # Passed to librosa as hop_length; keep it even if a 50% overlap is ever needed
audio_len = 5 * rate                  # Every clip is expected to last 5 seconds
n_windows = int(2 * (audio_len - window_size) / window_size)  # Not used by the cells shown here; kept for compatibility
nmels = 39                            # Number of mel bands per frame
n_frames = 1 + audio_len // window_size  # Frames librosa yields for a 5 s clip with centred framing (589 here)

n_audios_per_lang = 1000

# Class index -> sub-folder name. Named lang_codes so the builtin `dict` is not shadowed.
lang_codes = {0: "de", 1: "en", 2: "es"}

whole_dataset = np.zeros((n_audios_per_lang * len(lang_codes), nmels, n_frames))
for lang in range(len(lang_codes)):
    # Absolute glob pattern, so the notebook's working directory is left untouched.
    # Sorting makes the selected subset deterministic across runs.
    wav_paths = sorted(glob.glob(os.path.join(src_dir + lang_codes[lang], "*.wav")))[:n_audios_per_lang]
    if len(wav_paths) < n_audios_per_lang:
        # With fewer files the rows of the following languages would shift and
        # no longer match the position-based targets built afterwards.
        raise ValueError(
            "Expected %d wav files for '%s', found %d"
            % (n_audios_per_lang, lang_codes[lang], len(wav_paths))
        )
    for offset, file in enumerate(wav_paths):
        # Row position is tied to the language, not to a running counter.
        index = lang * n_audios_per_lang + offset
        with wave.open(file, 'r') as win:
            speech_samples = np.array(get_wav_samples(win))
        # n_mels must match the second dimension of whole_dataset.
        spec = lr.feature.melspectrogram(y=speech_samples, sr=rate, n_mels=nmels, hop_length=window_size, power=2.0)
        # NOTE(review): spec is a power spectrogram (power=2.0); librosa's power_to_db
        # is the matching conversion. amplitude_to_db is kept so the features stay
        # comparable with previously generated datasets - confirm before changing.
        whole_dataset[index, :, :] = lr.core.amplitude_to_db(spec)

Ahora ya tenemos el dataset; dividámoslo en training, validación y test. Por último, se guarda en un archivo .npz (NumPy) para ser leído después.

In [0]:
# Integer class targets as a single (1, N) row: three consecutive runs of
# n_audios_per_lang entries holding 0, 1 and 2 respectively.
dataset_target = np.repeat(np.arange(3, dtype=float), n_audios_per_lang).reshape(1, -1)

In [0]:
# One-hot encode the targets. The (1, N) input yields a (1, N, 3) result,
# so the leading singleton axis is dropped right away.
y = to_categorical(dataset_target, num_classes=3)[0]
print(y.shape)

(3000, 3)


In [0]:
# Alias the feature tensor as x (no copy is made) and show its shape.
x = whole_dataset
print(x.shape)

(3000, 39, 589)


In [0]:
# Optional random shuffling of the samples before saving (currently disabled):
# index_permut = np.random.permutation(n_audios_per_lang * 3)
# dataset_target = dataset_target[0,index_permut]
# whole_dataset = whole_dataset[index_permut,:,:];

# Store features and one-hot labels together in one .npz archive; positional
# arguments are saved under the keys 'arr_0' and 'arr_1'.
fdset_dir = "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Dataset Melspectrogram/"
np.savez(os.path.join(fdset_dir, "X&Y39"), x, y)

## Recuperando los datos


In [0]:
# Load the stored spectrograms (arr_0) and one-hot labels (arr_1).
dataset_path = "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Dataset Melspectrogram/X&Y.npz"
# np.load keeps the archive open until it is closed; the context manager
# releases the file handle once both arrays have been read into memory.
with np.load(dataset_path) as npzfile:
    ordered_x = npzfile['arr_0']
    ordered_y = npzfile['arr_1']
print(ordered_x.shape, ordered_y.shape)

(3000, 128, 501) (3000, 3)


In [0]:
# Append a trailing axis at position 3 so that each spectrogram carries a
# channel dimension, as the Conv2D layers expect.
x = ordered_x[:, :, :, np.newaxis]
y = ordered_y
print(x.shape)

(3000, 128, 501, 1)


In [0]:
# Shuffle the samples with a fixed seed so the train/validation/test split
# is identical on every run.
SPLIT_SEED = 42
rng = np.random.RandomState(SPLIT_SEED)
index_permut = rng.permutation(x.shape[0])
y = y[index_permut, :]
x = x[index_permut, :, :, :]

# Fractions of the dataset assigned to train, validation and test.
tr_ratio = 0.8
va_ratio = 0.1
te_ratio = 0.1
# The test split is "whatever is left", so the three ratios must add up to 1.
assert np.isclose(tr_ratio + va_ratio + te_ratio, 1.0), "split ratios must sum to 1"

# a: first validation index, b: first test index.
a = int(tr_ratio * x.shape[0])
b = int((tr_ratio + va_ratio) * x.shape[0])

x_tr = x[:a, :, :, :]
y_tr = y[:a, :]
x_va = x[a:b, :, :, :]
y_va = y[a:b, :]
x_te = x[b:, :, :, :]
y_te = y[b:, :]
print(x_tr.shape, y_tr.shape)
print(x_va.shape, y_va.shape)
print(x_te.shape, y_te.shape)

(2400, 128, 501, 1) (2400, 3)
(300, 128, 501, 1) (300, 3)
(300, 128, 501, 1) (300, 3)


## Entrenando la red


In [0]:
# Shape of one network input and the number of output classes.
in_dim = (128, 501, 1)
out_dim = 3

In [0]:
# Previous prototype (disabled): 4 x [Conv2D(elu, 'same') + MaxPooling2D(2)] with
# 64/32/16/8 filters and kernel sizes 3/3/6/6, followed by Flatten, Dropout(0.5),
# Dense(512, elu) and a softmax output.

# Architecture adapted from
# http://yerevann.github.io/2016/06/26/combining-cnn-and-rnn-for-spoken-language-identification/
conv_blocks = [(16, 7), (32, 5), (64, 3), (128, 3), (256, 3)]  # (filters, kernel size)

model = Sequential()
for block_idx, (n_filters, k_size) in enumerate(conv_blocks):
    # Only the very first layer declares the input shape.
    first_layer_kwargs = {'input_shape': in_dim} if block_idx == 0 else {}
    model.add(Conv2D(n_filters, kernel_size=k_size, activation='relu', padding='same', **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='same'))
    model.add(BatchNormalization(axis=3))

# Classifier head: flatten, normalise, regularise, then one softmax unit per class.
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(out_dim, activation='softmax'))

In [0]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

Model: "sequential_45"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_140 (Conv2D)          (None, 128, 501, 16)      800       
_________________________________________________________________
max_pooling2d_117 (MaxPoolin (None, 64, 251, 16)       0         
_________________________________________________________________
batch_normalization_35 (Batc (None, 64, 251, 16)       64        
_________________________________________________________________
conv2d_141 (Conv2D)          (None, 64, 251, 32)       12832     
_________________________________________________________________
max_pooling2d_118 (MaxPoolin (None, 32, 126, 32)       0         
_________________________________________________________________
batch_normalization_36 (Batc (None, 32, 126, 32)       128       
_________________________________________________________________
conv2d_142 (Conv2D)          (None, 32, 126, 64)     

In [0]:
# First training stage: Nadam with learning rate 1e-3 for 10 epochs,
# monitoring the validation split after every epoch.
model.compile(optimizer=Nadam(lr=1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(
    x=x_tr,
    y=y_tr,
    validation_data=(x_va, y_va),
    epochs=10,
    verbose=1,
)

Train on 2400 samples, validate on 300 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4ff8c31710>

In [0]:
# Loss and accuracy on the held-out test split after the first training stage.
model.evaluate(x_te, y_te)



[5.33796023050944, 0.6666666666666666]

In [0]:
# Second training stage: recompile with a smaller learning rate (1e-4)
# and continue training for 5 more epochs.
model.compile(optimizer=Nadam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(
    x=x_tr,
    y=y_tr,
    validation_data=(x_va, y_va),
    epochs=5,
    verbose=1,
)

Train on 2400 samples, validate on 300 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4ff87f1b00>

In [0]:
  # Loss and accuracy on the held-out test split after the second training stage.
  model.evaluate(x_te, y_te)



[5.33796023050944, 0.6666666666666666]

In [0]:
# Persist the trained model to Drive as an HDF5 (.h5) file.
model.save("/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Model Test.h5")

In [0]:
# Reload the saved model from Drive, replacing the in-memory one.
model = load_model("/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Model Test.h5")

In [0]:
# Keep extra references to the current splits under *_original names.
x_tr_original, y_tr_original = x_tr, y_tr
x_te_original, y_te_original = x_te, y_te
x_va_original, y_va_original = x_va, y_va

# Save the exact split used for training. The positional order defines the
# archive keys: arr_0..arr_5 = x_tr, y_tr, x_te, y_te, x_va, y_va.
np.savez(
    "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/dataset.npz",
    x_tr, y_tr,
    x_te, y_te,
    x_va, y_va,
)

In [0]:
def get_wav_samples(wav):
    """Read the frames of an open 16-bit PCM WAV file as floats in [-1.0, 1.0).

    Adapted from
    https://stackoverflow.com/questions/7769981/how-to-convert-wav-file-to-float-amplitude

    Args:
        wav: An open ``wave.Wave_read`` object; frames are read from its
            current read position.

    Returns:
        list[float]: One value per sample, scaled by 1 / 2**15. Samples of
        multi-channel files are returned interleaved, as stored in the file.

    Raises:
        ValueError: If the file does not store 2-byte (16-bit) samples.
    """
    if wav.getsampwidth() != 2:
        raise ValueError(
            "get_wav_samples only supports 16-bit PCM, got %d-byte samples"
            % wav.getsampwidth()
        )
    raw = wav.readframes(wav.getnframes())
    # Count samples from the bytes actually read: a truncated file may hold
    # fewer frames than its header announces.
    n_samples = len(raw) // 2
    # WAV PCM data is little-endian regardless of the host byte order.
    samples = struct.unpack("<%dh" % n_samples, raw[:n_samples * 2])
    return [value / 32768.0 for value in samples]

src_dir = "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Dataset Own/"
window_time = 0.0085                  # Hop between frames, in seconds (8.5 ms)
rate = 16000                          # Sampling rate in Hz
window_size = int(window_time * rate) # Passed to librosa as hop_length
audio_len = 5 * rate
n_windows = int(2 * (audio_len - window_size) / window_size)  # Not used by the cells shown here; kept for compatibility
nmels = 128                           # Number of mel bands per frame

# Spectrogram columns kept for every recording: 501 frames, matching the model input width.
frame_start, frame_stop = 100, 601

# Hand-assigned class index of each recording (1 = "en", 2 = "es").
# NOTE(review): labels are matched to files by position in the glob result, whose
# order is filesystem dependent - confirm the pairing is still right.
own_labels = [1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2]

# Same pattern as before (src_dir + "*.wav"). The absolute path makes a chdir unnecessary.
own_files = glob.glob(os.path.join(src_dir, "*.wav"))
archivos = len(own_files)
if archivos != len(own_labels):
    # A hard-coded count would either overflow x_own or leave all-zero rows.
    raise ValueError(
        "Found %d wav files but %d labels are defined" % (archivos, len(own_labels))
    )

x_own = np.zeros((archivos, nmels, frame_stop - frame_start))
for index, file in enumerate(own_files):
    with wave.open(file, 'r') as win:
        speech_samples = np.array(get_wav_samples(win))
    spec = lr.feature.melspectrogram(y=speech_samples, sr=rate, n_mels=nmels, hop_length=window_size, power=2.0)
    x_own[index, :, :] = lr.core.amplitude_to_db(spec[:, frame_start:frame_stop])

# Add the channel axis and one-hot encode the labels.
x_own = np.expand_dims(x_own, axis=3)
y_own = to_categorical(own_labels, num_classes=3)
# NOTE(review): no visible cell scales the training tensors by 255; confirm this
# matches the preprocessing the saved model was trained with.
x_own /= 255
print(x_own.shape, y_own.shape)

(17, 128, 501, 1) (17, 3)


In [0]:
  # Metrics on the own recordings, then the per-class probabilities next to
  # the expected one-hot labels for a manual comparison.
  own_scores = model.evaluate(x_own, y_own)
  print(own_scores)
  own_probabilities = model.predict(x_own)
  print(own_probabilities)
  print(y_own)

[2.854539632797241, 0.7058823704719543]
[[2.2194996e-03 9.9778050e-01 3.2410256e-08]
 [8.7271699e-13 9.8292887e-01 1.7071070e-02]
 [8.8828571e-09 1.0000000e+00 5.6174130e-13]
 [2.7988443e-12 9.9999583e-01 4.1967819e-06]
 [1.4651772e-17 8.0841540e-13 1.0000000e+00]
 [2.0879709e-04 9.9978846e-01 2.7640604e-06]
 [1.7282987e-09 1.0000000e+00 2.0356675e-10]
 [5.3301451e-06 9.9996245e-01 3.2213567e-05]
 [2.9195533e-06 9.9951625e-01 4.8080151e-04]
 [5.7350085e-06 1.8805169e-01 8.1194258e-01]
 [1.5271029e-11 2.7626423e-02 9.7237360e-01]
 [3.3131877e-14 1.0308078e-04 9.9989688e-01]
 [4.3856178e-07 1.8704574e-06 9.9999774e-01]
 [4.6486500e-12 9.9999619e-01 3.7760663e-06]
 [3.0761551e-13 9.5109159e-01 4.8908345e-02]
 [1.0449104e-08 9.9574357e-01 4.2564543e-03]
 [2.3486241e-08 4.8478966e-04 9.9951518e-01]]
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [

# Generando la tablita
Esta sección genera la tablita (matriz de confusión) para el informe.

In [20]:
# Reload the saved model and the exact train/test/validation split it was
# trained with, so this section runs on a fresh kernel.
dataset_path = "/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/dataset.npz"
model = load_model("/content/gdrive/Shared drives/Procesamieto de Voz/TP2 - Language identification/Model Test.h5")
# Archive keys follow the positional order used when saving:
# arr_0..arr_5 = x_tr, y_tr, x_te, y_te, x_va, y_va.
with np.load(dataset_path) as npzfile:
    x_tr = npzfile['arr_0']
    y_tr = npzfile['arr_1']
    x_te = npzfile['arr_2']
    y_te = npzfile['arr_3']
    x_va = npzfile['arr_4']
    y_va = npzfile['arr_5']
model.evaluate(x_te, y_te)



[0.28088382463747014, 0.9366666666666666]

Entrenamos un poquito más para obtener mejores resultados (no cambia mucho, así que este paso se puede omitir).

In [19]:
# Optional fine-tuning stage: recompile with learning rate 1e-5 and train for 5 more epochs.
model.compile(optimizer=Nadam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(
    x=x_tr,
    y=y_tr,
    validation_data=(x_va, y_va),
    epochs=5,
    verbose=1,
)

Train on 2400 samples, validate on 300 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
 288/2400 [==>...........................] - ETA: 2:44 - loss: 0.0894 - acc: 0.9896

KeyboardInterrupt: ignored

Ahora generamos la tablita:


In [29]:
from sklearn.metrics import classification_report, confusion_matrix

# Turn predicted probabilities and one-hot targets into integer class labels.
Y_pred = model.predict(x_te)
y_pred = Y_pred.argmax(axis=1)
y2_te = y_te.argmax(axis=1)

# Rows are true classes, columns predicted classes; each row is expressed
# as a percentage of that class's test samples.
cm = confusion_matrix(y2_te, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
print('Confusion Matrix')
print(np.around(cm / row_totals * 100, 1))


Confusion Matrix
[[87.   9.8  3.3]
 [ 2.  97.   1. ]
 [ 0.   3.7 96.3]]


In [25]:
# Display the integer class labels of the test split (argmax of the one-hot y_te).
y2_te


array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 2, 0, 0, 1, 2,
       0, 0, 1, 2, 2, 0, 2, 2, 0, 0, 2, 0, 1, 1, 2, 2, 2, 1, 1, 0, 2, 1,
       1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 1, 2, 1, 1,
       1, 2, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2, 2, 2, 0, 1, 1, 1, 1, 2, 1, 0,
       2, 1, 0, 2, 2, 1, 2, 2, 2, 0, 1, 2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 0,
       1, 2, 2, 0, 1, 1, 2, 2, 0, 0, 1, 2, 1, 0, 2, 2, 0, 2, 0, 1, 2, 2,
       1, 2, 1, 0, 1, 1, 0, 2, 0, 2, 0, 1, 1, 2, 0, 2, 0, 1, 2, 0, 0, 1,
       0, 1, 1, 0, 0, 2, 1, 0, 1, 1, 0, 1, 1, 1, 0, 2, 1, 2, 0, 2, 1, 0,
       0, 1, 2, 2, 0, 0, 0, 2, 0, 2, 1, 0, 2, 2, 2, 2, 0, 2, 2, 1, 0, 0,
       0, 2, 0, 1, 2, 0, 0, 2, 0, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 0, 2, 1, 1, 2, 0, 1, 2, 2,
       2, 2, 0, 1, 0, 2, 0, 0, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 0, 0, 2,
       2, 1, 2, 0, 1, 2, 0, 2, 1, 2, 2, 0, 2, 1, 0, 0, 2, 1, 1, 1, 2, 2,
       1, 2, 1, 2, 2, 2, 0, 1, 0, 0, 0, 1, 1, 2])