In [1]:
import glob
import os
import random
from collections import Counter

import keras
import librosa
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Flatten, Activation, LSTM
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.models import Sequential
from keras.models import load_model
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [4]:
# Number of distinct target classes used for the one-hot labels
num_classes = 5
# Stride usable as signal[skip::] to shrink the data; not needed at the moment
skip = 5

# Placeholder spectogram width/height; replaced once the real sizes are measured
def_w = def_h = 0

# Directories containing the .wav files (training and validation)
path_to_audio = '../data_cut/'
path_to_validation = '/test_c/'

def parse_number(file_path):
  """Return the number encoded in an audio file's name.

  Only the final path component is inspected, so digits that appear in
  directory names cannot leak into the label. All digits of the file name
  are concatenated and parsed as one integer
  (e.g. '../data_cut/nino-chan-1f.wav' -> 1).

  Args:
    file_path: Path to the audio file.

  Returns:
    The integer formed by the digits of the file name.

  Raises:
    ValueError: If the file name contains no digits.
  """
  file_name = os.path.basename(file_path)
  digits = ''.join(ch for ch in file_name if ch.isdigit())
  if not digits:
    raise ValueError('No digits found in file name: {!r}'.format(file_path))
  return int(digits)

def list_of_audios(dir_path, seed=None):
  """Return shuffled (file_path, correct number) tuples for the .wav files in a directory.

  Args:
    dir_path: Directory to search; a trailing path separator is optional.
    seed: Optional seed for a reproducible shuffle. When None, the shared
      state of the ``random`` module is used.

  Returns:
    List of ``(file_path, correct_number)`` tuples in shuffled order.
  """
  # os.path.join keeps the pattern inside dir_path even without a trailing slash
  pattern = os.path.join(dir_path, '*.wav')
  # glob returns files in arbitrary order; sort so a seeded shuffle is repeatable
  paths = sorted(glob.glob(pattern))
  shuffler = random if seed is None else random.Random(seed)
  shuffler.shuffle(paths) # Shuffled data is better for training
  return [(path, parse_number(path)) for path in paths]

# One row per audio file: its path and the number parsed from its name
df = pd.DataFrame(
  data=list_of_audios(path_to_audio),
  columns=['file_name', 'correct'],
)
df.head()

Unnamed: 0,file_name,correct
0,../data_cut/nino-chan-1f.wav,1
1,../data_cut/levan-gela-4h.wav,4
2,../data_cut/rezi-mesh-2c.wav,2
3,../data_cut/nanuka-altu-5b.wav,5
4,../data_cut/rezi-mesh-2a.wav,2


In [11]:
def audios_to_spectograms(file_names):
  """Compute an MFCC matrix for every audio file.

  Args:
    file_names: Iterable of audio file paths. It must support ``len()``
      once 50 or more files are processed, because the progress log prints
      the total count.

  Returns:
    Tuple ``(x, shapes)`` where ``x`` is the list of MFCC matrices in input
    order and ``shapes`` is the set of distinct matrix shapes encountered.
  """
  x, shapes = [], set()

  # Count from 1 so the running total can be logged directly
  for count, audio_file in enumerate(file_names, start=1):
    signal, sampling_rate = librosa.load(audio_file)
    # Pass by keyword: librosa 0.10+ no longer accepts these arguments positionally
    matrix = librosa.feature.mfcc(y=signal, sr=sampling_rate)

    x.append(matrix)
    shapes.add(matrix.shape)
    if count % 50 == 0:
      print('{} analyzed out of {}'.format(count, len(file_names))) # Log progress

  return x, shapes

def choose_max_shapes(shapes):
  """Return the largest first and second dimension found across all shapes.

  Both maxima start from 0, so an empty collection yields ``(0, 0)``.
  """
  all_dims = list(shapes)
  # Prepend 0 as the floor value for each axis
  first_axis = [0] + [dims[0] for dims in all_dims]
  second_axis = [0] + [dims[1] for dims in all_dims]
  return max(first_axis), max(second_axis)

# Convert every listed audio file to an MFCC matrix and collect the distinct shapes
matrices, shapes = audios_to_spectograms(df['file_name'])
print('Different shapes:', shapes)
# Replace the placeholder (0, 0) defaults with the largest dimensions found in the data
def_w, def_h = choose_max_shapes(shapes)
print('Every spectogram should be size of:', (def_w, def_h))

50 analyzed out of 680
100 analyzed out of 680
150 analyzed out of 680
200 analyzed out of 680
250 analyzed out of 680
300 analyzed out of 680
350 analyzed out of 680
400 analyzed out of 680
450 analyzed out of 680
500 analyzed out of 680
550 analyzed out of 680
600 analyzed out of 680
650 analyzed out of 680
Different shapes: {(20, 20), (20, 31), (20, 28), (20, 24)}
Every spectogram should be size of: (20, 31)


In [12]:
def pad_spectogram(matrix, target_h=None):
  """Zero-pad a 2-D matrix along axis 1 until it is ``target_h`` columns wide.

  The padding is split around the original content: ``diff // 2`` zero
  columns go in front and the remaining ones at the end. Only axis 1 is
  padded; axis 0 is assumed to be uniform already (the MFCC matrices in
  this notebook all have 20 rows).

  Args:
    matrix: 2-D numpy array.
    target_h: Desired size of axis 1. Defaults to the module-level ``def_h``.

  Returns:
    The padded array, or ``matrix`` itself, unchanged, when it is already
    at least ``target_h`` columns wide.
  """
  if target_h is None:
    target_h = def_h

  diff = target_h - matrix.shape[1]
  if diff <= 0:
    return matrix

  rows = matrix.shape[0]
  # Half of the difference goes in the beginning, the rest in the end
  front = np.zeros((rows, diff // 2), dtype=float)
  back = np.zeros((rows, diff - diff // 2), dtype=float)
  return np.concatenate((front, matrix, back), axis=1)

# Pad every MFCC matrix to the common size and collect them into one array
x = np.array(list(map(pad_spectogram, matrices)))
print(x.shape)

(680, 20, 31)


In [13]:
# One-hot encode the correct numbers: label n sets position n-1 of its row
one_hot_rows = []
for num in df['correct'].values:
  row = [0] * (num-1)
  row.append(1)
  row.extend([0] * (num_classes - num))
  one_hot_rows.append(row)
y = np.matrix(one_hot_rows)
print(y)

[[1 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 0 0]
 ...
 [0 0 0 1 0]
 [0 1 0 0 0]
 [0 0 1 0 0]]


In [14]:
# Add a trailing axis of size 1 to every sample (presumably the channel axis
# for the Conv2D layers imported at the top -- confirm against the model)
x_r = x[..., np.newaxis]
# Keep the labels 2-D: (samples, classes). The previous y.reshape(*y.shape, 1)
# only stayed 2-D because np.matrix cannot hold three dimensions; np.asarray
# yields a plain 2-D ndarray whether y is an np.matrix or an ndarray.
y_r = np.asarray(y)
#x_r = (x_r - x_r.mean()) / x_r.std()
input_shape = x_r.shape[1:]
print(x_r.shape, y_r.shape, input_shape)

(680, 20, 31, 1) (680, 5) (20, 31, 1)
