# CNN for audio classification

##### SOURCES
[1] [CNN for audio (MEDIUM)](https://towardsdatascience.com/cnns-for-audio-classification-6244954665ab)  
[2] [Types of DCT](https://docs.scipy.org/doc/scipy/reference/generated/scipy.fftpack.dct.html#scipy.fftpack.dct)

Default sampling rate of `librosa` is 22050, while for `scipy` is 44100. This implies that we have half of the length of the sequence with the former library (110250 against 220500).  

In [1]:
import os
import sys
import torch
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from scipy.io import wavfile
from tensorflow import keras
import matplotlib.pyplot as plt

import evaluation
import CNN_support as cnns

%load_ext autoreload
%autoreload 2

In [2]:
data = pd.read_csv('.\\data\\meta\\esc50.csv')
data.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [None]:
#pppp = cnns.SoundPreprocessing(n_fft = 1000, n_mfcc = 10, sr = 441000, max_size = (200, 1000))

#W, Z = pppp.get_features(df = data, filepath = ".\\data\\audio")

In [None]:
s, a = wavfile.read(".\\data\\audio\\{}".format(data.loc[10, "filename"]))
a = a.astype(np.float32)

In [None]:
mfcc_try = librosa.feature.mfcc(y = a, sr = s, hop_length = 512, n_mfcc = 60)
mfcc_try = np.divide(mfcc_try, np.linalg.norm(mfcc_try))
chromogram_try = librosa.feature.chroma_stft(y = a, sr = s, hop_length = 512,
                                             win_length = 1024, n_chroma = 60)

## Preprocessing

In [3]:
# Get data for CNN
X = []
y = np.zeros(shape = (len(data), 1))

for i in data.index:
    
    sr, aud = wavfile.read(".\\data\\audio\\{}".format(data.loc[i, "filename"]))
    aud = aud.astype(np.float32)
    
    MFCC = librosa.feature.mfcc(y = aud, sr = sr, hop_length = 512, n_mfcc = 60)
    chromagram = librosa.feature.chroma_stft(y = aud.astype(np.float32), sr = sr,
                                             hop_length = 512, win_length = 1024,
                                             n_chroma = 60)
    delta = librosa.feature.delta(MFCC)
    
    instance = np.dstack((MFCC, chromagram, delta))
    
    X += [instance]
    
    y[i] = data.loc[i, "target"]
    
X = np.array(X)

  return f(*args, **kwargs)


In [4]:
# Data Augmentation
# SOURCE: https://medium.com/@makcedward/data-augmentation-for-audio-76912b01fdf6

np.random.seed(42)
indexed_samples = np.random.choice(X.shape[0], size = 4000,
                                   replace = True)
np.random.seed(101)
randn_seeds = np.random.choice(5000, size = 4000,
                               replace = False)

new_X = []
new_y = np.zeros(shape = (len(indexed_samples), 1))
for n, i in enumerate(indexed_samples):
    
    sr_sample, sample = wavfile.read(".\\data\\audio\\{}".format(data.loc[i, "filename"]))
    sample = sample.astype(np.float32)
    
    if n%5 == 0:
        # NOISE INJECTION
        np.random.seed(randn_seeds[n])
        noise = np.random.randn(len( sample ))
        augmented_data = (sample + noise).astype(np.float32)
        
    elif n%5 == 1:
        # TIME SHIFT: right shift
        np.random.seed(randn_seeds[n])
        shift = -1*np.random.randint(sr_sample * 0.2) # 0.2 length of sequence max
        augmented_data = np.roll(sample, shift)
        # Set to silence for heading/ tailing
        augmented_data[shift:] = 0
        
    elif n%5 == 2:
        # PITCH SHIFT: shift down by 3
        augmented_data = librosa.effects.pitch_shift(y = sample, sr = sr_sample,
                                                     n_steps = 3)
    elif n%5 == 3:
        # SPEED SHIFT: faster
        augmented_data = librosa.effects.time_stretch(y = sample, rate = 1.2)
        augmented_data = np.append(augmented_data,
                                   np.zeros(shape = len(sample) - len(augmented_data)))
    else:
        # SPEED SHIFT: slower (returns longer array)
        augmented_data = librosa.effects.time_stretch(y = sample, rate = 0.8)
        augmented_data = augmented_data[:len(sample)]
    
    
    new_MFCC = librosa.feature.mfcc(y = augmented_data, sr = sr_sample,
                                    hop_length = 512, n_mfcc = 60)
    new_chromagram = librosa.feature.chroma_stft(y = augmented_data, sr = sr_sample,
                                                 hop_length = 512, win_length = 1024,
                                                 n_chroma = 60)
    new_delta = librosa.feature.delta(new_MFCC)
    
    new_instance = np.dstack((new_MFCC, new_chromagram, new_delta))
    
    new_X += [new_instance]
    new_y[n] = y[i]
    
new_X = np.array(new_X)

  return f(*args, **kwargs)


In [5]:
# Get stuff together
X = np.vstack((X, new_X))
y = np.vstack((y, new_y))

In [6]:
X.shape, y.shape

((6000, 60, 431, 3), (6000, 1))

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .75, random_state = 42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size = .8, random_state = 42)

print("Training examples: {}".format(y_train.shape[0]))
print("Validation examples: {}".format(y_valid.shape[0]))
print("Test examples: {}".format(y_test.shape[0]))
print()
print("Input shape: {}".format(X_train[0].shape))

Training examples: 3600
Validation examples: 900
Test examples: 1500

Input shape: (60, 431, 3)


In [48]:
# -------------------- #
# LOAD DATA FROM FILES #
# -------------------- #

X_train = np.vstack((torch.load(".//data/CNN_X_train_1.pt"),
                     torch.load(".//data/CNN_X_train_2.pt"),
                     torch.load(".//data/CNN_X_train_3.pt")))
X_valid = torch.load(".//data/CNN_X_valid.pt")
X_test = torch.load(".//data/CNN_X_test.pt")
y_train = torch.load(".//data/CNN_y_train.pt")
y_valid = torch.load(".//data/CNN_y_valid.pt")
y_test = torch.load(".//data/CNN_y_test.pt")

In [61]:
sys.getsizeof(model1)

48

## Algo tries

In [62]:
model1 = keras.models.Sequential([keras.layers.Conv2D(filters = 40, kernel_size = [57, 63],
                                                      padding = "same", activation = "relu",
                                                      input_shape = [60, 431, 3]),
                                  keras.layers.MaxPool2D(pool_size = (12, 10), strides = (1, 3)),
                                  keras.layers.Dropout(rate = .5),
                                  keras.layers.Conv2D(filters = 40, kernel_size = (1, 3),
                                                      padding = "same", activation = "relu"),
                                  keras.layers.MaxPool2D(pool_size = (1, 3), strides = (2, 5)),
                                  keras.layers.Flatten(),
                                  #keras.layers.Dense(5000, activation = "relu"),
                                  #keras.layers.Dropout(rate = .5),
                                  keras.layers.Dense(50, activation = "relu"),
                                  keras.layers.Dropout(rate = .5),
                                  keras.layers.Dense(50, activation = "softmax") ], name = "M1")

model1.compile(loss = "sparse_categorical_crossentropy", metrics = ["accuracy"],
               optimizer = keras.optimizers.Adam(learning_rate = 1e-4))

print(model1.summary())

Model: "M1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 60, 431, 40)       430960    
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 49, 141, 40)      0         
 g2D)                                                            
                                                                 
 dropout_14 (Dropout)        (None, 49, 141, 40)       0         
                                                                 
 conv2d_13 (Conv2D)          (None, 49, 141, 40)       4840      
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 25, 28, 40)       0         
 g2D)                                                            
                                                                 
 flatten_6 (Flatten)         (None, 28000)             0        

In [None]:
batch_size1 = int(X_train.shape[0]/10)
history1 = model1.fit(X_train, y_train, epochs = 100, batch_size = batch_size1,
                      validation_data = (X_valid, y_valid), verbose = 0,
                      callbacks = [keras.callbacks.EarlyStopping(patience = 5)])

evaluation.plot_loss(history1)
evaluation.plot_accuracy(history1)

scores1 = model1.evaluate(X_test, y_test, verbose = 2)
print("="*71)
print("Accuracy on test: {:.2f}%".format(scores1[1]*100))
print("Memory used: {:.1f} Mb".format(
    evaluation.keras_model_memory_usage_in_bytes(model = model1,
                                                 batch_size = batch_size1)/1e6))