First, install the Kaggle API client and copy the API token (`kaggle.json`) to the location where the client looks for it.

In [None]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Then, download dataset with sounds

In [None]:
!kaggle datasets download -d chrisfilo/urbansound8k

Unpacking

In [None]:
!unzip urbansound8k.zip

Deleting archive

In [None]:
!rm urbansound8k.zip

Imports

In [None]:
import pandas as pd
import numpy as np

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout, BatchNormalization, Conv1D, MaxPooling1D, SeparableConv2D, Input, LSTM, Activation
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.applications import VGG16
from tensorflow import keras

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
import os
import librosa
import librosa.display 
from tqdm.notebook import tqdm

Reading dataframe with folds and classIDs

In [None]:
df = pd.read_csv("UrbanSound8K.csv")
df

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [None]:
df["class"].value_counts()

dog_bark            1000
street_music        1000
engine_idling       1000
air_conditioner     1000
jackhammer          1000
drilling            1000
children_playing    1000
siren                929
car_horn             429
gun_shot             374
Name: class, dtype: int64

In [None]:
# Classify one clip with the current `model` (it must already exist in the
# notebook session, e.g. from the load_model cell) and show the matching
# classID / class row from the metadata frame.
y, sr = librosa.load("air_conditioner.wav", sr=22050)
# Pass the signal and sample rate by keyword: librosa >= 0.10 rejects them as
# positional arguments, while older releases accept the keyword form too.
mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=100).T, axis=0)
# One sample of 100 averaged coefficients, laid out as a (20, 5) grid.
mfcc = mfcc.reshape((1, 20, 5))
predicts = model.predict(mfcc)
cl = np.argmax(predicts, axis=1)
classes = df[["classID", "class"]]
# First metadata row carrying the predicted class id.
classes[classes.classID == cl[0]].iloc[0]

classID           3
class      dog_bark
Name: 0, dtype: object

At this stage I augment the data with pitch shifting and time stretching and, at the same time, extract MFCCs (mel-frequency cepstral coefficients) with n_mfcc=100. Because computing other features (mel spectrogram, chroma STFT, etc.) takes a very long time, I use only MFCCs for the classification task. It is worth noting that MFCCs most often give the highest accuracy compared with the other features.

In [None]:
# Fold directory names indexed by fold number; the "x" placeholder at index 0
# makes the list 1-based, so folds[i] is the directory of fold i.
folds = ["x", "fold1/", "fold2/", "fold3/", "fold4/", "fold5/", "fold6/", "fold7/", "fold8/", "fold9/", "fold10/"]
# Directory prefix that data_processing joins with "fold<N>/" to find the audio files.
savedir = '/content/'

def data_processing(fold):
  """Extract mean-MFCC features for every clip of one fold, with augmentation.

  For each audio file found in ``savedir + "fold<fold>/"`` the function emits
  15 feature vectors, in this order: the original signal, 4 pitch-shifted
  copies (-2, -1, +1, +2 steps), 2 time-stretched copies (rates 0.9 and 1.1)
  and the 8 pitch-shifted-then-stretched combinations.

  Args:
    fold: Fold number as stored in the ``fold`` column of the global ``df``.

  Returns:
    A ``(mfcc, labels)`` tuple of equally long lists. Each ``mfcc`` entry is a
    vector of 100 coefficients averaged over time; each ``labels`` entry is
    the ``classID`` of the clip the vector was derived from.
  """
  pitch_steps = [-2, -1, 1, 2]
  stretch_rates = [0.9, 1.1]

  mfcc = []
  labels = []
  files = df[df["fold"] == fold]

  def _audio_features(y, sr):
    # Keyword arguments: librosa >= 0.10 no longer accepts these positionally.
    mfcc.append(np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=100).T, axis=0))

  fold = f"fold{fold}/"
  for wav in tqdm(os.listdir(savedir+fold)):
    label = files.loc[files['slice_file_name'] == wav, 'classID']
    if label.empty:
      # Directory entries without a metadata row (e.g. .DS_Store) are not clips.
      continue
    label = label.iloc[0]

    def _add(signal):
      # Keep the feature list and the label list in lockstep.
      _audio_features(signal, sr)
      labels.append(label)

    y, sr = librosa.load(savedir+fold+wav)
    _add(y)

    # Pitch-shift once per step and keep the result for the combined pass below.
    shifted = []
    for steps in pitch_steps:
      y_aug = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)
      shifted.append(y_aug)
      _add(y_aug)

    for rate in stretch_rates:
      _add(librosa.effects.time_stretch(y, rate=rate))

    for y_shifted in shifted:
      for rate in stretch_rates:
        _add(librosa.effects.time_stretch(y_shifted, rate=rate))

  return mfcc, labels

# Destination for the cached per-fold feature arrays.
# NOTE(review): this prefix differs from the '/content/drive/MyDrive/ESCData/'
# path the loading cells read from -- confirm the Drive mount point.
save_dir = "/gdrive/MyDrive/ESCData/"
# range(10, 11) processes fold 10 only; use range(1, 11) to rebuild every fold.
for i in range(10, 11):
  mfcc, labels = data_processing(i)  
  mfcc, labels = np.array(mfcc), np.array(labels)
  # np.savez appends ".npz", producing <save_dir>fold<i>/mfcc.npz and labels.npz.
  np.savez(f"{save_dir}{folds[i]}mfcc", mfcc)
  np.savez(f"{save_dir}{folds[i]}labels", labels)  



After augmentation and processing, I have 10 pairs of files (MFCCs and labels, one pair per fold) on my Google Drive. Processing even one fold takes about 40 minutes, so it is better to save the processed data right away.

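Each clip yields 15 feature vectors (the original, 4 pitch-shifted, 2 time-stretched and 8 pitch-shifted-and-stretched versions), so augmentation turns the 8,732 clips into 130,980 samples.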

In [None]:
# Load the cached MFCC/label arrays: folds 1-9 become the training set and
# fold 10 the held-out test set.
# tensorflow.keras.utils is the import path used by the rest of the notebook;
# keras.utils.np_utils no longer exists in current Keras releases.
from tensorflow.keras.utils import to_categorical

loaddir = '/content/drive/MyDrive/ESCData/'
folds = ["fold1/", "fold2/", "fold3/", "fold4/", "fold5/", "fold6/", "fold7/", "fold8/", "fold9/", "fold10/"]
x_train = []
y_train = []
# Iterate over the names directly: this list is 0-based, so the previous
# range(1, 9) indexing skipped fold1 and left it out of the data entirely.
for fold_dir in folds[:-1]:
  data = np.load(loaddir+fold_dir+"mfcc.npz", allow_pickle=True)
  labels = np.load(loaddir+fold_dir+"labels.npz", allow_pickle=True)
  x_train.append(data["arr_0"])
  y_train.append(labels["arr_0"])

x_test = np.load(loaddir+"fold10/"+"mfcc.npz", allow_pickle=True)["arr_0"]
# 100 averaged coefficients per sample, laid out as a (20, 5) grid.
x_test = x_test.reshape((x_test.shape[0], 20, 5))
y_test = np.load(loaddir+"fold10/"+"labels.npz", allow_pickle=True)["arr_0"]

# np.concatenate accepts the list of per-fold arrays directly.
x_train = np.concatenate(x_train, axis=0).astype(np.float32)
x_train = x_train.reshape((x_train.shape[0], 20, 5))

y_train = np.concatenate(y_train, axis=0).astype(np.float32)

To use StratifiedKFold, I need all the data in one piece.


In [None]:
# Join the training folds and the held-out fold back into one feature array
# and one label array.
x = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [None]:
model = keras.models.load_model("/content/drive/MyDrive/ESCData/esc_model9.hdf5")

In [None]:
# 10-fold stratified cross-validation over the merged data set (x, y): a fresh
# model is trained per split and its [loss, accuracy] on that split is kept.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Must be set before the loop: the checkpoint path below reads it, and
# assigning it after the loop raised NameError on a fresh kernel.
model_savedir = '/content/drive/MyDrive/ESCData/'

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
acc = []

# total= lets tqdm show progress, since split() is a generator without a length.
for i, (train_ind, test_ind) in enumerate(tqdm(fold.split(x, y), total=fold.get_n_splits())):
  x_train, y_train = x[train_ind], y[train_ind]
  x_test, y_test = x[test_ind], y[test_ind]

  # One-hot targets for the categorical cross-entropy loss.
  y_train = to_categorical(y_train, num_classes=10)
  y_test = to_categorical(y_test, num_classes=10)

  # One checkpoint file per split (esc_model0.hdf5 ... esc_model9.hdf5).
  checkpoint = ModelCheckpoint(f"{model_savedir}esc_model{i}.hdf5", save_best_only=True)
  model = create_model()
  model.fit(x_train, y_train, batch_size=512, epochs=50, verbose=0, callbacks=[checkpoint], validation_data=(x_test, y_test))
  acc.append(model.evaluate(x_test, y_test))

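StratifiedKFold results:<br>
Feed-forward: 81.5%<br>
LSTM: 99%<br>
CNN: 98.6%<br>
Results are much better with stratified folds and shuffling because the classes are distributed more evenly. Note, however, that shuffling at the sample level also puts augmented copies of the same clip into both the training and the test split, so these figures are likely optimistic.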

Here I use the default folds that ship with the dataset, without any modifications. Because of the poor data distribution across these folds, the accuracy of all estimators is much lower than with stratified k-fold.

In [None]:
# Leave-one-fold-out evaluation on the dataset's predefined folds for each
# architecture: train on nine folds, validate and score on the remaining one.
from sklearn.model_selection import KFold
# tensorflow.keras.utils is the import path used by the rest of the notebook;
# keras.utils.np_utils no longer exists in current Keras releases.
from tensorflow.keras.utils import to_categorical

loaddir = '/content/drive/MyDrive/ESCData/'
folds = ["fold1/", "fold2/", "fold3/", "fold4/", "fold5/", "fold6/", "fold7/", "fold8/", "fold9/", "fold10/"]

ind_folds = list(range(1, 11))
fold = KFold(10)
accuracy = []

reduce_lr = ReduceLROnPlateau(patience=3)
stopping = EarlyStopping(patience=6)

# Read every fold from disk once instead of once per model and per split.
fold_x = []
fold_y = []
for fold_dir in folds:
  fold_x.append(np.load(loaddir+fold_dir+"mfcc.npz", allow_pickle=True)["arr_0"].astype(np.float32))
  fold_y.append(np.load(loaddir+fold_dir+"labels.npz", allow_pickle=True)["arr_0"])

for create_neural in [create_lstm, create_model, create_feedforward]:
  model_scores = []
  # Without shuffling, KFold(10) over ten items holds out one fold index per split.
  for train_ind, test_ind in fold.split(ind_folds):
    x_train = np.concatenate([fold_x[i] for i in train_ind], axis=0)
    y_train = np.concatenate([fold_y[i] for i in train_ind], axis=0)
    y_train = to_categorical(y_train, num_classes=10)

    x_test = fold_x[test_ind[0]]
    y_test = to_categorical(fold_y[test_ind[0]], num_classes=10)

    model = create_neural()
    # Reshape the 100 coefficients to whatever this architecture expects,
    # e.g. (20, 5) for the LSTM and (100,) for the feed-forward net; a fixed
    # (20, 5) layout does not match the feed-forward input.
    sample_shape = tuple(model.input_shape[1:])
    x_train = x_train.reshape((x_train.shape[0],) + sample_shape)
    x_test = x_test.reshape((x_test.shape[0],) + sample_shape)

    model.fit(x_train, y_train, epochs = 50, batch_size = 512, validation_data=(x_test, y_test), callbacks=[reduce_lr, stopping], verbose=0)
    scores = model.evaluate(x_test, y_test)
    accuracy.append(scores)
    model_scores.append(scores[1])

  # Per-architecture mean accuracy over the ten held-out folds.
  print(f"{create_neural.__name__}: mean accuracy {np.mean(model_scores):.4f}")

# Overall mean accuracy across all architectures and folds.
accuracy = np.array(accuracy)
np.mean(accuracy[:, 1])

After several tests, I compared the accuracy of the models on the default folds:<br>
LSTM: 55.7%<br>
CNN: 62.7%<br>
Feed-forward: 55.9%<br>

As you can see, the CNN performs best on the default data distribution.

The feed-forward network is composed of two dense layers with ReLU activation, the first with 256 neurons and the second with 512 neurons.<br>
In every model I use the Adam optimizer with lr=1e-3, because the models show better results with that learning rate.

In [None]:
input_shape = (100, )
def create_feedforward():
  """Build and compile the fully connected baseline classifier.

  The network takes a flat vector of 100 features, passes it through two
  ReLU dense layers (256 and 512 units) and ends in a 10-way softmax.

  Returns:
    A compiled Sequential model (Adam, learning rate 1e-3, categorical
    cross-entropy loss, accuracy metric).
  """
  input_shape = (100, )
  model = Sequential()
  model.add(Dense(256, activation='relu', input_shape=input_shape))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(10, activation='softmax'))
  # `learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects.
  model.compile(optimizer = keras.optimizers.Adam(learning_rate=1e-3), loss = 'categorical_crossentropy', metrics = ['accuracy'])
  return model

The LSTM network is more complex. It contains two LSTM layers, with 128 and 64 units respectively.<br>
After each LSTM layer and each time-distributed dense layer I use batch normalization to standardize the inputs to the next layer. It helps the model generalize and accelerates training.<br>
After the batch normalization layers I use dropout, with rates from 0.3 up to 0.5; these layers prevent the model from overfitting. At each training step, every unit is dropped with probability equal to the rate and kept otherwise.<br>
After the LSTM layers come two dense layers in a time-distributed wrapper, with 256 and 512 units respectively. This wrapper applies a layer to every temporal slice of its input.<br>
After the time-distributed layers come the last three layers: one flatten layer and two dense layers, the first with ReLU and the last with softmax. The flatten layer turns the (time steps, features) output into a single vector per sample.

In [None]:
input_shape = (20, 5)
def create_lstm():
  """Build and compile the recurrent classifier.

  Layout: two sequence-returning LSTM layers (128 and 64 units), then two
  time-distributed ReLU dense layers (256 and 512 units). Batch normalization
  follows each of those layers; dropout (0.3, 0.4, 0.4) follows the second
  LSTM and both dense layers. The output is flattened and fed to an
  L2-regularized 256-unit dense layer, a 0.5 dropout and a 10-way softmax.

  Returns:
    A compiled Sequential model for inputs of shape (20, 5), using Adam
    (learning rate 1e-3), categorical cross-entropy loss and accuracy metric.
  """
  input_shape = (20, 5)
  time_distributed = keras.layers.TimeDistributed

  model = Sequential()
  model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
  model.add(time_distributed(layer=BatchNormalization()))
  model.add(LSTM(64, return_sequences=True))
  model.add(time_distributed(layer=BatchNormalization()))
  model.add(time_distributed(layer=Dropout(0.3)))

  model.add(time_distributed(layer=Dense(256, activation='relu')))
  model.add(time_distributed(layer=BatchNormalization()))
  model.add(time_distributed(layer=Dropout(0.4)))

  model.add(time_distributed(layer=Dense(512, activation='relu')))
  model.add(time_distributed(layer=BatchNormalization()))
  model.add(time_distributed(layer=Dropout(0.4)))

  model.add(Flatten())
  model.add(Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
  model.add(Dropout(0.5))
  model.add(Dense(10, activation='softmax'))
  # `learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects.
  model.compile(optimizer = keras.optimizers.Adam(learning_rate=1e-3), loss = 'categorical_crossentropy', metrics = ['accuracy'])
  return model

# Build one instance to inspect the layer stack and parameter counts.
model = create_lstm()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 20, 128)           68608     
_________________________________________________________________
time_distributed (TimeDistri (None, 20, 128)           512       
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 64)            49408     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 64)            256       
_________________________________________________________________
time_distributed_2 (TimeDist (None, 20, 64)            0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 20, 256)           16640     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 20, 256)           1

In [None]:
def create_model():
  """Build and compile the convolutional classifier.

  Layout: two blocks of 5x5 "same"-padded ReLU convolution (64, then 128
  filters), each followed by spatial dropout 0.2, batch normalization and
  2x2 max pooling. The flattened result goes through dropout 0.3, two
  L2-regularized ReLU dense layers (256 and 512 units, followed by dropout
  0.4 and 0.5 respectively) and a 10-way softmax.

  Returns:
    A compiled Sequential model for inputs of shape (20, 5, 1), using Adam
    (learning rate 1e-3), categorical cross-entropy loss and accuracy metric.
  """
  input_dim = (20, 5, 1)
  model = Sequential()
  model.add(Input(input_dim))

  # Convolutional feature extractor: the two blocks differ only in filter count.
  for filters in (64, 128):
    model.add(Conv2D(filters, (5, 5), padding='same', activation='relu'))
    model.add(keras.layers.SpatialDropout2D(0.2))
    model.add(BatchNormalization())
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))

  # Dense classification head.
  model.add(Flatten())
  model.add(keras.layers.Dropout(0.3))
  model.add(Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
  model.add(keras.layers.Dropout(0.4))
  model.add(Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
  model.add(keras.layers.Dropout(0.5))
  model.add(Dense(10, activation = "softmax"))
  # `learning_rate` replaces the deprecated `lr` alias, which Keras 3 rejects.
  model.compile(optimizer = keras.optimizers.Adam(learning_rate=1e-3), loss = 'categorical_crossentropy', metrics = ['accuracy'])
  return model