In [34]:
import os
import shutil
import warnings

import librosa
import numpy as np
import pandas as pd
from keras import backend as K
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, ModelCheckpoint, TensorBoard)
from keras.layers import (Input, Dense, Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, GlobalMaxPool2D, MaxPool2D, concatenate, Activation)
from keras.utils import Sequence, to_categorical
from sklearn.model_selection import StratifiedKFold

# Hide FutureWarning messages for the rest of the session so that cell
# outputs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning) 

In [35]:
# Change this to True for full dataset and learning.
# When False, later cells truncate train/test to their first 2000 rows and
# build a quick configuration (2 folds, 1 epoch).
COMPLETE_RUN = False
# Directory holding the CSV files and the audio sub-folders read below.
data_path = '../data'

In [36]:
# Training metadata: one row per clip with its file name and label string.
train = pd.read_csv(data_path + "/train_curated.csv")
# The sample submission doubles as the test index; its columns after the
# first are used below as the list of target class names.
test = pd.read_csv(data_path + "/sample_submission.csv")

In [37]:
# For train, "classes" counts distinct values of the raw `labels` string;
# for test it counts the submission columns after the first one.
print("Number of train examples=", train.shape[0], "  Number of classes=", len(set(train['labels'])))
print("Number of test examples=", test.shape[0], "  Number of classes=", len(set(test.columns[1:])))

Number of train examples= 4970   Number of classes= 213
Number of test examples= 1120   Number of classes= 80


In [38]:
# Number of distinct `labels` strings in the training metadata.
# NOTE(review): the remark that used to sit on this line ("less than 7% of the
# original number of labels that describes more than 83% of all the clips")
# looks carried over from a different dataset - confirm the figures before
# relying on them.
len(set(train['labels']))

213

In [39]:
# Keep only the rows whose `labels` value is exactly one of the class names
# taken from the sample-submission columns; every other row (e.g. rows whose
# label string is not a single class name) is ignored for now.
# .copy() gives `train` its own data, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
train = train[train['labels'].isin(test.columns[1:])].copy()
print(len(train))

4269


In [40]:
# Number of clips per label, as a one-column frame named 'counts'.
# size() counts rows per group regardless of how many other columns `train`
# has, so this cell keeps working after extra columns (e.g. label_idx) are added.
category_group = train.groupby('labels').size().to_frame('counts')
print(len(category_group))

74


In [41]:
# The Configuration object stores the learning parameters that are shared
# between data generators, models, and training functions. Anything that is
# global as far as the training is concerned can become part of it.

class Config(object):
    """Container for audio, model and training hyper-parameters.

    Besides storing the constructor arguments it derives:

    * ``audio_length`` - ``sampling_rate * audio_duration`` (samples per clip).
    * ``dim`` - the per-example input shape: ``(n_mfcc, frames, 1)`` when
      ``use_mfcc`` is true, with ``frames = 1 + floor(audio_length / 512)``,
      otherwise ``(audio_length, 1)``.

    Note that the default for ``n_classes`` is computed once, when the class
    is defined, from the ``category_group`` frame existing at that moment.
    """

    def __init__(self,
                 sampling_rate=16000,
                 audio_duration=2, 
                 n_classes=len(category_group),
                 use_mfcc=False,
                 n_folds=10,
                 learning_rate=0.0001, 
                 max_epochs=50,
                 n_mfcc=20):
        # Audio / feature settings.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc

        # Model / training settings.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Derived quantities.
        audio_length = sampling_rate * audio_duration
        self.audio_length = audio_length
        if use_mfcc:
            n_frames = 1 + int(np.floor(audio_length / 512))
            self.dim = (n_mfcc, n_frames, 1)
        else:
            self.dim = (audio_length, 1)

In [42]:
def get_2d_dummy_model(config):
    """Build and compile a minimal baseline classifier.

    The network is a global max-pool over the input followed by a single
    softmax layer with ``config.n_classes`` units.

    Args:
        config: object providing ``dim`` (first two entries give the input
            height and width), ``n_classes`` and ``learning_rate``.

    Returns:
        The compiled Keras model (Adam optimizer, categorical cross-entropy
        loss, 'acc' metric).
    """
    inp = Input(shape=(config.dim[0], config.dim[1], 1))
    pooled = GlobalMaxPool2D()(inp)
    out = Dense(config.n_classes, activation=softmax)(pooled)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model


def get_2d_conv_model(config):
    """Build and compile the 2D convolutional classifier.

    Architecture: four identical stages of Conv2D(32 filters, 4x10 kernel,
    "same" padding) -> BatchNormalization -> ReLU -> MaxPool2D, followed by
    Flatten -> Dense(64) -> BatchNormalization -> ReLU -> softmax over
    ``config.n_classes``.

    Args:
        config: object providing ``dim`` (first two entries give the input
            height and width), ``n_classes`` and ``learning_rate``.

    Returns:
        The compiled Keras model (Adam optimizer, categorical cross-entropy
        loss, 'acc' metric).
    """
    inp = Input(shape=(config.dim[0], config.dim[1], 1))

    # Four conv stages; each one ends with a max-pool using the layer's defaults.
    x = inp
    for _ in range(4):
        x = Convolution2D(32, (4,10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)

    # Classification head.
    x = Flatten()(x)
    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    out = Dense(config.n_classes, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

In [43]:
# The labels are still raw strings here; they have to be converted to integer
# indices before they can be one-hot encoded for training.
train.head(1)

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark


In [44]:
# Class names in order of first appearance in `train`; the position of a name
# in LABELS is its integer class index.
LABELS = list(train['labels'].unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name. New frames are bound instead of mutating in
# place, so no other reference to the old frames changes behind its back.
train = train.set_index("fname")
test = test.set_index("fname")

# assign() returns a fresh frame, which avoids the SettingWithCopyWarning that
# a column assignment on the filtered `train` frame would raise.
train = train.assign(label_idx=train['labels'].map(label_idx))

# Quick-run mode: work on the first 2000 rows only.
if not COMPLETE_RUN:
    train = train[:2000]
    test = test[:2000]

In [55]:
# Raw labels next to their integer indices; the frame is now indexed by file name.
train.head(1)

Unnamed: 0_level_0,labels,label_idx
fname,Unnamed: 1_level_1,Unnamed: 2_level_1
0006ae4e.wav,Bark,0


In [46]:
# Both variants use 2-second clips at 44.1 kHz with 40 MFCC coefficients.
if COMPLETE_RUN:
    # Full run: 10 folds, learning rate 0.001, Config's default epoch budget.
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=10,
                    learning_rate=0.001, use_mfcc=True, n_mfcc=40)
else:
    # Quick run: 2 folds and a single epoch; the learning rate is left at
    # Config's default.
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=2,
                    max_epochs=1, use_mfcc=True, n_mfcc=40)

In [58]:
def prepare_data(df, config, data_dir):
    """Load every clip listed in ``df.index`` and return its MFCC features.

    Each clip is loaded at ``config.sampling_rate`` and brought to exactly
    ``config.audio_length`` samples: longer clips are cropped at a random
    offset, shorter ones are zero-padded with a random split of the padding
    between front and back. The MFCC matrix of the fixed-length signal is
    then stored with a trailing channel axis.

    Args:
        df: frame whose index holds the audio file names.
        config: object providing ``dim``, ``audio_length``, ``sampling_rate``
            and ``n_mfcc``. ``dim`` must match the MFCC output shape.
        data_dir: directory containing the audio files (a trailing slash is
            accepted but not required).

    Returns:
        Array of shape ``(len(df), config.dim[0], config.dim[1], 1)``.

    Note:
        Offsets come from NumPy's global random state, so the result is only
        reproducible if that state is seeded by the caller.
    """
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.audio_length
    for i, fname in enumerate(df.index):
        # os.path.join works whether or not data_dir ends with a separator.
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        if len(data) > input_length:
            # Random crop of exactly input_length samples.
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:offset + input_length]
        elif len(data) < input_length:
            # Random zero-padding: `offset` zeros in front, the rest behind.
            missing = input_length - len(data)
            offset = np.random.randint(missing)
            data = np.pad(data, (offset, missing - offset), "constant")

        # The audio must be passed as `y=`: recent librosa releases reject a
        # positional signal argument.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        X[i,] = np.expand_dims(data, axis=-1)
    return X

In [67]:
# Sanity check on one complete (un-cropped) training clip: load it and compute
# its MFCC matrix. The signal is passed as `y=` because recent librosa
# releases make that argument keyword-only.
data, _ = librosa.load(data_path + '/train_curated/0006ae4e.wav', sr=config.sampling_rate, res_type="kaiser_fast")
data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)


In [70]:
# MFCC matrix shape for the whole clip, before any cropping or padding.
data.shape

(40, 607)

In [71]:
# Per-example input shape derived by Config; prepare_data fills arrays of this
# shape after fixing each clip to config.audio_length samples.
config.dim

(40, 173, 1)

In [16]:
%time
X_train = prepare_data(train, config, data_path + '/train_curated/')
X_test = prepare_data(test, config, data_path + '/test/')
y_train = to_categorical(train.label_idx.astype('str'), num_classes=config.n_classes)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.25 µs


In [78]:
# Integer class index of every training clip, keyed by file name.
train.label_idx

fname
0006ae4e.wav     0
0019ef41.wav     1
001ec0ad.wav     2
0026c7cb.wav     3
0026f116.wav     2
003be5db.wav     4
004ca909.wav     5
00713ec2.wav     6
00975c2a.wav     7
009ca29f.wav     8
00b0b76f.wav     9
00c17dd2.wav    10
00c40a6d.wav    11
00c4e82c.wav    12
00c7ff40.wav    13
00c91dfc.wav     8
00ffa0d2.wav     5
0110ba24.wav    14
012c15b5.wav    15
013b01b9.wav    16
01565c33.wav    17
015a50b9.wav     0
0164cba5.wav    18
0175a379.wav    19
019234bc.wav    20
0199c0a0.wav    21
02171503.wav    22
0217540b.wav    23
02286d70.wav    24
02356bfd.wav     3
                ..
7533287e.wav    37
754307e6.wav     5
7554eee4.wav     5
75ba74db.wav    30
75c81fe6.wav    13
75e092b5.wav    25
763d2655.wav    29
764b5c87.wav    19
7654ee3e.wav    66
766e8319.wav     8
766f150f.wav    28
76795e3e.wav    48
76841420.wav     4
76a91e03.wav    33
76ad78c3.wav    52
76b73017.wav    17
76bf1389.wav    52
76cb5441.wav    18
76d21a78.wav    17
76d32c4a.wav    51
76daf232.wav    46
76f877

In [74]:
# (number of training clips,) + config.dim
X_train.shape

(2000, 40, 173, 1)

In [75]:
# (number of test clips,) + config.dim
X_test.shape

(1120, 40, 173, 1)

In [79]:
# (number of training clips, config.n_classes)
# NOTE(review): the second dimension should agree with len(LABELS), because
# predictions are mapped back to names through LABELS - confirm after a
# fresh Restart & Run All.
y_train.shape

(2000, 80)

In [17]:
# Normalize data: standardise every feature position with statistics computed
# over the training examples, and apply the same transform to the test set.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# Positions with zero variance would otherwise divide by zero and produce
# NaN/inf; dividing by 1 there leaves them at (value - mean).
std[std == 0] = 1.0

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

In [18]:
# Training 2D Conv on MFCC
#
# Stratified k-fold training. For every fold a fresh model is trained, the
# weights with the best validation loss are restored, and predictions for the
# whole training set and the test set are written to PREDICTION_FOLDER together
# with a per-fold submission CSV.

PREDICTION_FOLDER = "predictions_2d_conv"
# makedirs(exist_ok=True) replaces the check-then-create pair.
os.makedirs(PREDICTION_FOLDER, exist_ok=True)
# Remove this experiment's previous TensorBoard logs.
if os.path.exists('logs/' + PREDICTION_FOLDER):
    shutil.rmtree('logs/' + PREDICTION_FOLDER)

skf = StratifiedKFold(n_splits=config.n_folds)

for i, (train_split, val_split) in enumerate(skf.split(train.index, train.label_idx)):
    # Drop the previous fold's graph/session state before building a new model.
    K.clear_session()
    X, y, X_val, y_val = X_train[train_split], y_train[train_split], X_train[val_split], y_train[val_split]
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = ModelCheckpoint('best_%d.h5'%i, monitor='val_loss', verbose=1, save_best_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%i'%i, write_graph=True)
    callbacks_list = [checkpoint, early, tb]
    print("#"*50)
    print("Fold: ", i)
    model = get_2d_conv_model(config)
    history = model.fit(X, y, validation_data=(X_val, y_val), callbacks=callbacks_list, 
                        batch_size=64, epochs=config.max_epochs)
    # Restore the best checkpoint rather than the last epoch's weights.
    model.load_weights('best_%d.h5'%i)

    # Save train predictions (for ALL training rows, including the ones this
    # fold was trained on).
    predictions = model.predict(X_train, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)

    # Save test predictions
    predictions = model.predict(X_test, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)

    # Make a submission file: the three highest-scoring labels per clip,
    # joined with spaces.
    top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    # assign() binds a new frame, so the column is not set on a slice of the
    # original frame (which raises SettingWithCopyWarning in quick-run mode).
    test = test.assign(label=predicted_labels)
    test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)

##################################################
Fold:  0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 981 samples, validate on 1019 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 4.62285, saving model to best_0.h5
##################################################
Fold:  1
Train on 1019 samples, validate on 981 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 4.62418, saving model to best_1.h5


In [19]:
# Ensembling 2D Conv Predictions

# pred_list = []
# for i in range(config.n_folds):
#     pred_list.append(np.load("./predictions_2d_conv/test_predictions_%d.npy"%i))
# prediction = np.ones_like(pred_list[0])
# for pred in pred_list:
#     prediction = prediction*pred
# prediction = prediction**(1./len(pred_list))
# # Make a submission file
# top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
# predicted_labels = [' '.join(list(x)) for x in top_3]
# test = pd.read_csv(data_path + '/sample_submission.csv')
# test['label'] = predicted_labels
# test[['fname', 'label']].to_csv("2d_conv_ensembled_curated_submission.csv", index=False)