In [1]:
import pandas as pd
import numpy as np
from keras import losses, models, optimizers
from keras.layers import (Input, Dense, Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, GlobalMaxPool2D, MaxPool2D, concatenate, Activation)
from keras.utils import Sequence, to_categorical
from keras import backend as K
from keras.callbacks import (EarlyStopping, ModelCheckpoint, TensorBoard)
from keras.activations import relu, softmax

from sklearn.model_selection import StratifiedKFold

import librosa
import os
import shutil
import warnings

warnings.filterwarnings("ignore", category=FutureWarning) 

Using TensorFlow backend.


In [2]:
# Run-mode switch: True uses the full dataset and training schedule; cells further
# down check this flag and fall back to a truncated quick run when it is False.
COMPLETE_RUN = True

# Root directory of the competition files (CSV files and audio folders), relative to the notebook.
data_path = "../data"

In [3]:
# Load the noisy-subset label table and the sample submission; the latter supplies
# both the list of test files and the class columns.
train, test = (
    pd.read_csv(data_path + "/" + csv_name)
    for csv_name in ("train_noisy.csv", "sample_submission.csv")
)

In [4]:
# Peek at five random rows; no random_state is passed, so the rows differ between runs.
train.sample(5)

Unnamed: 0,fname,labels
134,01a21197.wav,Trickle_and_dribble
8164,68f8b50a.wav,Buzz
14011,b41e77c4.wav,Screaming
8216,69933c48.wav,"Stream,Waves_and_surf"
5439,4612a9e9.wav,Microwave_oven


In [5]:
# Dataset sizes. For `train`, the "classes" figure counts distinct values of the raw
# `labels` column, so a comma-joined multi-label string counts as its own value.
# For `test`, it counts every column after the first one.
print("Number of train examples=", train.shape[0], "  Number of classes=", len(set(train['labels'])))
print("Number of test examples=", test.shape[0], "  Number of classes=", len(set(test.columns[1:])))

Number of train examples= 19815   Number of classes= 1168
Number of test examples= 1120   Number of classes= 80


In [6]:
# Column index of the sample submission: `fname` followed by the class-name columns.
test.columns

Index(['fname', 'Accelerating_and_revving_and_vroom', 'Accordion',
       'Acoustic_guitar', 'Applause', 'Bark', 'Bass_drum', 'Bass_guitar',
       'Bathtub_(filling_or_washing)', 'Bicycle_bell',
       'Burping_and_eructation', 'Bus', 'Buzz', 'Car_passing_by', 'Cheering',
       'Chewing_and_mastication', 'Child_speech_and_kid_speaking',
       'Chink_and_clink', 'Chirp_and_tweet', 'Church_bell', 'Clapping',
       'Computer_keyboard', 'Crackle', 'Cricket', 'Crowd',
       'Cupboard_open_or_close', 'Cutlery_and_silverware',
       'Dishes_and_pots_and_pans', 'Drawer_open_or_close', 'Drip',
       'Electric_guitar', 'Fart', 'Female_singing',
       'Female_speech_and_woman_speaking', 'Fill_(with_liquid)',
       'Finger_snapping', 'Frying_(food)', 'Gasp', 'Glockenspiel', 'Gong',
       'Gurgling', 'Harmonica', 'Hi-hat', 'Hiss', 'Keys_jangling', 'Knock',
       'Male_singing', 'Male_speech_and_man_speaking', 'Marimba_and_xylophone',
       'Mechanical_fan', 'Meow', 'Microwave_oven', 'Mo

In [7]:
# Raise numpy's summarisation threshold to 1168 elements (the distinct-label count
# printed earlier) so the array of unique label strings is not abbreviated with "...".
# NOTE(review): set_printoptions changes numpy's global print settings for the whole kernel.
np.set_printoptions(threshold=1168)
train['labels'].unique()

array(['Bathtub_(filling_or_washing)', 'Motorcycle',
       'Marimba_and_xylophone,Glockenspiel',
       'Water_tap_and_faucet,Sink_(filling_or_washing)', 'Raindrop',
       'Bass_guitar', 'Raindrop,Trickle_and_dribble',
       'Strum,Acoustic_guitar', 'Bass_drum,Hi-hat', 'Harmonica', 'Slam',
       'Scissors', 'Cheering,Crowd', 'Crackle', 'Purr', 'Crowd', 'Yell',
       'Female_singing', 'Printer', 'Finger_snapping', 'Microwave_oven',
       'Hi-hat', 'Run', 'Bass_drum', 'Squeak', 'Acoustic_guitar',
       'Male_speech_and_man_speaking', 'Tap', 'Microwave_oven,Buzz',
       'Water_tap_and_faucet,Frying_(food)', 'Fart',
       'Race_car_and_auto_racing', 'Frying_(food)',
       'Motorcycle,Microwave_oven', 'Walk_and_footsteps', 'Buzz',
       'Accelerating_and_revving_and_vroom', 'Stream', 'Male_singing',
       'Chirp_and_tweet', 'Clapping', 'Mechanical_fan',
       'Frying_(food),Dishes_and_pots_and_pans,Cutlery_and_silverware',
       'Mechanical_fan,Buzz', 'Female_speech_and_woman_

In [8]:
# Same as with the curated set, we'll ignore multi-labelled rows for now.
# Unlike the curated set, though, this removes a sizeable share of the noisy data.
# A row is kept only when its entire `labels` string equals one of the class
# columns of `test`.
# `.copy()` makes the result an independent frame, so adding columns to `train`
# later does not write into a slice (pandas SettingWithCopyWarning).
train = train[train['labels'].isin(test.columns[1:])].copy()
print(len(train))

16566


In [9]:
# Distinct labels left after filtering: fewer than 7% of the original distinct label
# strings (80 of 1168), yet the kept rows are more than 83% of all clips (16566 of 19815).
len(set(train['labels']))

80

In [10]:
# Show one random row of the sample submission: a file name plus one column per class.
test.sample(1)

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
1018,3c35237b.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Count rows per label. The single resulting count column is renamed to `counts`;
# the number of groups is what Config later uses as its default `n_classes`.
category_group = train.groupby(['labels']).count()
category_group.columns = ['counts']
print(len(category_group))

80


In [12]:
# The Configuration object stores those learning parameters that are shared
# between data generators models, and training functions. Anything that is
# global as far as the training is concerned can become the part of Configuration object.

class Config:
    """Bundle of settings shared by data preparation, model builders and training.

    Args:
        sampling_rate: sample rate (Hz) that audio is loaded at.
        audio_duration: clip length in seconds.
        n_classes: number of output classes; defaults to the number of label groups
            in ``category_group`` at the time the class is defined.
        use_mfcc: if True, ``dim`` describes an MFCC input; otherwise a raw waveform.
        n_folds: number of cross-validation folds.
        learning_rate: learning rate passed to the optimizer.
        max_epochs: upper bound on training epochs.
        n_mfcc: number of MFCC coefficients.

    Derived attributes:
        audio_length: ``sampling_rate * audio_duration`` (samples per clip).
        dim: per-example input shape, ``(n_mfcc, n_frames, 1)`` for MFCC input or
            ``(audio_length, 1)`` for raw audio.
    """

    def __init__(self,
                 sampling_rate=16000,
                 audio_duration=2,
                 n_classes=len(category_group),
                 use_mfcc=False,
                 n_folds=10,
                 learning_rate=0.0001,
                 max_epochs=50,
                 n_mfcc=20):
        # Audio / feature settings.
        self.sampling_rate, self.audio_duration = sampling_rate, audio_duration
        self.use_mfcc, self.n_mfcc = use_mfcc, n_mfcc

        # Model / training settings.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Derived sizes.
        samples_per_clip = sampling_rate * audio_duration
        self.audio_length = samples_per_clip
        if not use_mfcc:
            self.dim = (samples_per_clip, 1)
        else:
            # One frame per 512 samples plus one; presumably this mirrors the default
            # hop length of the MFCC extraction used downstream -- TODO confirm.
            n_frames = 1 + int(np.floor(samples_per_clip/512))
            self.dim = (n_mfcc, n_frames, 1)

In [13]:
# The DataGenerator class inherits from keras.utils.Sequence .
# It is useful for preprocessing and feeding the data to a Keras model.

# Note: Sequence are a safer way to do multiprocessing.
# This structure guarantees that the network will only train once on each
# sample per epoch which is not the case with generators.

class DataGenerator(Sequence):
    """Batch loader that reads audio files from disk and turns them into model inputs.

    Each clip is loaded with librosa at ``config.sampling_rate``, randomly cropped or
    zero-padded to ``config.audio_length`` samples, and then either converted to MFCCs
    (when ``config.use_mfcc`` is true) or passed through ``preprocessing_fn``.

    Args:
        config: object providing ``dim``, ``audio_length``, ``sampling_rate``,
            ``use_mfcc``, ``n_mfcc`` and ``n_classes``.
        data_dir: directory containing the audio files.
        list_IDs: sequence of file names relative to ``data_dir``.
        labels: optional mapping from file name to integer class index. When ``None``
            the generator yields inputs only (test time).
        batch_size: maximum number of clips per batch.
        preprocessing_fn: callable applied to the 1-D waveform when MFCCs are not used.
    """

    def __init__(self,
                 config,
                 data_dir,
                 list_IDs,
                 labels=None,
                 batch_size=64,
                 preprocessing_fn=lambda x: x):
        self.config = config
        self.data_dir = data_dir
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.preprocessing_fn = preprocessing_fn
        self.on_epoch_end()
        self.dim = self.config.dim

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index``: ``(X, y)`` with labels, otherwise ``X`` only."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order to sequential; no shuffling is performed here."""
        self.indexes = np.arange(len(self.list_IDs))

    def __data_generation(self, list_IDs_temp):
        """Load, crop/pad and featurise the given files; return ``X`` or ``(X, y)``."""
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.dim))

        input_length = self.config.audio_length
        for i, ID in enumerate(list_IDs_temp):
            # os.path.join works whether or not data_dir ends with a path separator.
            file_path = os.path.join(self.data_dir, ID)

            # Read and resample the audio.
            data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate, res_type='kaiser_fast')

            if len(data) > input_length:
                # Clip is too long: keep a random window of input_length samples.
                offset = np.random.randint(len(data) - input_length)
                data = data[offset:(input_length+offset)]
            else:
                # Clip is too short (or exact): zero-pad, placing it at a random offset.
                pad_total = input_length - len(data)
                offset = np.random.randint(pad_total) if pad_total > 0 else 0
                data = np.pad(data, (offset, pad_total - offset), "constant")

            if self.config.use_mfcc:
                # Pass the signal as `y=`: newer librosa releases accept it only by keyword.
                data = librosa.feature.mfcc(y=data, sr=self.config.sampling_rate, n_mfcc=self.config.n_mfcc)
                data = np.expand_dims(data, axis=-1)
            else:
                data = self.preprocessing_fn(data)[:, np.newaxis]
            X[i,] = data

        if self.labels is None:
            return X

        # Look up the integer class of every file and one-hot encode the batch.
        y = np.empty(cur_batch_size, dtype=int)
        for i, ID in enumerate(list_IDs_temp):
            y[i] = self.labels[ID]
        return X, to_categorical(y, num_classes=self.config.n_classes)

In [14]:
def get_2d_dummy_model(config):
    """Build and compile a minimal baseline: global max pooling followed by softmax.

    The input shape is ``(config.dim[0], config.dim[1], 1)``, the output has
    ``config.n_classes`` units, and the model is compiled with Adam at
    ``config.learning_rate``, categorical cross-entropy and the 'acc' metric.
    """
    n_rows, n_cols = config.dim[0], config.dim[1]

    inputs = Input(shape=(n_rows, n_cols, 1))
    pooled = GlobalMaxPool2D()(inputs)
    probabilities = Dense(config.n_classes, activation=softmax)(pooled)

    model = models.Model(inputs=inputs, outputs=probabilities)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model


def get_2d_conv_model(config):
    """Build and compile the 2D convolutional classifier.

    Architecture: four identical stages of Conv2D(32 filters, 4x10 kernel, "same"
    padding) -> BatchNorm -> ReLU -> MaxPool, then Flatten -> Dense(64) -> BatchNorm
    -> ReLU -> Dense(``config.n_classes``, softmax). Compiled with Adam at
    ``config.learning_rate``, categorical cross-entropy and the 'acc' metric.
    """
    inputs = Input(shape=(config.dim[0], config.dim[1], 1))

    # Four identical convolutional stages, created in the same order as before.
    features = inputs
    for _ in range(4):
        features = Convolution2D(32, (4,10), padding="same")(features)
        features = BatchNormalization()(features)
        features = Activation("relu")(features)
        features = MaxPool2D()(features)

    # Classification head.
    features = Flatten()(features)
    features = Dense(64)(features)
    features = BatchNormalization()(features)
    features = Activation("relu")(features)
    probabilities = Dense(config.n_classes, activation=softmax)(features)

    model = models.Model(inputs=inputs, outputs=probabilities)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

In [15]:
# Labels are still raw strings here; they must be converted to integer indices
# before they can be one-hot encoded for training.
train.head()

Unnamed: 0,fname,labels
0,00097e21.wav,Bathtub_(filling_or_washing)
1,000b6cfb.wav,Motorcycle
4,0019adae.wav,Raindrop
5,001b819d.wav,Bass_guitar
9,0020becb.wav,Harmonica


In [16]:
# Class names in order of first appearance; the position in LABELS is the integer
# class index used by the model and when decoding predictions.
LABELS = list(train['labels'].unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name. The non-inplace form returns a new frame, and the
# guard makes the cell safe to re-run (a second set_index("fname") would raise KeyError).
if "fname" in train.columns:
    train = train.set_index("fname")
if "fname" in test.columns:
    test = test.set_index("fname")

# Every label is a key of label_idx by construction, so the mapping is total.
train["label_idx"] = train['labels'].map(label_idx)

# Quick-run mode: keep only the first 2000 rows of each frame.
if not COMPLETE_RUN:
    train = train[:2000]
    test = test[:2000]

In [17]:
# After indexing by file name: raw labels alongside their integer class indices.
train.head()

Unnamed: 0_level_0,labels,label_idx
fname,Unnamed: 1_level_1,Unnamed: 2_level_1
00097e21.wav,Bathtub_(filling_or_washing),0
000b6cfb.wav,Motorcycle,1
0019adae.wav,Raindrop,2
001b819d.wav,Bass_guitar,3
0020becb.wav,Harmonica,4


In [18]:
# Full run: 10 folds with a 0.001 learning rate. Quick run: 2 folds, a single epoch
# and Config's default learning rate. Both use 2-second clips loaded at 44.1 kHz
# and 40 MFCC coefficients.
if COMPLETE_RUN:
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=10,
                    learning_rate=0.001, use_mfcc=True, n_mfcc=40)
else:
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=2,
                    max_epochs=1, use_mfcc=True, n_mfcc=40)

In [19]:
def prepare_data(df, config, data_dir):
    """Load every clip listed in ``df.index`` and return its MFCC features.

    Each file is loaded at ``config.sampling_rate``, randomly cropped or zero-padded
    to ``config.audio_length`` samples, and converted to ``config.n_mfcc`` MFCCs.

    Args:
        df: DataFrame whose index holds the audio file names.
        config: object providing ``dim``, ``audio_length``, ``sampling_rate`` and ``n_mfcc``.
        data_dir: directory containing the audio files.

    Returns:
        Array of shape ``(len(df), config.dim[0], config.dim[1], 1)``.
    """
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.audio_length
    for i, fname in enumerate(df.index):
        # os.path.join works whether or not data_dir ends with a path separator.
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.core.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        if len(data) > input_length:
            # Clip is too long: keep a random window of input_length samples.
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:(input_length+offset)]
        else:
            # Clip is too short (or exact): zero-pad, placing it at a random offset.
            pad_total = input_length - len(data)
            offset = np.random.randint(pad_total) if pad_total > 0 else 0
            data = np.pad(data, (offset, pad_total - offset), "constant")

        # Pass the signal as `y=`: newer librosa releases accept it only by keyword.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        data = np.expand_dims(data, axis=-1)
        X[i,] = data
    return X

In [20]:
%time
X_train_noisy = prepare_data(train, config, data_path + '/train_noisy/')
X_test = prepare_data(test, config, data_path + '/test/')
y_train = to_categorical(train.label_idx.astype('str'), num_classes=config.n_classes)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs


In [21]:
# Normalize data: standardise every feature position with statistics computed on the
# training set only, and apply the same transform to the test set.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# A position that is constant across the training set has std == 0; dividing by it
# would produce NaN/inf, so such positions are only mean-centred.
std = np.where(std == 0, 1.0, std)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

In [22]:
# Training 2D Conv on MFCC

PREDICTION_FOLDER = "predictions_2d_conv"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair and is safe on re-runs.
os.makedirs(PREDICTION_FOLDER, exist_ok=True)
# Start from an empty TensorBoard log directory so curves from earlier runs do not mix in.
if os.path.exists('logs/' + PREDICTION_FOLDER):
    shutil.rmtree('logs/' + PREDICTION_FOLDER)

skf = StratifiedKFold(n_splits=config.n_folds)

for i, (train_split, val_split) in enumerate(skf.split(train.index, train.label_idx)):
    # Drop the previous fold's graph so each fold starts from a fresh model.
    K.clear_session()
    X, y, X_val, y_val = X_train[train_split], y_train[train_split], X_train[val_split], y_train[val_split]
    # Only the weights are reloaded below, so checkpoint weights only: the file omits
    # the optimizer state and is smaller than a full-model checkpoint.
    checkpoint = ModelCheckpoint('best_%d.h5'%i, monitor='val_loss', verbose=1,
                                 save_best_only=True, save_weights_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%i'%i, write_graph=True)
    callbacks_list = [checkpoint, early, tb]
    print("#"*50)
    print("Fold: ", i)
    model = get_2d_conv_model(config)
    history = model.fit(X, y, validation_data=(X_val, y_val), callbacks=callbacks_list, 
                        batch_size=64, epochs=config.max_epochs)
    # Restore the best (lowest val_loss) weights seen during this fold.
    model.load_weights('best_%d.h5'%i)

    # Save train predictions (for the whole training set, not just this fold's split)
    predictions = model.predict(X_train, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)

    # Save test predictions
    predictions = model.predict(X_test, batch_size=64, verbose=1)
    np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)

    # Make a submission file: the three most probable labels per clip, space-separated
    top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test['label'] = predicted_labels
    test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)

##################################################
Fold:  0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 14872 samples, validate on 1694 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 3.81063, saving model to best_0.h5
Epoch 2/50

Epoch 00002: val_loss improved from 3.81063 to 3.56456, saving model to best_0.h5
Epoch 3/50

Epoch 00003: val_loss improved from 3.56456 to 3.36451, saving model to best_0.h5
Epoch 4/50

Epoch 00004: val_loss did not improve from 3.36451
Epoch 5/50

Epoch 00005: val_loss improved from 3.36451 to 3.23346, saving model to best_0.h5
Epoch 6/50

Epoch 00006: val_loss improved from 3.23346 to 3.20686, saving model to best_0.h5
Epoch 7/50

Epoch 00007: val_loss did not improve from 3.20686
Epoch 8/50

Epoch 00008: val_loss improved from 3.20686 to 3.14198, saving model to best_0.h5
Epoch 9/50

Epoch 00009: val_loss did not improve from 3.14198
Epoch 10/50

Epoch 000


Epoch 00012: val_loss did not improve from 3.14119
Epoch 13/50

Epoch 00013: val_loss did not improve from 3.14119
Epoch 14/50

Epoch 00014: val_loss did not improve from 3.14119
##################################################
Fold:  3
Train on 14896 samples, validate on 1670 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 3.70351, saving model to best_3.h5
Epoch 2/50

Epoch 00002: val_loss improved from 3.70351 to 3.26606, saving model to best_3.h5
Epoch 3/50

Epoch 00003: val_loss did not improve from 3.26606
Epoch 4/50

Epoch 00004: val_loss improved from 3.26606 to 3.21761, saving model to best_3.h5
Epoch 5/50

Epoch 00005: val_loss improved from 3.21761 to 3.16506, saving model to best_3.h5
Epoch 6/50

Epoch 00006: val_loss improved from 3.16506 to 3.15971, saving model to best_3.h5
Epoch 7/50

Epoch 00007: val_loss did not improve from 3.15971
Epoch 8/50

Epoch 00008: val_loss did not improve from 3.15971
Epoch 9/50

Epoch 00009: val_loss did not improve from 3

OSError: Unable to create file (file write failed: time = Mon May 13 20:58:43 2019
, filename = 'best_4.h5', file descriptor = 47, errno = 28, error message = 'No space left on device', buf = 0x55b0f39926c0, total write size = 96, bytes this sub-write = 96, bytes actually written = 18446744073709551615, offset = 0)

In [None]:
# Ensembling 2D Conv Predictions

# Load the saved test predictions of every fold that produced a file. A missing file
# (for example because training stopped early) is reported and skipped.
pred_list = []
for i in range(config.n_folds):
    pred_path = "./predictions_2d_conv/test_predictions_%d.npy"%i
    if os.path.exists(pred_path):
        pred_list.append(np.load(pred_path))
    else:
        print("Skipping fold %d: %s not found" % (i, pred_path))
if not pred_list:
    raise FileNotFoundError("No fold predictions found in ./predictions_2d_conv/")
print(len(pred_list))

# Geometric mean of the per-fold probabilities, computed in float64 log space.
# A running product of many small probabilities can underflow to 0 and create
# spurious ties in the ranking. log(0) = -inf is allowed on purpose so an exact
# zero still yields a geometric mean of 0.
with np.errstate(divide='ignore'):
    log_mean = np.mean([np.log(pred.astype(np.float64)) for pred in pred_list], axis=0)
prediction = np.exp(log_mean)

# Make a submission file
top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test = pd.read_csv(data_path + '/sample_submission.csv')
test['label'] = predicted_labels
test[['fname', 'label']].to_csv("2d_conv_ensembled_noisy_submission.csv", index=False)

4

The history saving thread hit an unexpected error (OperationalError('database or disk is full',)).History will not be written to the database.
