In [1]:
# Change this to True for full dataset and learning
COMPLETE_RUN = True
data_path = '../data'

In [2]:
import numpy as np
np.random.seed(1001)

import os
import shutil
import warnings

# import IPython
import wave
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold

%matplotlib inline
matplotlib.style.use('ggplot')
warnings.filterwarnings("ignore", category=FutureWarning) 

In [3]:
data_path + "/train_noisy.csv"

'../data/train_noisy.csv'

In [4]:
train = pd.read_csv(data_path + "/train_noisy.csv")
test = pd.read_csv(data_path + "/sample_submission.csv")

In [5]:
train.sample(5)

Unnamed: 0,fname,labels
19604,fcfcbd51.wav,Child_speech_and_kid_speaking
6930,592b3420.wav,Buzz
2531,20cf9968.wav,Shatter
15661,ca27d86b.wav,Stream
5806,4a8f7729.wav,Trickle_and_dribble


In [6]:
test.sample(5)

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
581,23244f49.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114,06604d51.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
438,1a5c4e84.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
732,2d15ca77.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
540,20e285c7.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
print("Number of train examples=", train.shape[0], "  Number of classes=", len(set(train['labels'])))
print("Number of test examples=", test.shape[0], "  Number of classes=", len(set(test.columns[1:])))

Number of train examples= 19815   Number of classes= 1168
Number of test examples= 1120   Number of classes= 80


- note that due to multi-labeld records in train, the number of unique classes is 213 (which is more than the test set's 80)
- For simplicity, we will exclude multi-labeled records in train, so the number of unique label is 74 ( < 80 ).
When buliding a valid model, we must consider this.

In [8]:
train = train[train['labels'].isin(test.columns[1:])]
print(len(train))

16566


In [9]:
(4970 - 4269) / 4970 # # We lose roughly 14% of our data set in order to make this simplification

0.14104627766599598

In [10]:
category_group = train.groupby(['labels']).count()
category_group.columns = ['counts']
print(len(category_group))

80


In [11]:
category_group.sample(5)

Unnamed: 0_level_0,counts
labels,Unnamed: 1_level_1
Writing,175
Tap,248
Male_speech_and_man_speaking,263
Female_speech_and_woman_speaking,266
Bark,257


Reading Audio Files

- Bit-depth = 16: The amplitude of each sample in the audio is one of 2^16 (=65536) possible values.
- Samplig rate = 44.1 kHz: Each second in the audio consists of 44100 samples. So, if the duration of the audio file is 3.2 seconds, the audio will consist of 44100*3.2 = 141120 values.

Audio Length

In [12]:
train['nframes'] = train['fname'].apply(lambda f: wave.open(data_path + '/train_noisy/' + f).getnframes())
test['nframes'] = test['fname'].apply(lambda f: wave.open(data_path + '/test/' + f).getnframes())

In [13]:
train.sample(5)

Unnamed: 0,fname,labels,nframes
17721,e48807e5.wav,Bass_drum,661500
8255,6a35aa6f.wav,Skateboard,661500
11393,92dc74c3.wav,Crowd,661500
12373,9ede9f1b.wav,Gong,661500
2080,1adc2c14.wav,Race_car_and_auto_racing,354304


## Building a Model using Raw Wave

In [14]:
import librosa
import numpy as np
import scipy
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, GlobalMaxPool1D, Input, MaxPool1D, concatenate)
from keras.utils import Sequence, to_categorical

Using TensorFlow backend.


In [27]:
# The Configuration object stores those learning parameters that are shared
# between data generators models, and training functions. Anything that is
# global as far as the training is concerned can become the part of Configuration object.

class Config(object):
    def __init__(self,
                 sampling_rate=16000,
                 audio_duration=2, 
                 n_classes=len(category_group),
                 use_mfcc=False,
                 n_folds=10,
                 learning_rate=0.0001, 
                 max_epochs=50,
                 n_mfcc=20):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.n_folds = n_folds
        self.max_epochs = max_epochs
        self.learning_rate = learning_rate

        self.audio_length = self.sampling_rate * self.audio_duration
        if self.use_mfcc:
            self.dim = (self.n_mfcc, 1 + int(np.floor(self.audio_length/512)), 1)
        else:
            self.dim = (self.audio_length, 1)

In [16]:
# The DataGenerator class inherits from keras.utils.Sequence .
# It is useful for preprocessing and feeding the data to a Keras model.

# Note: Sequence are a safer way to do multiprocessing.
# This structure guarantees that the network will only train once on each
# sample per epoch which is not the case with generators.

class DataGenerator(Sequence):
    def __init__(self,
                 config,
                 data_dir,
                 list_IDs,
                 labels=None, 
                 batch_size=64,
                 preprocessing_fn=lambda x: x):
        self.config = config
        self.data_dir = data_dir
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.preprocessing_fn = preprocessing_fn
        self.on_epoch_end()
        self.dim = self.config.dim

    def __len__(self):
        # Once initialized with a batch_size, DataGenerator computes the number of batches in an epoch.
        # The __len__ method tells Keras how many batches to draw in each epoch.
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        # The __getitem__ method takes an index (which is the batch number) and
        # returns a batch of the data (both X and y) after calculating the offset.
        # During test time, only X is returned.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        # If we want to perform some action after each epoch (like shuffle the data,
        # or increase the proportion of augmented data), we can use the on_epoch_end method.
        self.indexes = np.arange(len(self.list_IDs))

    def __data_generation(self, list_IDs_temp):
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.dim))

        input_length = self.config.audio_length
        for i, ID in enumerate(list_IDs_temp):
            file_path = self.data_dir + ID
            
            # Read and Resample the audio
            data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate, res_type='kaiser_fast')

            # Random offset / Padding
            if len(data) > input_length:
                max_offset = len(data) - input_length
                offset = np.random.randint(max_offset)
                data = data[offset:(input_length+offset)]
            else:
                if input_length > len(data):
                    max_offset = input_length - len(data)
                    offset = np.random.randint(max_offset)
                else:
                    offset = 0
                data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
                
            # Normalization + Other Preprocessing
            if self.config.use_mfcc:
                data = librosa.feature.mfcc(data, sr=self.config.sampling_rate, n_mfcc=self.config.n_mfcc)
                data = np.expand_dims(data, axis=-1)
            else:
                data = self.preprocessing_fn(data)[:, np.newaxis]
            X[i,] = data

        if self.labels is not None:
            y = np.empty(cur_batch_size, dtype=int)
            for i, ID in enumerate(list_IDs_temp):
                y[i] = self.labels[ID]
            return X, to_categorical(y, num_classes=self.config.n_classes)
        else:
            return X

In [17]:
# Normalization is a crucial preprocessing step.
# The simplest method is rescaling the range of features to scale the range in [0, 1].

def audio_norm(data):
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data-min_data)/(max_data-min_data+1e-6)
    return data - 0.5

In [18]:
# The dummy model is just for debugging purpose.

def get_1d_dummy_model(config):
    
    nclass = config.n_classes
    input_length = config.audio_length
    
    inp = Input(shape=(input_length,1))
    x = GlobalMaxPool1D()(inp)
    out = Dense(nclass, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    opt = optimizers.Adam(config.learning_rate)

    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model

In [19]:
# Our 1D Conv model is fairly deep and is trained
# using Adam Optimizer with a learning rate of 0.0001

def get_1d_conv_model(config):
    
    nclass = config.n_classes
    input_length = config.audio_length
    
    inp = Input(shape=(input_length,1))
    x = Convolution1D(16, 9, activation=relu, padding="valid")(inp)
    x = Convolution1D(16, 9, activation=relu, padding="valid")(x)
    x = MaxPool1D(16)(x)
    x = Dropout(rate=0.1)(x)
    
    x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
    x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
    x = MaxPool1D(4)(x)
    x = Dropout(rate=0.1)(x)
    
    x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
    x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
    x = MaxPool1D(4)(x)
    x = Dropout(rate=0.1)(x)
    
    x = Convolution1D(256, 3, activation=relu, padding="valid")(x)
    x = Convolution1D(256, 3, activation=relu, padding="valid")(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(rate=0.2)(x)

    x = Dense(64, activation=relu)(x)
    x = Dense(1028, activation=relu)(x)
    out = Dense(nclass, activation=softmax)(x)

    model = models.Model(inputs=inp, outputs=out)
    opt = optimizers.Adam(config.learning_rate)

    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model

In [20]:
# It is important to convert raw labels to integer indices
train.head()

Unnamed: 0,fname,labels,nframes
0,00097e21.wav,Bathtub_(filling_or_washing),661500
1,000b6cfb.wav,Motorcycle,661500
4,0019adae.wav,Raindrop,661500
5,001b819d.wav,Bass_guitar,661500
9,0020becb.wav,Harmonica,687104


In [21]:
LABELS = list(train['labels'].unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

train.set_index("fname", inplace=True)
test.set_index("fname", inplace=True)

train["label_idx"] = train['labels'].apply(lambda x: label_idx[x])

if not COMPLETE_RUN:
    train = train[:2000]
    test = test[:2000]

In [22]:
# raw labels with integer indices
train.head()

Unnamed: 0_level_0,labels,nframes,label_idx
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00097e21.wav,Bathtub_(filling_or_washing),661500,0
000b6cfb.wav,Motorcycle,661500,1
0019adae.wav,Raindrop,661500,2
001b819d.wav,Bass_guitar,661500,3
0020becb.wav,Harmonica,687104,4


10-Fold Training

In [29]:
config = Config(sampling_rate=16000, audio_duration=2, n_folds=10, learning_rate=0.001)

if not COMPLETE_RUN:
    config = Config(sampling_rate=100, audio_duration=1, n_folds=2, max_epochs=1)

In [None]:
PREDICTION_FOLDER = "predictions_1d_conv"
if not os.path.exists(PREDICTION_FOLDER):
    os.mkdir(PREDICTION_FOLDER)
if os.path.exists('logs/' + PREDICTION_FOLDER):
    shutil.rmtree('logs/' + PREDICTION_FOLDER)

# We use from sklearn.model_selection.StratifiedKFold for splitting the trainig data into 10 folds.
skf = StratifiedKFold(n_splits=config.n_folds)

for i, (train_split, val_split) in enumerate(skf.split(train.index, train.label_idx)):
    train_set = train.iloc[train_split]
    val_set = train.iloc[val_split]
    
    # We use some Keras callbacks to monitor the training.
    # ModelCheckpoint saves the best weight of our model (using validation data).
    # We use this weight to make test predictions.
    checkpoint = ModelCheckpoint('best_noisy_%d.h5'%i, monitor='val_loss', verbose=1, save_best_only=True)
    # EarlyStopping stops the training once validation loss ceases to decrease
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    # TensorBoard helps us visualize training and validation loss and accuracy.
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%d'%i, write_graph=True)

    callbacks_list = [checkpoint, early, tb]
    print("\nFold: ", i)
    if COMPLETE_RUN:
        model = get_1d_conv_model(config)
    else:
        model = get_1d_dummy_model(config)

    # We fit the model using DataGenerator for training and validation splits.
    train_generator = DataGenerator(config, data_path + '/train_noisy/', train_set.index, 
                                    train_set.label_idx, batch_size=64,
                                    preprocessing_fn=audio_norm)
    val_generator = DataGenerator(config, data_path + '/train_noisy/', val_set.index, 
                                  val_set.label_idx, batch_size=64,
                                  preprocessing_fn=audio_norm)
    
    history = model.fit_generator(train_generator, callbacks=callbacks_list, validation_data=val_generator,
                                  epochs=config.max_epochs, use_multiprocessing=True, max_queue_size=20)
    
#     model.load_weights('../working/best_%d.h5'%i)
    
    # Save train predictions
    train_generator = DataGenerator(config, data_path + '/train_noisy/', train.index, batch_size=128,
                                    preprocessing_fn=audio_norm)
    predictions = model.predict_generator(train_generator, use_multiprocessing=True, 
                                          max_queue_size=20, verbose=1)
    
    # We get both training and test predictions and save them as .npy format.
    # We also generate a submission file. For 10-fold CV, the number of prediction files should be 10.
    # We will ensemble these predictions later.
    np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)
    
    # Save test predictions
    test_generator = DataGenerator(config, data_path + '/test/', test.index, batch_size=128,
                                    preprocessing_fn=audio_norm)
    predictions = model.predict_generator(test_generator, use_multiprocessing=True, 
                                          max_queue_size=20, verbose=1)
    np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)
    
    # Make a submission file
    top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test['label'] = predicted_labels
    test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)


Fold:  0
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50

In [None]:
# predictions are saved as following.
os.listdir('./predictions_1d_conv/')

In [None]:
# Ensembling 1D Conv Predictions
# Now that we have trained our model, it is time average the predictions of X-folds.
# We will try Geometric Mean averaging.

pred_list = []
for i in range(config.n_folds):
    pred_list.append(np.load("./predictions_1d_conv/noisy_test_predictions_%d.npy"%i))
prediction = np.ones_like(pred_list[0])

# calculate geometric mean
for pred in pred_list:
    prediction = prediction*pred
prediction = prediction**(1./len(pred_list))

# Make a submission file
top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test = pd.read_csv(data_path + '/sample_submission.csv')
test['label'] = predicted_labels
test[['fname', 'label']].to_csv("1d_conv_noisy_ensembled_submission.csv", index=False)