In [0]:
import numpy as np
import os
import shutil
import IPython
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
import librosa
import numpy as np
import scipy
from keras import losses, models, optimizers
from keras.activations import relu, softmax
import keras.backend as K
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Input , Convolution2D, Conv1D , GlobalAveragePooling2D, BatchNormalization, Flatten,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation , Dense , CuDNNLSTM , CuDNNGRU , Bidirectional , Concatenate , Add)
from keras.utils import Sequence, to_categorical
from keras import backend as K
from keras.regularizers import l2
%matplotlib inline
matplotlib.style.use('ggplot')

In [9]:
# Training metadata for the Freesound audio-tagging competition; the later
# cells read its `fname` and `label` columns.
train = pd.read_csv(
    filepath_or_buffer=".kaggle/competitions/freesound-audio-tagging/train.csv"
)
# The sample submission doubles as the list of test clips to predict.
test = pd.read_csv(
    filepath_or_buffer=".kaggle/competitions/freesound-audio-tagging/sample_submission.csv"
)

In [10]:
class Config(object):
    """Bundle of audio, feature and training hyper-parameters.

    Besides storing the constructor arguments verbatim, two derived
    attributes are computed:

    * ``audio_length`` -- ``sampling_rate * audio_duration`` (samples per clip).
    * ``dim`` -- shape of one model input: ``(n_mfcc, n_frames, 1)`` when
      ``use_mfcc`` is true, otherwise ``(audio_length, 1)``.
    """

    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20):
        # Audio front-end.
        self.sampling_rate, self.audio_duration = sampling_rate, audio_duration
        # Feature extraction.
        self.use_mfcc, self.n_mfcc = use_mfcc, n_mfcc
        # Task / training setup.
        self.n_classes, self.n_folds = n_classes, n_folds
        self.learning_rate, self.max_epochs = learning_rate, max_epochs

        self.audio_length = self.sampling_rate * self.audio_duration
        # One frame per 512 samples plus one; 512 is presumably the hop length
        # of the MFCC extractor used downstream -- TODO confirm.
        self.dim = (
            (self.n_mfcc, 1 + int(np.floor(self.audio_length/512)), 1)
            if self.use_mfcc
            else (self.audio_length, 1)
        )

In [74]:
def resnet_block(identity , filters , downsample = True , proj = True):
  """Append one two-convolution residual unit to the tensor `identity`.

  Args:
    identity: input Keras tensor; also the source of the shortcut branch.
    filters: number of output channels of every convolution in the unit.
    downsample: only honoured together with `proj`; the first 3x3 convolution
      and the 1x1 shortcut convolution then both use stride 2.
    proj: when true, the shortcut goes through a 1x1 convolution followed by
      batch normalisation and ReLU so that it has `filters` channels.

  Returns:
    The ReLU-activated sum of the main branch and the shortcut.

  NOTE(review): with `proj=True, downsample=False` the main branch is built on
  top of the *projected* shortcut, whereas with both flags true it is built on
  the raw input. With `downsample=True, proj=False` the `downsample` flag has
  no effect. Both look unintentional -- confirm before changing, since a fix
  alters the architecture.
  """

  def conv(kernel_size , strides):
    # All convolutions share 'same' padding and the same L2 weight penalty.
    return Convolution2D(filters , kernel_size = kernel_size , strides = strides ,
                         padding = 'same' , kernel_regularizer = l2(0.0001))

  def bn_relu(tensor):
    tensor = BatchNormalization()(tensor)
    return Activation('relu')(tensor)

  if downsample and proj:
    # Strided main branch and strided, projected shortcut, both from the input.
    x = conv(3 , 2)(identity)
    identity = bn_relu(conv(1 , 2)(identity))
  elif proj:
    # Project first; the main branch then starts from the projected tensor.
    identity = bn_relu(conv(1 , 1)(identity))
    x = conv(3 , 1)(identity)
  else:
    # Plain unit: the shortcut is the untouched input.
    x = conv(3 , 1)(identity)

  x = bn_relu(x)
  x = conv(3 , 1)(x)
  x = BatchNormalization()(x)

  merged = Add()([x , identity])
  return Activation('relu')(merged)

In [119]:
def conv_model(config):
    """Build and compile the residual 2-D CNN classifier.

    Args:
        config: object exposing `dim` (the first two entries are used as the
            input height and width), `n_classes` and `learning_rate`.

    Returns:
        A compiled Keras model: a 7x7 convolution stem, a max-pool, three
        stages of three residual units (16, 32 and 64 filters), global average
        pooling and a softmax layer with `config.n_classes` outputs.
    """
    n_outputs = config.n_classes
    height , width = config.dim[0] , config.dim[1]

    inp = Input(shape=(height , width , 1))

    # Stem: the convolution and the pooling both stride only along the second axis.
    stem = Convolution2D(8, (7,7), strides = (1 , 2) , padding = "same" , kernel_regularizer = l2(0.0001))
    x = stem(inp)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPool2D(pool_size = 3 , strides = (1,2) , padding = 'same')(x)

    # (filters, downsample) for the projecting unit that opens each stage;
    # two plain units with the same width follow it.
    stages = ((16 , False) , (32 , True) , (64 , True))
    for n_filters , shrink in stages:
        x = resnet_block(x , n_filters , downsample = shrink)
        for _ in range(2):
            x = resnet_block(x , n_filters , downsample = False , proj = False)

    x = GlobalAveragePooling2D()(x)
    out = Dense(n_outputs, activation=softmax , kernel_regularizer = l2(0.0001))(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(lr = config.learning_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=['acc'])
    return model

In [120]:
# Experiment settings: 2-second clips at 44.1 kHz turned into 40 MFCCs,
# trained with 5-fold cross-validation.
config = Config(
    sampling_rate=44100,
    audio_duration=2,
    use_mfcc=True,
    n_mfcc=40,
    n_folds=5,
    learning_rate=0.001,
)

In [121]:
# Throw-away instance, built only to inspect layer shapes and parameter counts.
mymodel = conv_model(config=config)
mymodel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 40, 173, 1)   0                                            
__________________________________________________________________________________________________
conv2d_23 (Conv2D)              (None, 40, 87, 8)    400         input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_23 (BatchNo (None, 40, 87, 8)    32          conv2d_23[0][0]                  
__________________________________________________________________________________________________
activation_23 (Activation)      (None, 40, 87, 8)    0           batch_normalization_23[0][0]     
__________________________________________________________________________________________________
max_poolin

In [22]:
# Class names in order of first appearance; the position in LABELS is the
# integer class id used for the one-hot targets and for decoding predictions.
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name. Guard on the current index name so the cell
# can be re-run: an unconditional set_index raises KeyError('fname') the
# second time, because the column has already been moved into the index.
if train.index.name != "fname":
    train.set_index("fname", inplace=True)
if test.index.name != "fname":
    test.set_index("fname", inplace=True)

train["label_idx"] = train.label.apply(lambda x: label_idx[x])

In [23]:
def _fit_to_length(data, input_length):
    """Randomly crop or zero-pad a 1-D signal to exactly `input_length` samples."""
    n_samples = len(data)
    if n_samples > input_length:
        # Too long: keep a window starting at a random offset.
        offset = np.random.randint(n_samples - input_length)
        return data[offset:(input_length + offset)]
    if n_samples < input_length:
        # Too short: place the clip at a random offset inside the zero padding.
        offset = np.random.randint(input_length - n_samples)
    else:
        offset = 0
    return np.pad(data, (offset, input_length - n_samples - offset), "constant")


def prepare_data(df, config, data_dir):
    """Load every clip listed in `df.index` and return its MFCC features.

    Args:
        df: DataFrame whose index holds the audio file names.
        config: object exposing `dim`, `audio_length`, `sampling_rate` and
            `n_mfcc`. `dim[0]` x `dim[1]` must equal the MFCC matrix shape
            produced for `audio_length` samples.
        data_dir: directory containing the audio files (a trailing separator
            is optional).

    Returns:
        Array of shape `(len(df), config.dim[0], config.dim[1], 1)`.

    The crop/pad offset is drawn from NumPy's global random state, so results
    vary between runs unless that state is seeded by the caller.
    """
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1] , 1))
    input_length = config.audio_length
    for i, fname in enumerate(df.index):
        # os.path.join rather than string concatenation, so `data_dir` works
        # with or without a trailing separator.
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.core.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        data = _fit_to_length(data, input_length)

        # Pass the signal as `y=`: newer librosa releases make it keyword-only,
        # and older releases accept the keyword as well.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        X[i,] = np.expand_dims(data, axis=-1)
    return X

In [24]:
# Feature extraction. The test set is processed before the training set;
# prepare_data draws random crop/pad offsets, so this order is kept as is.
X_test = prepare_data(df=test, config=config, data_dir='audio_test/')
X_train = prepare_data(df=train, config=config, data_dir='audio_train/')

# One-hot targets derived from the integer class ids.
y_train = to_categorical(train["label_idx"], num_classes=config.n_classes)

In [25]:
# Standardise each feature position using statistics of the training set only;
# the same statistics are then applied to the test set.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A position that is constant across all training clips has std == 0, and
# dividing by it would fill both tensors with NaN/inf. Leave such positions
# unscaled instead (they become exactly 0 for the training data).
std[std == 0] = 1.0

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

In [116]:
def lr_schedular(epoch , lr):
  """Step schedule: divide the learning rate by 10 at epochs 10 and 20.

  Args:
    epoch: epoch index supplied by the scheduler callback (presumably
      0-based, as in Keras -- confirm).
    lr: learning rate currently in use.

  Returns:
    `lr / 10.0` when `epoch` is 10 or 20, otherwise `lr` unchanged.
  """
  drop_epochs = (10 , 20)
  return lr / 10.0 if epoch in drop_epochs else lr

In [117]:
# Keras callback wrapping the step schedule defined in lr_schedular.
# NOTE(review): the training cell builds callbacks_list = [checkpoint, early],
# so this callback is not passed to fit() as the notebook stands.
ls = LearningRateScheduler(schedule=lr_schedular, verbose=2)

In [0]:
# Stratified K-fold training: one model per fold, each writing its train/test
# predictions and a per-fold submission file into PREDICTION_FOLDER.
PREDICTION_FOLDER = "predictions_conv"
if not os.path.exists(PREDICTION_FOLDER):
    os.mkdir(PREDICTION_FOLDER)
# Remove any log directory left behind by a previous run.
if os.path.exists('logs/' + PREDICTION_FOLDER):
    shutil.rmtree('logs/' + PREDICTION_FOLDER)

# Legacy sklearn.cross_validation API: the labels go to the constructor and
# the object itself is iterated to obtain (train_indices, val_indices) pairs.
# No random_state is given, so the shuffled folds differ between runs.
skf = StratifiedKFold(train.label_idx, n_folds=config.n_folds , shuffle = True)
for i, (train_split, val_split) in enumerate(skf):
    # Drop the previous fold's graph and weights before building a new model.
    K.clear_session()
    X, y, X_val, y_val = X_train[train_split], y_train[train_split], X_train[val_split], y_train[val_split]
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = ModelCheckpoint('best_%d.h5'%i, monitor='val_loss', verbose=1, save_best_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=8)
    # NOTE(review): `tb` is created but is not part of callbacks_list below,
    # and neither is the `ls` learning-rate scheduler -- confirm intent.
    tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%i'%i, write_graph=True)
    callbacks_list = [checkpoint, early]
    print("#"*50)
    print("Fold: ", i)
    model = conv_model(config)
    history = model.fit(X, y, validation_data=(X_val, y_val), callbacks=callbacks_list, 
                        batch_size=64, epochs=config.max_epochs , verbose = 2)
    # Restore the best checkpoint rather than the weights of the last epoch.
    model.load_weights('best_%d.h5'%i)

    # Save train predictions
    # (computed on the whole of X_train, i.e. including the rows this fold
    # was fitted on, not only its validation split)
    predictions = model.predict(X_train, batch_size=64, verbose=2)
    np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)

    # Save test predictions
    predictions = model.predict(X_test, batch_size=64, verbose=2)
    np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)

    # Make a submission file
    # The three highest-scoring class names per clip, joined by spaces.
    top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test['label'] = predicted_labels
    test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)

In [85]:
# Ensemble the per-fold test predictions with a geometric mean.
# Read exactly the folds the training cell wrote (config.n_folds files).
# The previous hard-coded range(10) fails with FileNotFoundError when fewer
# folds were trained, or silently mixes in stale files from an older run.
pred_list = [
    np.load("predictions_conv/test_predictions_%d.npy"%i)
    for i in range(config.n_folds)
]

# Geometric mean: multiply the fold probabilities, then take the n-th root.
prediction = np.ones_like(pred_list[0])
for pred in pred_list:
    prediction = prediction*pred
prediction = prediction**(1./len(pred_list))

# Make a submission file
# The three highest-scoring class names per clip, joined by spaces.
top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
# Re-read the sample submission so that `fname` is a regular column again
# (the label-index cell moved it into the index of the earlier `test` frame).
test = pd.read_csv('.kaggle/competitions/freesound-audio-tagging/sample_submission.csv')
test['label'] = predicted_labels
test[['fname', 'label']].to_csv("submission_conv.csv", index=False)