<a href="https://colab.research.google.com/github/QColeman97/AudioTagger/blob/master/AudioTagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!pip install sox



In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns # for data visualization 

import IPython
import IPython.display as ipd #To play sound in notebook
import scipy as sci
import wave 
from pathlib import Path

from scipy.fftpack import fft #Fast Fourier Transformation 
from scipy.io import wavfile 

import librosa 

In [26]:
import os 
print(os.listdir("drive/My Drive/CSC490Final-AudioTagger"))

['FSDKaggle2018.audio_test', 'FSDKaggle2018.audio_train', 'test_post_competition_scoring_clips.csv', 'train_post_competition.csv']


In [27]:
INPUT_PATH ="drive/My Drive/CSC490Final-AudioTagger/"
audio_train_file = (INPUT_PATH + "FSDKaggle2018.audio_train")
audio_test_file = (INPUT_PATH + "FSDKaggle2018.audio_test")
train= pd.read_csv(INPUT_PATH + "train_post_competition.csv")


#scipy.wavfile.read returns rate of wave, and # of data read
filename = '/001ca53d.wav'
sample_rate, samples = wavfile.read(str(audio_train_file) + filename)
print(samples)
print(train.shape)

[-33 -32 -34 ...  -1  -1  -1]
(9473, 5)


In [28]:
print(train.head())

          fname         label  ...  freesound_id             license
0  00044347.wav        Hi-hat  ...         28739         Attribution
1  001ca53d.wav     Saxophone  ...        358827         Attribution
2  002d256b.wav       Trumpet  ...         10897  Creative Commons 0
3  0033e230.wav  Glockenspiel  ...        325017         Attribution
4  00353774.wav         Cello  ...        195688         Attribution

[5 rows x 5 columns]


# DATA PREPROCESSING 
Cut out silent parts 

Normalize wave form

In [29]:
df_train = pd.read_csv(INPUT_PATH +'/train_post_competition.csv')
df_test = pd.read_csv(INPUT_PATH + '/test_post_competition_scoring_clips.csv')
labels = df_train.label.unique()
label2int = {label:index for index, label in enumerate(labels)}
num_class = len(labels)
#Indices of manually verified training data
verifed_train = np.array(df_train[df_train.manually_verified == 1].index)
#array of labels in number form (0 = hi-hat, 1 = saxophone, etc)
plain_y_train = np.array([label2int[label] for label in df_train.label])

#np.set_printoptions(threshold=np.inf)
plain_y_train

array([ 0,  1,  2, ..., 12, 20, 17])

In [0]:
'''Two approaches:
  1) LH uses highest feature with only beginning sound. Useful info
     are usually in begging part of sample and support. 
     
  2) Splits sample and uses one that are long but with coarser feature. 
     Good for samples with contents in the middle or later part. Perfect 
     for ones that use entire wav file  
'''
confLH, confX = {}, {}
confs = [confLH, confX]
confLH['folder'] = Path('LH')
confX['folder'] = Path('X')

#configs for confLH: highest resolutions

confLH['sampling_rate'] = 44100
confLH['duration'] = 4
confLH['hop_length'] = 882 # 20ms
confLH['fmin'] = 20
confLH['fmax'] = confLH['sampling_rate'] // 2
confLH['n_mels'] = 128
confLH['n_fft'] = confLH['n_mels'] * 20
confLH['audio_split'] = 'head'
confLH['samples'] = confLH['sampling_rate'] * confLH['duration']
confLH['dims'] = (confLH['n_mels'], 1 + 
                  int(np.floor(confLH['samples']/confLH['hop_length'])), 1)


# Approach X uses longer sound, then it uses suppressed 
confX['sampling_rate'] = 26000
confX['duration'] = 6
confX['hop_length'] = 520 # 20ms
confX['fmin'] = 20
confX['fmax'] = confX['sampling_rate'] // 2
confX['n_mels'] = 48
confX['n_fft'] = confX['n_mels'] * 20
confX['audio_split'] = 'dont_crop'
confX['samples'] = confX['sampling_rate'] * confX['duration']
confX['dims'] = (confX['n_mels'], 1 + 
                  int(np.floor(confX['samples']/confX['hop_length'])), 1)




In [0]:
import librosa
import librosa.display

def read_audio(conf, pathname):
    #return audio time series and sampling rate 
    y, sr = librosa.load(pathname, sr=conf['sampling_rate'])
    # trim silence
    if 0 < len(y):
        y, _ = librosa.effects.trim(y) # trim, top_db=default(60)
    # make it unified length to conf.samples
    if len(y) > conf['samples']: # long enough
        if conf['audio_split'] == 'head':
            y = y[0:0+conf['samples']]
    else: # pad blank
        padding = conf['samples'] - len(y)    # add padding at both ends
        offset = padding // 2
        y = np.pad(y, (offset, conf['samples'] - len(y) - offset), 'constant')
    return y

def audio_to_melspectrogram(conf, audio):
    spectrogram = librosa.feature.melspectrogram(audio, 
                                                 sr=conf['sampling_rate'],
                                                 n_mels=conf['n_mels'],
                                                 hop_length=conf['hop_length'],
                                                 n_fft=conf['n_fft'],
                                                 fmin=conf['fmin'],
                                                 fmax=conf['fmax'])
    #convert spectrogram to decibel
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def show_melspectrogram(mels, conf):
    librosa.display.specshow(mels, x_axis='time', y_axis='mel', 
                             sr=conf['sampling_rate'], hop_length=conf['hop_length'],
                            fmin=conf['fmin'], fmax=conf['fmax'])
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log-frequency power spectrogram')
    plt.show()

def read_as_melspectrogram(conf, pathname, debug_display=False):
    x = read_audio(conf, pathname)
    mels = audio_to_melspectrogram(conf, x)
    if debug_display:
        IPython.display.display(IPython.display.Audio(x, rate=conf['sampling_rate']))
        show_melspectrogram(mels, conf)
    return mels

In [0]:
#spectograms are ndarray 

#LH method
mels1 = read_as_melspectrogram(confLH, audio_train_file + '/' +
                       df_train.fname[0], debug_display=False)
mels_LH2 = read_as_melspectrogram(confLH, audio_train_file + '/' +
                                  df_train.fname[100], debug_display=False)

# X method 
mels2 = read_as_melspectrogram(confX, audio_train_file + '/' + 
                       df_train.fname[0], debug_display=False)
mels_X2 = read_as_melspectrogram(confX, audio_train_file + '/' + 
                       df_train.fname[1], debug_display=False)

In [33]:
print(mels1.size)
print(mels_LH2.size)
print(mels2.size)
print(mels_X2.size)

25728
25728
27744
24816


In [0]:
def split_long_data(conf, X):
    # Splits long mel-spectrogram data with small overlap
    L = X.shape[1]
    one_length = conf['dims'][1]
    loop_length = int(one_length * 0.9)
    min_length = int(one_length * 0.2)
    print(' sample length', L, 'to cut every', one_length)
    for idx in range(L // loop_length):
        cur = loop_length * idx
        rest = L - cur
        if one_length <= rest:
            yield X[:, cur:cur+one_length]
        elif min_length <= rest:
            cur = L - one_length
            yield X[:, cur:cur+one_length]

def convert_X(df, conf, datapath):
    # Convert all files listed on df.fname
    # Then generates X (contains mel-spectrograms)
    # and index mapping to original sample order
    X = []
    index_map = []
    for i, fname in enumerate(df.fname):
        print('processing', fname)
        data = read_as_melspectrogram(conf, datapath + '/' + fname)
        for chunk in split_long_data(conf, data):
            X.append(np.expand_dims(chunk, axis=-1))
            index_map.append(i)
    return np.array(X), np.array(index_map)

def convert_y_train(idx_train, plain_y_train):
    return np.array( [plain_y_train[idx] for idx in idx_train])

In [51]:
print('All samples will be cut per split length (=duration)')
#X_train, idx_train = convert_X(df_train[:10], confX, audio_train_file)
X_train, idx_train = convert_X(df_train, confX, audio_train_file)
y_train = convert_y_train(idx_train, plain_y_train)
print('Now original 10 samples were cut into ', len(idx_train), 'samples.')
print()
print('idx_train holds original sample index, y_train is also converted to have the same length with X_train/idx_train.')
print('idx_train', idx_train)
print('y_train', y_train)

All samples will be cut per split length (=duration)
processing 00044347.wav
 sample length 578 to cut every 301
processing 001ca53d.wav
 sample length 517 to cut every 301
processing 002d256b.wav
 sample length 301 to cut every 301
processing 0033e230.wav
 sample length 301 to cut every 301
processing 00353774.wav
 sample length 301 to cut every 301
processing 003b91e8.wav


KeyboardInterrupt: ignored

## **Dataset for training**

In [44]:
def datapath(conf, filename):
    return conf['folder'] / filename

def loaddata(conf, filename):
    return np.load(conf['folder'] / filename)

#### This is Toy example by default ####
TRYING_AS_TOY = True # False if you like creating full set

for conf in confs:
    conf['folder'].mkdir(parents=True, exist_ok=True)
    if TRYING_AS_TOY:
        for file in ['X_train', 'y_train', 'idx_train', 'X_test', 'idx_test']:
            shutil.copy(EXTRA/datapath(conf, file+'.npy'), datapath(conf, file+'.npy'))
        plain_y_train = np.load(EXTRA/'toy_plain_y_train.npy')
        train_verified_idx = np.load(EXTRA/'toy_train_verified_idx.npy')
        train_blacklist_index = np.load(EXTRA / 'toy_train_blacklist.npy')
    else:
        if not os.path.exists(datapath(conf, 'X_train.npy')):
            X_train, idx_train = convert_X(df_train, conf, audio_train_file)
            y_train = convert_y_train(idx_train, plain_y_train)
            np.save(datapath(conf, 'X_train.npy'), X_train)
            np.save(datapath(conf, 'y_train.npy'), y_train)
            np.save(datapath(conf, 'idx_train.npy'), idx_train)

            X_test, idx_test = convert_X(df_test, conf, audio_test_file)
            np.save(datapath(conf, 'X_test.npy'), X_test)
            np.save(datapath(conf, 'idx_test.npy'), idx_test)

NameError: ignored

# Models 

Generic 2D CNN model from lab3 made by Quinn

In [45]:
import keras
from keras.layers import Dense, Conv2D, AveragePooling2D
from keras.layers import SeparableConv1D, BatchNormalization, Flatten, Dropout, GlobalAveragePooling1D,MaxPooling1D
from keras.models import Model, Sequential 
  
from keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
  
def make_model(conf):
    input_shape = conf['dims']
    
    nn = Sequential()
    nn.add(layers.SeparableConv2D(64, (3, 3), padding = 'same',
                                  activation = 'relu',
                                  input_shape = input_shape))
    # Shape: (126, 498, 64)
    nn.add(layers.BatchNormalization())
    nn.add(layers.SeparableConv2D(64, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (124, 496, 64)
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling2D((2, 2)))
    # Shape: (62, 248, 64)
    nn.add(layers.Dropout(0.3))

    nn.add(layers.SeparableConv2D(128, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (60, 246, 128)
    nn.add(layers.BatchNormalization())
    nn.add(layers.SeparableConv2D(128, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (58, 244, 128)
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling2D((2, 2)))
    # Shape: (29, 122, 128)
    nn.add(layers.Dropout(0.3))

    # Possibly make this block more like the others
    nn.add(layers.SeparableConv2D(256, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (27, 120, 256)
    nn.add(layers.BatchNormalization())
    nn.add(layers.Dropout(0.3))
    nn.add(layers.SeparableConv2D(256, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (25, 118, 256)
    nn.add(layers.BatchNormalization())
    nn.add(layers.Dropout(0.3))
    nn.add(layers.SeparableConv2D(256, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (23, 116, 256)
    nn.add(layers.BatchNormalization())
    nn.add(layers.Dropout(0.3))
    nn.add(layers.SeparableConv2D(256, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (21, 114, 256)
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling2D((2, 2)))
    # Shape: (10, 57, 256)
    nn.add(layers.Dropout(0.3))

    nn.add(layers.SeparableConv2D(512, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (8, 55, 512)
    nn.add(layers.BatchNormalization())
    nn.add(layers.SeparableConv2D(512, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (6, 53, 512)
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling2D((2, 2)))
    # Shape: (3, 26, 512)
    nn.add(layers.Dropout(0.3))

    nn.add(layers.SeparableConv2D(512, (3, 3), padding = 'same',
                                  activation = 'relu'))
    # Shape: (1, 24, 512)
    nn.add(layers.BatchNormalization())
    nn.add(layers.SeparableConv2D(512, (3, 3), padding = 'same',
                                  activation = 'relu'))
    nn.add(layers.BatchNormalization())
    nn.add(layers.GlobalAveragePooling2D())

    nn.add(layers.Dense(41, activation = 'softmax'))
    
    model.compile(optimizer= 'rmsprop', loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])
    
    model.summary()
    
    return nn
  
def train_model(conf, fold, model, train_steps_per_epoch, valid_steps_per_epoch,
                init_best_weights= False, this_epochs = None):

    callbacks = [ModelCheckpoint(str(datapath(conf, 'best_%d.h5' % fold)),
                        monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True),
        TensorBoard(log_dir=str(datapath(conf, 'logs%s/fold_%d' % (conf['folder'], fold))), write_graph=True)
    ]
    
    if model is None:
        model = make_model(conf)
        weight_filename = str(init_best_weights)
        if weight_filename == 'True' : 
            weight_filename = str(datapath(conf, 'best_%d.h5' % fold))
        if weight_filename is not 'False':
            print('Initializing model with last best weights:', weight_filename)
            model.load_weights(weight_filename)
    #Training model
    partial_x_train = X_train[:7105]
    x_val = X_train[7105:]
    
    partial_y_train = y_train[:7105]
    y_val = y_train[:7105]
    
    history = model.fit(partial_x_train, partial_y_train, callbacks = callbacks, 
                       steps_per_epoch = train_steps_per_epoch,
                       epochs = 10, 
                       batch_size = 32,
                       validation_data = (x_val, y_val),
                       verbose = 1 )
    
    
    return model, history 


def run_k_fold(train_data, train_labels,model):
    k = 4
    num_val = len(train_data) // k
    num_train = len(train_labels) - num_val
    all_val_acc_histories, all_val_loss_histories = [], []
    for x in range(k):
        print("Fold %d " % k )
        val_data = train_data[x * num_val: (x + 1) * num_val]
        val_labels = train_labels[x * num_val: (x + 1) * num_val]

        partial_train_data = np.concatenate(
            [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
            axis = 0)
        partial_train_labels = np.concatenate(
            [train_labels[: x * num_val],
             train_labels[(x + 1) * num_val:]],
            axis = 0)

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 2,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])
    
           
    return np.mean(all_val_loss_histories, axis=0), np.mean(all_val_acc_histories, axis=0), hst, model
  
    
def samplewise_mean_X(X):
    for i in range(len(X)):
        X[i] -= np.mean(X[i], keepdims=True)
        X[i] /= (np.std(X[i], keepdims=True) + 1.0)

  
''' 
def run_fold(conf, fold, dataset, model=None, init_best_weights=False, eval_only=False):
    X_train, y_train, idx_train, all_X_train, all_y_train, all_idx_train, X_test, idx_test = dataset
    print('----- Fold#%d ----' % fold)
    # c. Cross validation split & balance # of samples
    _Xtrain, _ytrain, _Xvalid, _yvalid = \
        get_cross_valid_fold_balanced(conf, fold, X_train, y_train, idx_train)

    # d. Train model
    train_generator, valid_generator, plain_datagen = \
        create_generators(conf, _Xtrain, _ytrain, _Xvalid, _yvalid)
    train_steps_per_epoch, valid_steps_per_epoch = \
        get_steps_per_epoch(conf, _Xtrain, _Xvalid)
    
    
    model, history = train_model(conf, fold, model,
                                train_steps_per_epoch, valid_steps_per_epoch,
                                 init_best_weights=init_best_weights,
                                this_epochs=0 if eval_only else None)

    # e. Evaluate with all train sample
    model.load_weights(datapath(conf, 'best_%d.h5' % fold))
    acc, acc_v = evaluate_fold(conf, fold, 'train_predictions_%d.npy', model, plain_datagen,
                               all_X_train, all_idx_train, all_y_train, train_verified_idx)
    evaluate_fold(conf, fold, 'test_predictions_%d.npy', model, plain_datagen, X_test, idx_test)

    print('Trainset accuracy =', acc, '(tested all over the original training set)')
    print('Verified samples accuracy =', acc_v, '(tested over manually verified samples only)')
    return acc, acc_v, history, model, plain_datagen
'''  
   

" \ndef run_fold(conf, fold, dataset, model=None, init_best_weights=False, eval_only=False):\n    X_train, y_train, idx_train, all_X_train, all_y_train, all_idx_train, X_test, idx_test = dataset\n    print('----- Fold#%d ----' % fold)\n    # c. Cross validation split & balance # of samples\n    _Xtrain, _ytrain, _Xvalid, _yvalid =         get_cross_valid_fold_balanced(conf, fold, X_train, y_train, idx_train)\n\n    # d. Train model\n    train_generator, valid_generator, plain_datagen =         create_generators(conf, _Xtrain, _ytrain, _Xvalid, _yvalid)\n    train_steps_per_epoch, valid_steps_per_epoch =         get_steps_per_epoch(conf, _Xtrain, _Xvalid)\n    \n    \n    model, history = train_model(conf, fold, model,\n                                train_steps_per_epoch, valid_steps_per_epoch,\n                                 init_best_weights=init_best_weights,\n                                this_epochs=0 if eval_only else None)\n\n    # e. Evaluate with all train sample\n    mod

# Running Model 

In [49]:
for conf in [confX]: # Running confX only, change this to confs if you need running both confX and confLH
    print('== Attempt [%s] ==' % conf['folder'])

    # a. Load all dataset -> all_(X|y|idx)_train, (X|idx)_test
    all_X_train, all_y_train, all_idx_train = \
        loaddata(conf, 'X_train.npy'), \
        keras.utils.to_categorical(loaddata(conf, 'y_train.npy')), \
        loaddata(conf, 'idx_train.npy')
    X_test, idx_test = loaddata(conf, 'X_test.npy'), loaddata(conf, 'idx_test.npy')
    print('Loaded trainset:%d, testset:%d samples.' % (len(all_X_train), len(X_test)))

    # a'. Normalize samplewise if requested
    
    samplewise_mean_X(all_X_train)
    samplewise_mean_X(X_test)

    # b. Removing samples on the blacklist -> X|y|idx
    '''
    whitelist = [idx for idx in range(len(all_idx_train)) if all_idx_train[idx] not in train_blacklist_index]
    X_train, y_train, idx_train = \
        all_X_train[whitelist], all_y_train[whitelist], all_idx_train[whitelist]
    print('Filtered samples on blacklist, now trainset has %d samples' % len(idx_train))
    '''
    # Train folds
    
    work = {'train_acc': [],
            'train_acc_verified': [],
            'history': []}
    for fold in range(5):
        acc, acc_verified, history, model, _ = run_fold(conf, fold,
                [X_train, y_train, idx_train, all_X_train, all_y_train, all_idx_train, X_test, idx_test],
                model=None,
                init_best_weights=EXTRA / 'X48_AlexNet_00696.h5',
                eval_only=False)
        work['history'].append(history)
        work['train_acc'].append(acc)
        work['train_acc_verified'].append(acc_verified)

    print('___ training finished ___')


== Attempt [X] ==
Loaded trainset:13250, testset:337 samples.


NameError: ignored