<a href="https://colab.research.google.com/github/QColeman97/AudioTagger/blob/master/AudioTagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sox



In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns # for data visualization 

import IPython
import IPython.display as ipd #To play sound in notebook
import scipy as sci
import wave 
from pathlib import Path

from scipy.fftpack import fft #Fast Fourier Transformation 
from scipy.io import wavfile 

import librosa 

In [4]:
import os 
print(os.listdir("drive/My Drive/CSC490Final-AudioTagger"))

['FSDKaggle2018.audio_test', 'FSDKaggle2018.audio_train', 'test_post_competition_scoring_clips.csv', 'train_post_competition.csv']


In [5]:
INPUT_PATH ="drive/My Drive/CSC490Final-AudioTagger/"
audio_train_file = (INPUT_PATH + "FSDKaggle2018.audio_train")
audio_test_file = (INPUT_PATH + "FSDKaggle2018.audio_test")
train= pd.read_csv(INPUT_PATH + "train_post_competition.csv")


#scipy.wavfile.read returns rate of wave, and # of data read
filename = '/001ca53d.wav'
sample_rate, samples = wavfile.read(str(audio_train_file) + filename)
print(samples)
print(train.shape)

[-33 -32 -34 ...  -1  -1  -1]
(9473, 5)


In [6]:
print(train.head())

          fname         label  ...  freesound_id             license
0  00044347.wav        Hi-hat  ...         28739         Attribution
1  001ca53d.wav     Saxophone  ...        358827         Attribution
2  002d256b.wav       Trumpet  ...         10897  Creative Commons 0
3  0033e230.wav  Glockenspiel  ...        325017         Attribution
4  00353774.wav         Cello  ...        195688         Attribution

[5 rows x 5 columns]


# DATA PREPROCESSING 
Cut out silent parts 

Normalize wave form

In [40]:
df_train = pd.read_csv(INPUT_PATH +'/train_post_competition.csv')
df_test = pd.read_csv(INPUT_PATH + '/test_post_competition_scoring_clips.csv')
labels = df_train.label.unique()
label2int = {label:index for index, label in enumerate(labels)}
num_class = len(labels)
#Indices of manually verified training data
verifed_train = np.array(df_train[df_train.manually_verified == 1].index)
#array of labels in number form (0 = hi-hat, 1 = saxophone, etc)
plain_y_train = np.array([label2int[label] for label in df_train.label])

#np.set_printoptions(threshold=np.inf)
plain_y_train

array([ 0,  1,  2, ..., 12, 20, 17])

In [0]:
'''Two approaches:
  1) LH uses highest feature with only beginning sound. Useful info
     are usually in begging part of sample and support. 
     
  2) Splits sample and uses one that are long but with coarser feature. 
     Good for samples with contents in the middle or later part. Perfect 
     for ones that use entire wav file  
'''
confLH, confX = {}, {}
confs = [confLH, confX]
confLH['folder'] = Path('LH')
confX['folder'] = Path('X')

#configs for confLH: highest resolutions

confLH['sampling_rate'] = 44100
confLH['duration'] = 4
confLH['hop_length'] = 882 # 20ms
confLH['fmin'] = 20
confLH['fmax'] = confLH['sampling_rate'] // 2
confLH['n_mels'] = 128
confLH['n_fft'] = confLH['n_mels'] * 20
confLH['audio_split'] = 'head'
confLH['samples'] = confLH['sampling_rate'] * confLH['duration']
confLH['dims'] = (confLH['n_mels'], 1 + 
                  int(np.floor(confLH['samples']/confLH['hop_length'])), 1)


# Approach X uses longer sound, then it uses suppressed 
confX['sampling_rate'] = 26000
confX['duration'] = 6
confX['hop_length'] = 520 # 20ms
confX['fmin'] = 20
confX['fmax'] = confX['sampling_rate'] // 2
confX['n_mels'] = 48
confX['n_fft'] = confX['n_mels'] * 20
confX['audio_split'] = 'dont_crop'
confX['samples'] = confX['sampling_rate'] * confX['duration']
confX['dims'] = (confX['n_mels'], 1 + 
                  int(np.floor(confX['samples']/confX['hop_length'])), 1)




In [0]:
import librosa
import librosa.display

def read_audio(conf, pathname):
    #return audio time series and sampling rate 
    y, sr = librosa.load(pathname, sr=conf['sampling_rate'])
    # trim silence
    if 0 < len(y):
        y, _ = librosa.effects.trim(y) # trim, top_db=default(60)
    # make it unified length to conf.samples
    if len(y) > conf['samples']: # long enough
        if conf['audio_split'] == 'head':
            y = y[0:0+conf['samples']]
    else: # pad blank
        padding = conf['samples'] - len(y)    # add padding at both ends
        offset = padding // 2
        y = np.pad(y, (offset, conf['samples'] - len(y) - offset), 'constant')
    return y

def audio_to_melspectrogram(conf, audio):
    spectrogram = librosa.feature.melspectrogram(audio, 
                                                 sr=conf['sampling_rate'],
                                                 n_mels=conf['n_mels'],
                                                 hop_length=conf['hop_length'],
                                                 n_fft=conf['n_fft'],
                                                 fmin=conf['fmin'],
                                                 fmax=conf['fmax'])
    #convert spectrogram to decibel
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def show_melspectrogram(mels, conf):
    librosa.display.specshow(mels, x_axis='time', y_axis='mel', 
                             sr=conf['sampling_rate'], hop_length=conf['hop_length'],
                            fmin=conf['fmin'], fmax=conf['fmax'])
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log-frequency power spectrogram')
    plt.show()

def read_as_melspectrogram(conf, pathname, debug_display=False):
    x = read_audio(conf, pathname)
    mels = audio_to_melspectrogram(conf, x)
    if debug_display:
        IPython.display.display(IPython.display.Audio(x, rate=conf['sampling_rate']))
        show_melspectrogram(mels, conf)
    return mels

In [46]:
#spectograms are ndarray 
mels1 = read_as_melspectrogram(confLH, audio_train_file + '/' +
                       df_train.fname[0], debug_display=False)
mels_LH2 = read_as_melspectrogram(confLH, audio_train_file + '/' +
                                  df_train.fname[1], debug_display=False)

mels2 = read_as_melspectrogram(confX, audio_train_file + '/' + 
                       df_train.fname[0], debug_display=False)


[[-32.454773  -37.377052  -34.06007   ... -36.114246  -16.214312
   -2.8257742]
 [-44.019295  -43.012085  -38.085457  ... -46.167946  -15.615058
   -2.4670937]
 [-53.407314  -48.762466  -41.635468  ... -41.03498   -14.579424
   -1.5052011]
 ...
 [-55.19351   -55.19351   -55.19351   ... -55.19351   -55.19351
  -55.19351  ]
 [-55.19351   -55.19351   -55.19351   ... -55.19351   -55.19351
  -55.19351  ]
 [-55.19351   -55.19351   -55.19351   ... -55.19351   -55.19351
  -55.19351  ]]


In [44]:
print(mels1.size)
print(mels_LH2.size)
print(mels2.size)


25728
25728
27744


In [10]:
#
sox 

NameError: ignored