In [1]:
# Importing all the important libraries

import glob
import os
import librosa
import re
import numpy as np
librosa.util.MAX_MEM_BLOCK = 262144/4
import librosa.display
import numpy as np
from scipy import misc
from scipy import signal

# Loading all the sound files

In [2]:
def load_sounds(dataSetDir, input_sr):
    """
    Load every ESC-50-style ``.wav`` clip found in a directory.

    File names must end in ``<digits>.wav``; the trailing integer is taken
    as the class label.  Directory entries that do not match this pattern
    (hidden files, READMEs, other formats) are skipped instead of aborting
    the whole run.  Files are processed in sorted name order so the three
    returned lists are reproducible between runs.

    Parameters
    ----------
    dataSetDir : str
        Directory that contains the audio files.
    input_sr : int or None
        Forwarded to ``librosa.load`` as its ``sr`` argument.

    Returns
    -------
    sounds : list
        One decoded signal per file, as returned by ``librosa.load``.
    names : list of str
        File names with the 4-character ``.wav`` extension removed.
    labels : list of int
        Integer parsed from the digits immediately before ``.wav``.
    """
    label_pattern = re.compile(r'(\d+)\.wav$')

    # List the directory once and keep only files that carry a label.
    candidates = []
    for file in sorted(os.listdir(dataSetDir)):
        match = label_pattern.search(file)
        if match:
            candidates.append((file, int(match.group(1))))

    datasetSize = len(candidates)
    print("Number of audio samples to be converted : " + str(datasetSize))

    sounds = []
    names = []
    labels = []

    for j, (file, label) in enumerate(candidates, start=1):
        # The sampling rate returned by librosa is not needed here.
        sound, _ = librosa.load(os.path.join(dataSetDir, file), sr=input_sr)
        sounds.append(sound)
        names.append(file[:-4])
        labels.append(label)

        # j is the number of files already processed, so report on exact
        # multiples of 100 (the count printed is the count completed).
        if j % 100 == 0:
            print(str(j) + " audio samples extracted: Progress = " +
                  str(j / datasetSize * 100) + "%")

    return sounds, names, labels

# Set the dataset directory (relative path) and the sampling rate

In [3]:
# Folder holding the audio clips, relative to the notebook's working directory.
dataSetDir = "ESC-50-master//audio"
# Sampling rate in Hz used when loading the clips and naming the output folders.
sr = 16_000

# Load the audio signals in a time series format

In [4]:
# Three parallel lists: decoded signals, file names without extension, integer labels.
(soundSet, nameSet, labelSet) = load_sounds(dataSetDir, sr)

Number of audio samples to be converted : 2000
100 datapoints extracted: Progress = 5.0%
200 datapoints extracted: Progress = 10.0%
300 datapoints extracted: Progress = 15.0%
400 datapoints extracted: Progress = 20.0%
500 datapoints extracted: Progress = 25.0%
600 datapoints extracted: Progress = 30.0%
700 datapoints extracted: Progress = 35.0%
800 datapoints extracted: Progress = 40.0%
900 datapoints extracted: Progress = 45.0%
1000 datapoints extracted: Progress = 50.0%
1100 datapoints extracted: Progress = 55.00000000000001%
1200 datapoints extracted: Progress = 60.0%
1300 datapoints extracted: Progress = 65.0%
1400 datapoints extracted: Progress = 70.0%
1500 datapoints extracted: Progress = 75.0%
1600 datapoints extracted: Progress = 80.0%
1700 datapoints extracted: Progress = 85.0%
1800 datapoints extracted: Progress = 90.0%
1900 datapoints extracted: Progress = 95.0%
2000 datapoints extracted: Progress = 100.0%


In [12]:
def _scale_to_uint8(arr):
    """Linearly map ``arr`` onto 0..255 (minimum -> 0, maximum -> 255) as uint8."""
    arr = np.asarray(arr, dtype=np.float64)
    lowest = arr.min()
    span = arr.max() - lowest
    if span == 0:
        # Constant input: avoid dividing by zero and emit an all-black image.
        return np.zeros(arr.shape, dtype=np.uint8)
    return np.rint((arr - lowest) * (255.0 / span)).astype(np.uint8)


def _resize_bilinear(img, shape):
    """Resize a 2-D uint8 image to ``shape`` using linear interpolation."""
    from scipy import ndimage

    factors = (shape[0] / img.shape[0], shape[1] / img.shape[1])
    # order=1 is (bi)linear; mode="nearest" keeps border samples from being
    # blended with an implicit zero padding.
    resized = ndimage.zoom(img.astype(np.float64), factors, order=1, mode="nearest")
    return np.rint(resized).clip(0, 255).astype(np.uint8)


def _write_gray_png(path, pixels):
    """Write a 2-D uint8 array to ``path`` as an 8-bit grayscale PNG."""
    import struct
    import zlib

    pixels = np.ascontiguousarray(pixels, dtype=np.uint8)
    height, width = pixels.shape

    def chunk(tag, data):
        # PNG chunk layout: length, type, data, CRC32 over type + data.
        return (struct.pack(">I", len(data)) + tag + data +
                struct.pack(">I", zlib.crc32(tag + data) & 0xFFFFFFFF))

    # Every scanline is prefixed with filter type 0 (no filtering).
    raw = b"".join(b"\x00" + row.tobytes() for row in pixels)
    # IHDR fields: bit depth 8, colour type 0 (grayscale), default
    # compression / filter methods, no interlacing.
    header = struct.pack(">IIBBBBB", width, height, 8, 0, 0, 0, 0)
    with open(path, "wb") as handle:
        handle.write(b"\x89PNG\r\n\x1a\n" +
                     chunk(b"IHDR", header) +
                     chunk(b"IDAT", zlib.compress(raw, 9)) +
                     chunk(b"IEND", b""))


def spectogram_feature_extractor(sound, name, sr, frameLength):
    """
    Compute the spectrogram of one clip and save it as a 256 x 256
    grayscale PNG.

    The image is written to ``<sr/1e3>KHz_framelength<frameLength*1e3>/``
    (relative to the current working directory) as ``<name>spect.png``.
    The folder is created when it does not exist yet.

    ``scipy.misc.imresize`` / ``scipy.misc.imsave`` are no longer available
    in current SciPy releases, so the image is produced with a min-max
    8-bit scaling, ``scipy.ndimage.zoom`` (linear interpolation) and a small
    standard-library PNG writer.  Pixel values are close to, but not
    guaranteed to be bit-identical with, what the removed functions produced.

    Parameters
    ----------
    sound : array_like
        Audio signal in time-series format.
    name : str
        Name of the original audio file, used to name the image.
    sr : int
        Sampling rate of ``sound`` in Hz.
    frameLength : float
        Length in seconds of each spectrogram frame.

    Returns
    -------
    Sxx : ndarray
        The spectrogram exactly as returned by ``scipy.signal.spectrogram``
        (not resized, not flipped).

    Raises
    ------
    ValueError
        If ``frameLength`` is too small or too large to give a usable
        window length.
    """
    # round() guards against float error such as 0.03 * 16000 -> 479.999...
    frameSpan = int(round(frameLength * sr))
    if frameSpan <= 0 or frameSpan > 10 * sr:
        raise ValueError("frameLength must span between one sample and 10 seconds")

    # Window-length arithmetic kept from the original: number of
    # frameSpan-sized pieces in 10 * sr samples, then the window length
    # that splits 10 * sr samples into that many pieces.
    tot_seg = sr * 5 * 2 // frameSpan
    nperseg = 10 * sr // tot_seg

    # Obtaining the spectrogram (half-overlapping windows)
    f, t, Sxx = signal.spectrogram(sound, sr, nperseg=nperseg,
                                   noverlap=nperseg // 2, nfft=max(256, nperseg))

    # Converting the spectrogram into a 256 x 256 b/w image; the first axis
    # is reversed so that the last spectrogram row ends up at the top.
    SxxNew = _resize_bilinear(_scale_to_uint8(np.flip(Sxx, 0)), (256, 256))

    # Saving the spectrogram in the appropriate directory
    dirName = str(sr / 1e3) + "KHz_framelength" + str(frameLength * 1e3)
    os.makedirs(dirName, exist_ok=True)
    _write_gray_png(os.path.join(dirName, name + 'spect' + '.png'), SxxNew)

    return Sxx

def dataset_spectogram_feature_extractor(soundSet,nameSet,sr,frameLength):
    """
    Run ``spectogram_feature_extractor`` on every clip of a dataset.

    soundSet is a sequence of audio signals and nameSet the sequence of
    their names, matched by position; each clip's spectrogram image is
    saved by the per-clip extractor.

    sr is the sampling rate at which the audio files were sampled.
    frameLength is the length of each frame used to obtain the spectrogram.

    Nothing is returned (the result is None); a summary line naming the
    output folder is printed once all clips have been processed.
    """
    for position, clip in enumerate(soundSet):
        spectogram_feature_extractor(clip, nameSet[position], sr, frameLength)

    folder = str(sr/1e3) + "KHz_framelength" + str(frameLength*1e3)
    print("Spectogram image features have been extracted and saved in the folder " +
          folder + ".")
    return None

In [13]:
# Extraction with frame length 20 ms
frameLength = 20 * 1e-3
# exist_ok=True creates the output folder only when it is missing, without a
# separate isdir() check that could race with the creation.
os.makedirs(str(sr/1e3) + "KHz_framelength" + str(frameLength*1e3), exist_ok=True)
# The list of clips is passed as-is: wrapping it in np.array() copied the
# whole dataset and fails on clips of unequal length with recent NumPy.
# NOTE: the extractor returns None, so `features` is None after this cell.
features = dataset_spectogram_feature_extractor(soundSet,
                                                nameSet, sr, frameLength)

Spectogram image features have been extracted and saved in the folder 16.0KHz_framelength20.0.


In [14]:
# Extraction with frame length 30 ms
frameLength = 30 * 1e-3
# exist_ok=True creates the output folder only when it is missing, without a
# separate isdir() check that could race with the creation.
os.makedirs(str(sr/1e3) + "KHz_framelength" + str(frameLength*1e3), exist_ok=True)
# The list of clips is passed as-is: wrapping it in np.array() copied the
# whole dataset and fails on clips of unequal length with recent NumPy.
# NOTE: the extractor returns None, so `features` is None after this cell.
features = dataset_spectogram_feature_extractor(soundSet,
                                                nameSet, sr, frameLength)

Spectogram image features have been extracted and saved in the folder 16.0KHz_framelength30.0.


In [15]:
# Extraction with frame length 40 ms
frameLength = 40 * 1e-3
# exist_ok=True creates the output folder only when it is missing, without a
# separate isdir() check that could race with the creation.
os.makedirs(str(sr/1e3) + "KHz_framelength" + str(frameLength*1e3), exist_ok=True)
# The list of clips is passed as-is: wrapping it in np.array() copied the
# whole dataset and fails on clips of unequal length with recent NumPy.
# NOTE: the extractor returns None, so `features` is None after this cell.
features = dataset_spectogram_feature_extractor(soundSet,
                                                nameSet, sr, frameLength)

Spectogram image features have been extracted and saved in the folder 16.0KHz_framelength40.0.


In [16]:
# Extraction with frame length 50 ms
frameLength = 50 * 1e-3
# exist_ok=True creates the output folder only when it is missing, without a
# separate isdir() check that could race with the creation.
os.makedirs(str(sr/1e3) + "KHz_framelength" + str(frameLength*1e3), exist_ok=True)
# The list of clips is passed as-is: wrapping it in np.array() copied the
# whole dataset and fails on clips of unequal length with recent NumPy.
# NOTE: the extractor returns None, so `features` is None after this cell.
features = dataset_spectogram_feature_extractor(soundSet,
                                                nameSet, sr, frameLength)

Spectogram image features have been extracted and saved in the folder 16.0KHz_framelength50.0.
