In [1]:
# import basic packages
import os
import numpy as np
import wget
import sys
import gdown
import zipfile
import librosa
# in the notebook, we only can use one GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
def create_path(path):
    if not os.path.exists(path):
        os.mkdir(path)

workspace = "./workspace"
dataset_path = os.path.join(workspace, "esc-50")
esc_raw_path = os.path.join(dataset_path, 'raw')

create_path(workspace)
create_path(dataset_path)

In [None]:
if not os.path.exists(os.path.join(dataset_path, 'ESC-50-master.zip')):
    print("-------------Downloading ESC-50 Dataset-------------")
    wget.download('https://github.com/karoldvl/ESC-50/archive/master.zip', out=dataset_path)
    with zipfile.ZipFile(os.path.join(dataset_path, 'ESC-50-master.zip'), 'r') as zip_ref:
        zip_ref.extractall(esc_raw_path)
    print("-------------Success-------------")

In [None]:
meta_path = os.path.join(esc_raw_path, 'ESC-50-master', 'meta', 'esc50.csv')
audio_path = os.path.join(esc_raw_path, 'ESC-50-master', 'audio')
resample_path = os.path.join(dataset_path, 'resample')
savedata_path = os.path.join(dataset_path, 'esc-50-data.npy')
create_path(resample_path)

meta = np.loadtxt(meta_path , delimiter=',', dtype='str', skiprows=1)
audio_list = os.listdir(audio_path)

In [None]:
# resample
print("-------------Resample ESC-50-------------")
for f in audio_list:
    full_f = os.path.join(audio_path, f)
    resample_f = os.path.join(resample_path, f)
    if not os.path.exists(resample_f):
        os.system('sox -V1 ' + full_f + ' -r 32000 ' + resample_f)
print("-------------Success-------------")

print("-------------Build Dataset-------------")
output_dict = [[] for _ in range(5)]
for label in meta:
    name = label[0]
    fold = label[1]
    target = label[2]
    y, sr = librosa.load(os.path.join(resample_path, name), sr = None)
    output_dict[int(fold) - 1].append(
        {
            "name": name,
            "target": int(target),
            "waveform": y
        }
    )
np.save(savedata_path, output_dict)
print("-------------Success-------------")
    

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./data/ESC-50-master/meta/esc50.csv')
df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [4]:
df['fold'].unique()

array([1, 2, 3, 4, 5])

In [5]:
df.loc[df['fold']==1]

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
...,...,...,...,...,...,...,...
395,1-9841-A-13.wav,1,13,crickets,False,9841,A
396,1-9886-A-49.wav,1,49,hand_saw,False,9886,A
397,1-9887-A-49.wav,1,49,hand_saw,False,9887,A
398,1-9887-B-49.wav,1,49,hand_saw,False,9887,B


### make esc-50 dataset

In [6]:
df['filename'][0]

'1-100032-A-0.wav'

In [7]:
df_2 = df.loc[df['fold']==2]
df_2

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
400,2-100648-A-43.wav,2,43,car_horn,False,100648,A
401,2-100786-A-1.wav,2,1,rooster,True,100786,A
402,2-101676-A-10.wav,2,10,rain,True,101676,A
403,2-102414-A-17.wav,2,17,pouring_water,False,102414,A
404,2-102414-B-17.wav,2,17,pouring_water,False,102414,B
...,...,...,...,...,...,...,...
795,2-99795-A-32.wav,2,32,keyboard_typing,False,99795,A
796,2-99796-A-32.wav,2,32,keyboard_typing,False,99796,A
797,2-99955-A-7.wav,2,7,insects,False,99955,A
798,2-99955-B-7.wav,2,7,insects,False,99955,B


In [8]:
df_2.reset_index(drop = True, inplace=True)

In [9]:
df_2

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,2-100648-A-43.wav,2,43,car_horn,False,100648,A
1,2-100786-A-1.wav,2,1,rooster,True,100786,A
2,2-101676-A-10.wav,2,10,rain,True,101676,A
3,2-102414-A-17.wav,2,17,pouring_water,False,102414,A
4,2-102414-B-17.wav,2,17,pouring_water,False,102414,B
...,...,...,...,...,...,...,...
395,2-99795-A-32.wav,2,32,keyboard_typing,False,99795,A
396,2-99796-A-32.wav,2,32,keyboard_typing,False,99796,A
397,2-99955-A-7.wav,2,7,insects,False,99955,A
398,2-99955-B-7.wav,2,7,insects,False,99955,B


In [10]:
len(df_2['target'].unique())

50

In [11]:
df.shape

(2000, 7)

In [12]:
from torch.utils.data import Dataset

In [13]:
class ESC_Dataset(Dataset):
    def __init__(self, dataset, fold, audio_path,eval_mode = False):
        self.df_dataset = dataset
        self.fold = fold
        self.eval_mode = eval_mode
        self.audio_path = audio_path

        # set dataset using the fold
        self.df_dataset = self.df_dataset.loc[self.df_dataset['fold'] == self.fold] if eval_mode else self.df_dataset.loc[self.df_dataset['fold'] != self.fold]
        
        # reset index
        self.df_dataset.reset_index(drop = True, inplace=True)

    def __getitem__(self, index):
        name = self.df_dataset.loc[index, 'filename']
        audio, _ = librosa.load(os.path.join(self.audio_path, name), sr = None)

        y = self.df_dataset.loc[index, 'target']

        return audio, y

    def __len__(self):

        return len(self.df_dataset)





In [14]:
audio_path = './data/ESC-50-master/audio/'
eval_mode = False
fold = '1'
dataset = pd.read_csv('./data/ESC-50-master/meta/esc50.csv')

In [15]:
train_dataset = ESC_Dataset(dataset, fold, audio_path, eval_mode=eval_mode)

In [16]:
len(train_dataset)

2000

In [17]:
train_dataset[600]

(array([-9.1552734e-05, -3.9672852e-04, -1.8310547e-04, ...,
         1.4648438e-03, -1.3122559e-03, -2.8076172e-03], dtype=float32),
 12)