# Loading the dataset

In [1]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import getpass
import os



In [2]:
os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass("Enter the API Key: ")

Enter the API Key:  ········


In [3]:
import deeplake

In [8]:
deeplake_train_path = 'hub://hrayrm/train_data'
deeplake_val_path = 'hub://hrayrm/val_data'

In [9]:
from deeplake.util.exceptions import DatasetHandlerError

In [10]:
try:
    ds_train = deeplake.load(deeplake_train_path)
    ds_train.summary()
except DatasetHandlerError:
    ds_train = deeplake.empty(deeplake_train_path)
    with ds_train:
        ds_train.create_tensor('audio', htype = 'audio', sample_compression = None)
        ds_train.create_tensor('metadata', htype = 'json')

\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/hrayrm/train_data



|

hub://hrayrm/train_data loaded successfully.



 

Dataset(path='hub://hrayrm/train_data', tensors=['audio', 'metadata'])

  tensor    htype          shape           dtype  compression
 -------   -------        -------         -------  ------- 
  audio     audio   (40, 705600:1323000)  float64   None   
 metadata   json          (40, 1)           str     None   




In [11]:
try:
    ds_val = deeplake.load(deeplake_val_path)
    ds_val.summary()
except DatasetHandlerError:
    ds_val = deeplake.empty(deeplake_val_path)
    with ds_val:
        ds_val.create_tensor('audio', htype = 'audio', sample_compression = None)
        ds_val.create_tensor('metadata', htype = 'json')

 

Your Deep Lake dataset has been successfully created!


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/hrayrm/val_data


 

hub://hrayrm/val_data loaded successfully.


 

# Populating the dataset

In [12]:
from glob import glob
import random
import librosa
from pathlib import Path
import json

In [13]:
def get_metadata(split='train'):
    metadata = []
    metadata_file = Path(f'./egs/{split}/data.jsonl')
    
    with open(str(metadata_file), "r") as filled_json_file:
        for index, line in enumerate(filled_json_file):
            link_info_dict = json.loads(line)
            metadata.append(link_info_dict)

    return metadata

In [14]:
train_metadata = get_metadata(split='train')
validation_metadata = get_metadata(split='validation')

In [15]:
def get_observation(metadata_i):
    music_path = Path(metadata_i['path'])
    json_path = music_path.with_suffix('.json')
    audio, sr = librosa.load(music_path, sr=None, mono=True)

    with open(json_path, 'r') as json_file:
        json_info = json.load(json_file)

    full_meta = {'metadata': metadata_i, 'info': json_info}

    return audio, full_meta

In [17]:
import time

In [13]:
start = time.time()
with ds_train:
    for i, metadata_i in enumerate(train_metadata[:40]):
        if i%2==0:
            print(i)
        audio, full_meta = get_observation(metadata_i)
        ds_train.append({'audio': audio, 'metadata': full_meta})
end = time.time()

print(f"Time needed to upload the dataset: {int(end-start)} seconds")

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38


-

Time needed to upload the dataset: 1317 seconds


 

In [None]:
start = time.time()
with ds_val:
    for i, metadata_i in enumerate(validation_metadata[:40]):
        if i%2==0:
            print(i)
        audio, full_meta = get_observation(metadata_i)
        ds_val.append({'audio': audio, 'metadata': full_meta})
end = time.time()

print(f"Time needed to upload the dataset: {int(end-start)} seconds")

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38


|