In [1]:
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np

In [2]:
# Root directory of the uncompressed Million Song Dataset subset.
# EDIT THIS VALUE so it points to the copy on your own machine.
msd_subset_path = r'A:\notebook\Notebook\Jupyter Notebooks\MillionSongSubset'
# Two sub-directories of the root: 'data' and 'AdditionalFiles'.
msd_subset_data_path, msd_subset_addf_path = (
    os.path.join(msd_subset_path, subdir) for subdir in ('data', 'AdditionalFiles')
)

In [3]:
# Sanity check: stop immediately if msd_subset_path is not an existing directory.
# The error is raised explicitly rather than with an `assert` statement because
# assert statements are removed when Python runs with -O, which would silently
# disable the check. The exception type and message are those an assert would give.
if not os.path.isdir(msd_subset_path):
    raise AssertionError('wrong path')

In [4]:
import hdf5_getters as GETTERS
def strtimedelta(starttime,stoptime):
    """
    Format the time elapsed between two timestamps as a string.
    INPUT
       starttime  - beginning of the interval, in seconds
       stoptime   - end of the interval, in seconds
    RETURN
       str() of the datetime.timedelta covering stoptime - starttime,
       e.g. '0:01:30' for an interval of 90 seconds
    """
    elapsed = datetime.timedelta(seconds=stoptime - starttime)
    return str(elapsed)


In [5]:
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames; it receives the
                  path of each matching file and its return value is ignored
       ext      - extension, .h5 by default (appended to '*' to build
                  the glob pattern matched inside each directory)
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all subdirectories; the names listed by os.walk itself are
    # not used because the matching is delegated to glob below
    for root, _dirs, _names in os.walk(basedir):
        # escape the directory part so that glob metacharacters ('[', '*', '?')
        # appearing in a folder name are taken literally; without this, files
        # inside such folders are silently missed
        pattern = os.path.join(glob.escape(root), '*' + ext)
        matches = glob.glob(pattern)
        # count files
        cnt += len(matches)
        # apply function to all files
        for path in matches:
            func(path)
    return cnt

# counting the song files needs no callback: when 'func' is left at its
# default, apply_to_all_files only counts
print ('number of song files:',apply_to_all_files(basedir=msd_subset_data_path))

# artist names are gathered in a set(): an artist added several
# times is stored only once.
all_artist_names = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    Record the artist name stored in one song file.
    This function does 3 simple things:
    - open the song file
    - get the artist name and add it to the global set 'all_artist_names'
    - close the file (even if reading the name fails)
    INPUT
       filename  - path of the song file to read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add(GETTERS.get_artist_name(h5))
    finally:
        # always release the handle; otherwise an error while reading the
        # name would leave the file open
        h5.close()
    
# let's apply the previous function to all files
# we'll also measure how long it takes; time.perf_counter() is used because
# it is a monotonic, high-resolution clock meant for measuring intervals,
# whereas time.time() can jump if the system clock is adjusted during the run
t1 = time.perf_counter()
apply_to_all_files(msd_subset_data_path,func=func_to_get_artist_name)
t2 = time.perf_counter()
print ('all artist names extracted in:',strtimedelta(t1,t2))

# show every distinct artist name that was collected
print(all_artist_names)



number of song files: 10000
all artist names extracted in: 0:00:49.632030
{b'The Field Mice', b'C.J. Chenier', b'Samy Deluxe', b'Phil Vassar', b'Bebe Winans', b'Doyle Bramhall', b'The Almost', b'Harry Connick_ Jr', b'Buddy Jewell', b'J.B.Lenoir', b'Ed Bruce', b'Leo Gandelman', b'Oui Oui', b'Cinder', b'Plagia', b'Refractory', b'Abijah', b'Incubus', b'Tum Tum / Double T / LiL Ronnie', b'P.o.d', b'Adam Richman', b'Steve Green', b'Wild Horses', b'Diamond D', b'Janina', b'Kisha', b'R. Carlos Nakai', b'Billy Butler', b'Leigh Jones', b'Sven Tasnadi', b'Eko Fresh feat. G-Style', b'Barbara Carr', b'D.A.R.', b'Autumnblaze', b'Albert Pr\xc3\xa9jean', b'Acorps de Rue', b'Ambient Music Therapy', b'Major Lazer', b'Bodies In the Gears of the Apparatus', b'Mira', b'Jacques Ferchit', b'Asha Puthli', b'Gal Costa / Maria Beth\xc3\xa2nia', b'For Squirrels', b'The Birds', b'Lily Allen Featuring Ours', b'Toby Love', b'Stimulator', b'The Irish Tenors', b'The Jane Shermans', b'Danielle Bollinger', b'Camp Lo',