In [1]:
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import tarfile
import zipfile
import time
import glob
import datetime
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

%matplotlib inline

In [2]:
last_percent_reported = None
data_root = '.' # Change me to store data elsewhere

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 5% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(url, filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  dest_filename = os.path.join(data_root, filename)
  if force or not os.path.exists(dest_filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(dest_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', dest_filename)
  else:
    raise Exception(
      'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
  return dest_filename

#dataset_tar = maybe_download('http://static.echonest.com/', 'millionsongsubset_full.tar.gz', 1994614463)
dataset_tar = "asdf"
MSongsDB_tar = maybe_download('https://github.com/tbertinmahieux/MSongsDB/archive/', 'master.zip', 17585436)

Found and verified .\master.zip


In [3]:
if os.path.isdir('MillionSongSubset'):
    print('MillionSongSubset already present - Skipping extraction of %s.' % dataset_tar)
else:
    print('Extracting data for MillionSongSubset. This may take a while. Please wait.')
    tar = tarfile.open(dataset_tar)
    sys.stdout.flush()
    tar.extractall(data_root)
    tar.close()

if os.path.isdir('MSongsDB-master'):
    print('MillionSongSubset already present - Skipping extraction of %s.' % MSongsDB_tar)
else:
    print('Extracting data for MillionSongSubset.')
    zip_ref = zipfile.ZipFile(MSongsDB_tar, 'r')
    zip_ref.extractall(data_root)
    zip_ref.close()

msd_path=['MillionSongDataset/B', 'MillionSongDataset/C', 'MillionSongDataset/D', 'MillionSongDataset/E', 'MillionSongDataset/F', 'MillionSongDataset/G', 'MillionSongDataset/H']
msd_data_path=msd_path
assert os.path.isdir(msd_path[0]),'wrong path' # sanity check

msd_code_path='MSongsDB-master'
assert os.path.isdir(msd_code_path),'wrong path' # sanity check

sys.path.append( os.path.join(msd_code_path,'PythonSrc') )

MillionSongSubset already present - Skipping extraction of asdf.
MillionSongSubset already present - Skipping extraction of .\master.zip.


In [4]:
import hdf5_getters as GETTERS
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    return str(datetime.timedelta(seconds=stoptime-starttime))

# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames
       ext      - extension, .h5 by default
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        # count files
        cnt += len(files)
        # apply function to all files
        for f in files :
            func(f)       
    return cnt

# we can now easily count the number of files in the dataset
# print('number of song files:',apply_to_all_files(msd_subset_data_path))

# let's now get all artist names in a set(). One nice property:
# if we enter many times the same artist, only one will be kept.
# all_artist_names = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    artist_name = GETTERS.get_artist_name(h5)
    all_artist_names.add( artist_name )
    h5.close()
    
# let's apply the previous function to all files
# we'll also measure how long it takes
# t1 = time.time()
# apply_to_all_files(msd_subset_data_path,func=func_to_get_artist_name)
# t2 = time.time()
# print('all artist names extracted in:',strtimedelta(t1,t2))

In [5]:
# let's see some of the content of 'all_artist_names'
# print('found',len(all_artist_names),'unique artist names')
# for k in range(5):
#     print(list(all_artist_names)[k].decode('utf-8'))

In [6]:
def apply_to_some_files(num, basedir, func=lambda x: x, ext='.h5'):
    cnt = 0
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        # apply function to all files
        for f in files:
            func(f)
            cnt += 1
            if cnt % 10000 == 0: print(cnt)
            if cnt >= num: break
        if cnt >= num: break
    return cnt

In [7]:
getters = [ getter for getter in GETTERS.__dict__.keys() if getter[:4] == 'get_']
getters.remove("get_num_songs") # special case
# all getters are 54

cols = [getter[4:] for getter in getters]

df = pd.DataFrame([], columns=cols)

def func_to_get_all_info(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    global df
    df = df.append(pd.DataFrame([[GETTERS.__getattribute__(getter)(h5) for getter in getters]], columns=cols), ignore_index=True)
    h5.close()

In [8]:
for data_path in msd_data_path:
    df = pd.DataFrame([], columns=cols)
    count = apply_to_all_files(data_path)
    print('number of song files:', count, data_path[-1])
    t1 = time.time()
    apply_to_some_files(count, data_path,func=func_to_get_all_info)
    df = df.dropna(subset=['song_hotttnesss'])
    t2 = time.time()
    print('all data extracted in:',strtimedelta(t1,t2))
    df.to_pickle(data_path[-1] + '.pickle')
    print('data saved to:', data_path[-1] + '.pickle')

number of song files: 38265 B
10000
20000
30000
all data extracted in: 4:17:47.489009
data saved to: B.pickle
number of song files: 38611 C
10000
20000
30000
all data extracted in: 1:19:37.066798
data saved to: C.pickle
number of song files: 38825 D
10000
20000
30000
all data extracted in: 1:20:31.178279
data saved to: D.pickle
number of song files: 38466 E
10000
20000
30000
all data extracted in: 1:19:34.585798
data saved to: E.pickle
number of song files: 38919 F
10000
20000
30000
all data extracted in: 1:20:36.956103
data saved to: F.pickle
number of song files: 38156 G
10000
20000
30000
all data extracted in: 1:18:27.923539
data saved to: G.pickle
number of song files: 38519 H
10000
20000
30000
all data extracted in: 1:19:28.295700
data saved to: H.pickle
