In [1]:
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np

In [2]:
# Location of the uncompressed Million Song Dataset subset.
msd_subset_path = './MillionSongSubset'
# Sub-folders used below: 'data' is walked for the song files,
# 'AdditionalFiles' is where the SQLite track metadata database is opened from.
msd_subset_data_path, msd_subset_addf_path = (
    os.path.join(msd_subset_path, 'data'),
    os.path.join(msd_subset_path, 'AdditionalFiles'),
)
# sanity check: stop right away if the dataset folder is not where we expect it
assert os.path.isdir(msd_subset_path), 'wrong path'

In [5]:
# path to the Million Song Dataset code
# CHANGE IT TO YOUR LOCAL CONFIGURATION
#msd_code_path='/home/thierry/Columbia/MSongsDB'
#assert os.path.isdir(msd_code_path),'wrong path' # sanity check
# we add some paths to python so we can import MSD code
# Ubuntu: you can change the environment variable PYTHONPATH
# in your .bashrc file so you do not have to type these lines
#sys.path.append( os.path.join(msd_code_path,'PythonSrc') )

In [55]:
# imports specific to the MSD
import hdf5_getters as GETTERS

In [7]:
# small helper: turn the lag between two timestamps (in seconds)
# into a nice human-readable string
def strtimedelta(starttime,stoptime):
    """
    Format the time elapsed between two timestamps.
    INPUT
       starttime  - start time, in seconds
       stoptime   - stop time, in seconds
    RETURN
       str() of the corresponding datetime.timedelta,
       e.g. '0:01:05' for a lag of 65 seconds
    """
    elapsed = datetime.timedelta(seconds=stoptime-starttime)
    return str(elapsed)

In [8]:
# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.

    Directories and files are visited in sorted order, so two runs
    over the same tree call 'func' in the same sequence.
    File names are matched against the listing os.walk already
    produced, so directory names containing glob metacharacters
    ('[', '*', '?') are handled correctly.
    As with the previous glob-based matching, names starting with
    a dot are skipped.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames (it receives the
                  path of the file; its return value is ignored)
       ext      - extension, .h5 by default
    RETURN
       number of files
    """
    import fnmatch  # local import: only this helper needs it

    pattern = '*' + ext
    cnt = 0
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        # sorting 'dirs' in place makes os.walk descend in a fixed order
        dirs.sort()
        matches = sorted(name for name in fnmatch.filter(files, pattern)
                         if not name.startswith('.'))
        # count files
        cnt += len(matches)
        # apply function to all files
        for name in matches:
            func(os.path.join(root, name))
    return cnt

In [9]:
# we can now easily count the number of files in the dataset
print 'number of song files:',apply_to_all_files(msd_subset_data_path)

number of song files: 10000


In [10]:
# Accumulator for the artist names read from the song files.
# We use a set(): if the same artist is added many times,
# only one copy is kept.
all_artist_names = set()

In [11]:
# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist name and add it to the global set
      'all_artist_names'
    - close the file (also when reading the name fails, so the
      handle is never left open)
    INPUT
       filename  - path to the song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add( GETTERS.get_artist_name(h5) )
    finally:
        h5.close()

In [12]:
# let's apply the previous function to all files
# we'll also measure how long it takes:
# t1 / t2 are wall-clock timestamps taken right before and right
# after the full pass over the dataset
t1 = time.time()
apply_to_all_files(msd_subset_data_path,func=func_to_get_artist_name)
t2 = time.time()
print 'all artist names extracted in:',strtimedelta(t1,t2)

all artist names extracted in: 0:03:20.137843


In [13]:
# let's see some of the content of 'all_artist_names'
print 'found',len(all_artist_names),'unique artist names'
for k in range(5):
    print list(all_artist_names)[k]

found 4412 unique artist names
Groundhogs
Pale Forest
The Real Kids
JennyAnyKind
Little Willie John


In [14]:
# Walking every file takes too long, and the work of listing artist
# names has already been done. Let's redo the same task using an
# SQLite database: we connect to the track metadata database that
# comes with the subset.
track_metadata_db = os.path.join(msd_subset_addf_path,
                                 'subset_track_metadata.db')
conn = sqlite3.connect(track_metadata_db)

In [15]:
# every distinct artist id stored in the 'songs' table
q = "SELECT DISTINCT artist_id FROM songs"
res = conn.execute(q)
# each fetched row is a 1-tuple: keep only its single column
all_artist_ids = [row[0] for row in res.fetchall()]
conn.close()

In [16]:
# The Echo Nest artist id look like:
for k in range(4):
    print all_artist_ids[k]

AR009211187B989185
AR00A6H1187FB5402A
AR00LNI1187FB444A5
AR00MBZ1187B9B5DB1


In [17]:
# let's count the songs from each of these artists.
# We will do it first by iterating over the dataset.
# The counting dictionary starts with one entry per known artist id,
# every count set to 0.
files_per_artist = dict.fromkeys(all_artist_ids, 0)

In [18]:
# we prepare the function to check artist id in each file
def func_to_count_artist_id(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get the artist ID and increment its counter in the global
      dictionary 'files_per_artist'
    - close the file (also when something goes wrong in between)
    An artist ID that is not yet in 'files_per_artist' starts at 0
    instead of raising a KeyError, so one unexpected ID does not
    abort the pass over the whole dataset.
    INPUT
       filename  - path to the song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_id = GETTERS.get_artist_id(h5)
        files_per_artist[artist_id] = files_per_artist.get(artist_id, 0) + 1
    finally:
        h5.close()

In [19]:
# we apply this function to all files; the value displayed below
# is the number of files visited, as returned by apply_to_all_files
apply_to_all_files(msd_subset_data_path,func=func_to_count_artist_id)

10000

In [20]:
# the most popular artist (with the most songs) is:
most_pop_aid = sorted(files_per_artist,
                      key=files_per_artist.__getitem__,
                      reverse=True)[0]
print most_pop_aid,'has',files_per_artist[most_pop_aid],'songs.'

AROIHOI122988FEB8E has 13 songs.


In [21]:
# of course, it is more fun to have the name(s) of this artist
# let's get it using SQLite
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
q = "SELECT DISTINCT artist_name FROM songs"
q += " WHERE artist_id='"+most_pop_aid+"'"
res = conn.execute(q)
pop_artist_names = map(lambda x: x[0], res.fetchall())
conn.close()
print 'SQL query:',q
print 'name(s) of the most popular artist:',pop_artist_names

SQL query: SELECT DISTINCT artist_name FROM songs WHERE artist_id='AROIHOI122988FEB8E'
name(s) of the most popular artist: [u'Mario Rosenstock']


In [22]:
# let's redo all this work in SQLite in a few seconds
# (t1 / t2 bracket the connection, the query and the fetch)
t1 = time.time()
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
# one row per artist_id with the number of tracks of that artist
# NOTE(review): with GROUP BY artist_id the DISTINCT looks redundant,
# and artist_name is not aggregated - confirm which name SQLite
# returns when an artist id has several names
q = "SELECT DISTINCT artist_id,artist_name,Count(track_id) FROM songs"
q += " GROUP BY artist_id"
res = conn.execute(q)
pop_artists = res.fetchall()
conn.close()
t2 = time.time()
print 'found most popular artist in',strtimedelta(t1,t2)
# rows are (artist_id, artist_name, count): sort on the count,
# largest first, and show the top row
print sorted(pop_artists,key=lambda x:x[2],reverse=True)[0]

found most popular artist in 0:00:00.167159
(u'AROIHOI122988FEB8E', u'Mario Rosenstock', 13)


In [23]:
def encode_string(s):
    """
    Small helper that turns a string into a quoted SQLite string
    literal: every single quote is doubled and the result is wrapped
    in single quotes.
    (different than PostgreSQL, no N to specify unicode)
    EXAMPLE:
      That's my boy! -> 'That''s my boy!'
    """
    escaped = s.replace("'","''")
    return "'%s'" % (escaped,)

In [24]:
msd_subset_addf_path

'./MillionSongSubset/AdditionalFiles'

In [25]:
# path to the SQLite track metadata database of the subset
dbfile = os.path.join(msd_subset_addf_path,'subset_track_metadata.db')

In [26]:
# connect to the SQLite database
conn = sqlite3.connect(dbfile)

# from that connection, get a cursor to do queries
c = conn.cursor()

# so there is no confusion, the table name is 'songs'
# (the queries in the following cells spell 'songs' out directly)
TABLENAME = 'songs'

In [27]:
# list all tables in that dataset, using the sqlite_master catalog
# note that sqlite does the actual job when we call fetchall() or fetchone()
q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
res = c.execute(q)
print "* tables contained in that SQLite file/database (should be only 'songs'):"
print res.fetchall()

* tables contained in that SQLite file/database (should be only 'songs'):
[(u'songs',)]


In [28]:
# list all column names from table 'songs': the 'sql' column of
# sqlite_master holds the CREATE TABLE command of the table
q = "SELECT sql FROM sqlite_master WHERE tbl_name = 'songs' AND type = 'table'"
res = c.execute(q)
print '* get info on columns names (original table creation command):'
# first row, first column: the text of the CREATE TABLE command
print res.fetchall()[0][0]

* get info on columns names (original table creation command):
CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, release text, artist_id text, artist_mbid text, artist_name text, duration real, artist_familiarity real, artist_hotttnesss real, year int)


In [29]:
# list the indices of table 'songs'
# (the query selects all of them, fetchone() only shows the first one)
q = "SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='songs' ORDER BY name"
res = c.execute(q)
print '* one of the index we added to the table to make things faster:'
print res.fetchone()

* one of the index we added to the table to make things faster:
(u'idx_artist_id',)


In [30]:
# find an entry with The Beatles as artist_name
# returns all info (the full table row); LIMIT 1 keeps a single track
q = "SELECT * FROM songs WHERE artist_name='The Beatles' LIMIT 1"
res = c.execute(q)
print '* get all we have about one track from The Beatles:'
print res.fetchone()

* get all we have about one track from The Beatles:
(u'TRAHSSO128EF347345', u'Derek Taylor - Introduction', u'SORTPSA12A67ADBFE2', u'Here There And Everywhere', u'AR6XZ861187FB4CECD', u'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d', u'The Beatles', 53.78567, 0.840409662154, 0.840462688027, 0)


In [31]:
# select all artists whose artist familiarity is > .8
# (only the first matching row is fetched and printed)
q = "SELECT DISTINCT artist_name, artist_familiarity FROM songs WHERE artist_familiarity>.8"
res = c.execute(q)
print '* one artist having familiaryt >0.8:'
print res.fetchone()

* one artist having familiaryt >0.8:
(u'3 Doors Down', 0.840432466617)


In [32]:
# get one artist with the highest artist_familiarity but no artist_hotttnesss
# notice the alias af and ah, makes things more readable
q = "SELECT DISTINCT artist_name, artist_familiarity as af, artist_hotttnesss as ah"
q += " FROM songs WHERE ah<0 ORDER BY af"
res = c.execute(q)
print '* get the artist with the highest familiarity that has no computed hotttnesss:'
print res.fetchone()

* get the artist with the highest familiarity that has no computed hotttnesss:
None


In [33]:
# close the cursor and the connection
# (if for some reason you added rows to the db or altered
#  a table, you also need to call conn.commit() before closing)
c.close()
conn.close()

In [34]:
# accumulator: func_to_get_song_info appends one tuple per song file
song_info = []

In [37]:
# we define the function to apply to all files
def func_to_get_song_info(filename):
    """
    Read the song-level fields of one song file and append them,
    as a single tuple, to the global list 'song_info':
    - open the song file
    - call the getters below (the order of the tuple is the order
      of the names in 'song_fieldlist')
    - close the file, also when one of the getters fails
    - append the tuple to 'song_info' (nothing is appended on error)
    INPUT
       filename  - path to the song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        record = (
            GETTERS.get_song_id(h5),
            GETTERS.get_num_songs(h5),
            GETTERS.get_title(h5),                      # song_title
            GETTERS.get_artist_name(h5),
            GETTERS.get_artist_familiarity(h5),
            GETTERS.get_artist_hotttnesss(h5),
            GETTERS.get_release(h5),
            GETTERS.get_song_hotttnesss(h5),
            GETTERS.get_similar_artists(h5),
            GETTERS.get_artist_terms(h5),
            GETTERS.get_artist_terms_freq(h5),
            GETTERS.get_artist_terms_weight(h5),
            GETTERS.get_analysis_sample_rate(h5),
            GETTERS.get_duration(h5),
            GETTERS.get_danceability(h5),
            GETTERS.get_end_of_fade_in(h5),
            GETTERS.get_key(h5),                        # key_info
            GETTERS.get_key_confidence(h5),
            GETTERS.get_loudness(h5),
            GETTERS.get_mode(h5),
            GETTERS.get_mode_confidence(h5),
            GETTERS.get_time_signature(h5),             # number of beats per bar
            GETTERS.get_time_signature_confidence(h5),
            GETTERS.get_tempo(h5),
            GETTERS.get_energy(h5),
            GETTERS.get_year(h5),
        )
    finally:
        # never leave the file open, even if a getter raised
        h5.close()
    song_info.append(record)

In [38]:
# fill 'song_info' with one tuple per song file; the value displayed
# below is the number of files visited
apply_to_all_files(msd_subset_data_path,func=func_to_get_song_info)

10000

In [41]:
# names of the fields of the tuples stored in 'song_info', in tuple order
song_fieldlist = (
    # song and artist description
    'song_id', 'num_songs', 'song_title', 'artist_name',
    'artist_familiarity', 'artist_hotttnesss',
    'release', 'song_hotttnesss',
    # per-artist arrays
    'similar_artists', 'artist_terms',
    'artist_terms_freq', 'artist_terms_weight',
    # audio analysis summary
    'analysis_sample_rate', 'duration', 'danceability', 'end_of_fade_in',
    'key_info', 'key_confidence',
    'loudness', 'mode', 'mode_confidence',
    'time_signature', 'time_signature_confidence', 'tempo',
    'energy', 'year',
)

In [42]:
for song in song_info[:10]:
    for i in range(len(song_fieldlist)):
        print song_fieldlist[i], song[i]

song_id SOMZWCG12A8C13C480
num_songs 1
song_title I Didn't Mean To
artist_name Casual
artist_familiarity 0.581793765845
artist_hotttnesss 0.401997543364
release Fear Itself
song_hotttnesss 0.602119989906
similar_artists ['ARV4KO21187FB38008' 'ARWHM281187FB3D381' 'ARJGOG11187B98D89F'
 'AR9ODB41187FB459B2' 'ARXM6VQ1187FB5B1E0' 'ARNWZ1N1187B9B71BA'
 'ARDWYZZ11F4C8413FA' 'ARTP3H51187B98FB75' 'ARWCDXN12454A4D1E8'
 'ARJ54S61187B9ACD39' 'AR5PF241187B989C1D' 'ARR7MLL1187B99B636'
 'ARLMHFV1187B9A3833' 'ARPRERY1187B99E2DC' 'AR34BCQ1187B9A68E4'
 'ARFWBUC11F4C8413DA' 'ARPWGMN1187FB560E3' 'ARVCIVW12454A4D1E7'
 'ARG89HY1187FB3CA15' 'AR9IGU51187FB40D6B' 'ARNNOYR11F4C845127'
 'ARZMFNT11F4C8413DD' 'ARPR9W71187FB3723A' 'AR5VBGP1187B98EB43'
 'ARFHDOI1187FB57230' 'ARBSQPF11F4C8413E0' 'AROYGID11F4C8413DB'
 'ARDXUGZ11F4C84452F' 'ARMW4I01187B98AEF8' 'AR7AYQG1187B994B3F'
 'ARHVZEM11F4C841FF9' 'ARP9H0U1187FB3FEA7' 'ARVSIGU11F4C8413E6'
 'AROWKNS1187FB59ED5' 'ARUSTLW11F4C8413DE' 'ARSKPDX11F4C83D2A9'
 'ARB4D89118

In [45]:
# accumulator: func_to_get_song_analysis appends one tuple per song file
song_analysis = []

In [46]:
# we define the function to apply to all files
def func_to_get_song_analysis(filename):
    """
    Read the analysis fields (segments, sections, beats, bars,
    tatums) of one song file and append them, as a single tuple,
    to the global list 'song_analysis':
    - open the song file
    - call the getters below (the order of the tuple is the order
      of the names in 'song_analysis_field')
    - close the file, also when one of the getters fails
    - append the tuple to 'song_analysis' (nothing is appended on error)
    INPUT
       filename  - path to the song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        record = (
            GETTERS.get_song_id(h5),
            GETTERS.get_num_songs(h5),
            GETTERS.get_title(h5),                      # song_title
            GETTERS.get_artist_name(h5),
            GETTERS.get_duration(h5),
            GETTERS.get_segments_start(h5),
            GETTERS.get_segments_confidence(h5),
            GETTERS.get_segments_pitches(h5),           # segments_pitch
            GETTERS.get_segments_timbre(h5),
            GETTERS.get_segments_loudness_max(h5),
            GETTERS.get_segments_loudness_max_time(h5),
            GETTERS.get_segments_loudness_start(h5),
            GETTERS.get_sections_start(h5),
            GETTERS.get_sections_confidence(h5),
            GETTERS.get_beats_start(h5),
            GETTERS.get_beats_confidence(h5),
            GETTERS.get_bars_start(h5),
            GETTERS.get_bars_confidence(h5),
            GETTERS.get_tatums_start(h5),
            GETTERS.get_tatums_confidence(h5),
        )
    finally:
        # never leave the file open, even if a getter raised
        h5.close()
    song_analysis.append(record)

In [47]:
# fill 'song_analysis' with one tuple per song file; the value
# displayed below is the number of files visited
apply_to_all_files(msd_subset_data_path,func=func_to_get_song_analysis)

10000

In [48]:
# names of the fields of the tuples stored in 'song_analysis', in tuple order
song_analysis_field = (
    # song identification
    'song_id', 'num_songs', 'song_title', 'artist_name', 'duration',
    # segments
    'segments_start', 'segments_confidence',
    'segments_pitch', 'segments_timbre',
    'segments_loudness_max', 'segments_loudness_max_time',
    'segments_loudness_start',
    # sections, beats, bars and tatums
    'sections_start', 'sections_confidence',
    'beats_start', 'beats_confidence',
    'bars_start', 'bars_confidence',
    'tatums_start', 'tatums_confidence',
)

In [49]:
for song in song_analysis[:10]:
    for i in range(len(song_analysis_field)):
        print song_analysis_field[i], song[i]

song_id SOMZWCG12A8C13C480
num_songs 1
song_title I Didn't Mean To
artist_name Casual
duration 218.93179
segments_start [   0.         0.24671    0.47116    0.80376    0.89551    1.12626
    1.24499    1.75583    1.94417    2.09556    2.29578    2.6507
    2.94921    3.46254    3.81977    4.24916    4.78494    5.15755
    5.37029    5.98925    6.29506    6.54562    6.66032    6.96834
    7.74875    8.03528    8.21066    8.30553    8.70159    9.02494
    9.1824     9.29746    9.67465    9.83283   10.30118   10.65873
   10.97918   11.11492   11.27714   11.48522   11.71837   11.91048
   12.11868   12.27991   12.37664   12.50698   12.60544   12.84327
   12.97696   13.1132    13.26395   13.36036   13.46694   13.6478
   13.75025   13.92063   14.2576    14.4181    14.88054   15.00907
   15.22091   15.39896   15.55605   15.73556   15.87959   16.19687
   16.42921   16.52431   16.93692   17.13628   17.48236   17.7976
   18.10776   18.36005   18.53256   18.67465   18.79492   19.10798
   19.48041 

In [50]:
# accumulator meant for the per-song genre tuples
song_genre = []

In [51]:
def func_to_get_song_genre(filename):
    """
    Read the genre of one song file and append
    (song_id, num_songs, song_title, artist_name, genre)
    to the global list 'song_genre':
    - open the song file
    - get the identification fields and the genre
    - close the file, also when one of the getters fails
    - append the tuple to 'song_genre' (nothing is appended on error)
    INPUT
       filename  - path to the song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        record = (
            GETTERS.get_song_id(h5),
            GETTERS.get_num_songs(h5),
            GETTERS.get_title(h5),
            GETTERS.get_artist_name(h5),
            # NOTE(review): confirm that the getters module provides
            # get_genre; the cell applying this function is commented out
            GETTERS.get_genre(h5),
        )
    finally:
        # never leave the file open, even if a getter raised
        h5.close()
    # genre tuples go to their own list, not to 'song_analysis'
    song_genre.append(record)

In [57]:
#apply_to_all_files(msd_subset_data_path,func=func_to_get_song_genre)

In [58]:
len(song_info)

10000

In [60]:
len(song_info[0]), len(song_info[1])

(26, 26)

In [64]:
song_fieldlist[8:12]

('similar_artists', 'artist_terms', 'artist_terms_freq', 'artist_terms_weight')

In [65]:
import pandas as pd

In [67]:
# one row per tuple of 'song_info', columns named after 'song_fieldlist'
df_song = pd.DataFrame(song_info, columns=song_fieldlist)

In [68]:
df_song.head()

Unnamed: 0,song_id,num_songs,song_title,artist_name,artist_familiarity,artist_hotttnesss,release,song_hotttnesss,similar_artists,artist_terms,...,key_info,key_confidence,loudness,mode,mode_confidence,time_signature,time_signature_confidence,tempo,energy,year
0,SOMZWCG12A8C13C480,1,I Didn't Mean To,Casual,0.581794,0.401998,Fear Itself,0.60212,"[ARV4KO21187FB38008, ARWHM281187FB3D381, ARJGO...","[hip hop, underground rap, g funk, alternative...",...,1,0.736,-11.197,0,0.636,4,0.778,92.198,0.0,0
1,SOCIWDW12A8C13D406,1,Soul Deep,The Box Tops,0.63063,0.4175,Dimensions,,"[ARSZWK21187B9B26D7, ARLDW2Y1187B9B544F, ARG0T...","[blue-eyed soul, pop rock, blues-rock, beach m...",...,6,0.169,-9.843,0,0.43,4,0.384,121.274,0.0,1969
2,SOXVLOJ12AB0189215,1,Amor De Cabaret,Sonora Santanera,0.487357,0.343428,Las Numero 1 De La Sonora Santanera,,"[ARFSJUG11C8A421AAD, AR8SD041187FB36015, ARR75...","[salsa, cumbia, tejano, ranchera, latin pop, l...",...,8,0.643,-9.689,1,0.565,1,0.0,100.07,0.0,0
3,SONHOTT12A8C13493C,1,Something Girls,Adam Ant,0.630382,0.454231,Friend Or Foe,,"[AR4R0741187FB39AF2, AR0D7K21187B9AD14E, ARRCB...","[pop rock, new wave, dance rock, rock, new rom...",...,0,0.751,-9.013,1,0.749,4,0.0,119.293,0.0,1982
4,SOFSOCN12A8C143F5D,1,Face the Ashes,Gob,0.651046,0.401724,Muertos Vivos,0.604501,"[ARUA62A1187B99D9B0, ARHJFFY1187B98BA76, ARHB1...","[pop punk, ska punk, breakcore, alternative me...",...,2,0.092,-4.501,1,0.371,4,0.562,129.738,0.0,2007


In [69]:
df_song.shape

(10000, 26)

In [70]:
# the four per-artist columns (song_fieldlist[8:12]) are left out of
# the table; df_song itself is not modified
artist_array_columns = ['similar_artists', 'artist_terms',
                        'artist_terms_freq', 'artist_terms_weight']
df_song_no_artist = df_song.drop(artist_array_columns, axis=1)

In [71]:
df_song_no_artist.shape

(10000, 22)

In [73]:
# write the table (without the index column) to a CSV file;
# the output folder is created first so that the export does not fail
# when './data' does not exist yet
subset_song_info_csv = './data/subset_song_info.csv'
output_dir = os.path.dirname(subset_song_info_csv)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
df_song_no_artist.to_csv(subset_song_info_csv,index=False)