In [7]:
import pandas as pd
import numpy as np

In [8]:
# Read Echonest Taste profile dataset
taste = pd.read_csv('./data/train_triplets.txt', sep='\t', header=None, names=['user_id','song_id','play_count'])

In [9]:
taste.shape, taste.song_id.unique().shape

((48373586, 3), (384546,))

In [10]:
# Read million songs data
import sqlite3

In [11]:
# path to the Million Song Dataset sond metadat (sqlite3)
msd_meta_path='./data/track_metadata.db'

In [12]:
# connect to the SQLite database
conn = sqlite3.connect(msd_meta_path)
# from that connection, get a cursor to do queries
c = conn.cursor()
# so there is no confusion, the table name is 'songs'
TABLENAME = 'songs'

In [13]:
# list all tables in that dataset
# note that sqlite does the actual job when we call fetchall() or fetchone()
q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
res = c.execute(q)
print "* tables contained in that SQLite file/database (should be only 'songs'):"
print res.fetchall()

* tables contained in that SQLite file/database (should be only 'songs'):
[(u'songs',)]


In [14]:
q = "SELECT * FROM songs WHERE song_id = 'SOABAAU12AB01860F8'"
res = c.execute(q)
print res.fetchall()

[(u'TRBUDFM12903C9E25E', u'Leben in Dankbarkeit', u'SOABAAU12AB01860F8', u'Musik mit Herz und Schwung', u'ARRKCMU1269FB34D23', u'6871bb8e-e91c-4f93-bfdb-68bf9489e179', u'Die Mooskirchner', 208.37832, 0.327061846969, 0.0, 0, 7304610, -1, 0)]


In [15]:
q = "SELECT song_id, title, artist_id, artist_name FROM songs"
res = c.execute(q)

song_list = []
for song in res.fetchall():
    song_list.append(song)

In [16]:
df_song = pd.DataFrame(song_list, columns=['song_id','song_title','artist_id','artist_name'])

In [17]:
taste_song = taste.drop_duplicates('song_id')

In [18]:
relevant_song = pd.merge(taste_song, df_song, how='inner', on='song_id')

In [19]:
relevant_song.shape

(385256, 6)

In [20]:
import json
import os

In [85]:
directory = './millionsongdataset_echonest_mapping'
prefix = 'BN'
song_id_file = 'SOBNZDC12A6D4FC103' + '.json'
file_path = os.path.join(directory,prefix,song_id_file)

In [86]:
file_path

'./millionsongdataset_echonest_mapping/BN/SOBNZDC12A6D4FC103.json'

In [87]:
with open(file_path) as json_file:    
    map_json = json.load(json_file)

In [88]:
map_json

{u'response': {u'songs': [],
  u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}}}

In [60]:
map_json['response']['songs'][0]['tracks'][0]['foreign_id']

u'7digital-UK:track:2093263'

In [62]:
[x['foreign_id'] for x in map_json['response']['songs'][0]['tracks'] if x['catalog'] == 'spotify']

[u'spotify:track:4nuY11Vo1Yd5xMHhLfDKe9',
 u'spotify:track:6d4xNwJPOI3VmvG8tCIYDI']

In [61]:
for track in map_json['response']['songs'][0]['tracks']:
    if track['catalog'] == 'spotify':
        print track['foreign_id']

spotify:track:4nuY11Vo1Yd5xMHhLfDKe9
spotify:track:6d4xNwJPOI3VmvG8tCIYDI


In [35]:
relevant_song.head()

Unnamed: 0,user_id,song_id,play_count,song_title,artist_id,artist_name
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,ARC8CQZ1187B98DECA,Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,Nothing from Nothing,ARHYUI71187FB48366,Billy Preston
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,ARC1SF21187FB51D0F,Paco De Lucia
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,Under Cold Blue Stars,ARMS97V1187B99DD98,Josh Rouse
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,Riot Radio (Soundtrack Version),ARI4V0E1187B9B6FC2,The Dead 60s


In [36]:
relevant_song['song_prefix'] = relevant_song['song_id'].map(lambda x: x[2:4])

In [75]:
def read_song_json(song_prefix, song_id):
    directory = './millionsongdataset_echonest_mapping'
    prefix = song_prefix
    song_id_file = song_id + '.json'
    file_path = os.path.join(directory,prefix,song_id_file)
    
    try:
        with open(file_path) as json_file:    
            map_json = json.load(json_file)

            if map_json:
                #return map_json['response']['songs'][0]['tracks'][0]['foreign_id']
                # In case there are multiple Spotify ID, return only 1 ID
                spotify_tracks = [x['foreign_id'] for x in map_json['response']['songs'][0]['tracks'] if x['catalog'] == 'spotify']
                return spotify_tracks[0]
            return 'SpotifyNA'
    except:
        return 'MappingNA'
    json_file.close()

In [76]:
test_song = relevant_song.head(10).copy()

In [77]:
test_song['spotify_id'] = test_song[['song_prefix','song_id']].apply(lambda x: read_song_json(x[0],x[1]), axis=1)

In [78]:
test_song

Unnamed: 0,user_id,song_id,play_count,song_title,artist_id,artist_name,song_prefix,spotify_id
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,ARC8CQZ1187B98DECA,Jack Johnson,AK,spotify:track:4nuY11Vo1Yd5xMHhLfDKe9
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,Nothing from Nothing,ARHYUI71187FB48366,Billy Preston,AP,MappingNA
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,ARC1SF21187FB51D0F,Paco De Lucia,BB,MappingNA
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,Under Cold Blue Stars,ARMS97V1187B99DD98,Josh Rouse,BF,MappingNA
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,Riot Radio (Soundtrack Version),ARI4V0E1187B9B6FC2,The Dead 60s,BF,spotify:track:2HPwuvT6HKngPSynOCi6Lb
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1,Sin límites (I),ARGYPGG1187B9923B5,Amset,BN,MappingNA
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBSUJE12A6D4F8CF5,2,12 segundos de oscuridad,AR1BQ6Y1187B98D586,Jorge Drexler,BS,MappingNA
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBVFZR12A6D4F8AE3,1,Ears To The Ground (Album Version),ARMS97V1187B99DD98,Josh Rouse,BV,spotify:track:5WjXULJvSlMxuies9diz1Q
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXALG12A8C13C108,1,Food Chain (Album Version),ARG3I891187FB3A391,Eric Hutchinson,BX,spotify:track:1bqi9YEdZweX9H6JuSQ6Qw
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,ARRH63Y1187FB47783,Kanye West,BX,MappingNA


In [79]:
relevant_song.shape

(385256, 7)

In [80]:
relevant_song['spotify_id'] = relevant_song[['song_prefix','song_id']].apply(lambda x: read_song_json(x[0],x[1]), axis=1)

In [81]:
relevant_song.head()

Unnamed: 0,user_id,song_id,play_count,song_title,artist_id,artist_name,song_prefix,spotify_id
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,ARC8CQZ1187B98DECA,Jack Johnson,AK,spotify:track:4nuY11Vo1Yd5xMHhLfDKe9
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,Nothing from Nothing,ARHYUI71187FB48366,Billy Preston,AP,MappingNA
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,ARC1SF21187FB51D0F,Paco De Lucia,BB,MappingNA
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,Under Cold Blue Stars,ARMS97V1187B99DD98,Josh Rouse,BF,MappingNA
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,Riot Radio (Soundtrack Version),ARI4V0E1187B9B6FC2,The Dead 60s,BF,spotify:track:2HPwuvT6HKngPSynOCi6Lb


In [82]:
len(relevant_song[relevant_song['spotify_id']=='MappingNA']), len(relevant_song[relevant_song['spotify_id']=='SpotifyNA'])

(204443, 0)

In [84]:
len(relevant_song), relevant_song.shape

(385256, (385256, 8))

In [90]:
relevant_song.to_csv('./data/songs_for_spotify_download.csv', index=False, encoding='utf-8')

In [91]:
to_download = relevant_song[relevant_song['spotify_id'] != 'MappingNA']

In [92]:
to_download.shape

(180813, 8)

In [93]:
to_download.to_csv('./data/songs_for_spotify_download.csv', index=False, encoding='utf-8')

NameError: name 'to_download' is not defined