This Notebook has the single purpose of creating a single file out of the resource found here: https://labs.acousticbrainz.org/million-song-dataset-echonest-archive/

Because the Million Song Dataset was created with IDs that are now outdated, this mapping makes it possible to remap those IDs to the IDs used by other APIs such as Spotify.

1. Download file from link ftp://ftp.acousticbrainz.org/pub/acousticbrainz/acousticbrainz-labs/download/msdrosetta/millionsongdataset_echonest.tar.bz2
2. Unpack in '/data/raw/MillionSongFullSummary/mapping/' (~3.5GB)
3. Run this notebook to create 'mapping_summary.csv' in 'data/interim/' (reading these files is very resource intensive, so the notebook writes temporary checkpoint files along the way; they are removed once the files have been merged)
4. You can now run the notebook -> Million Song Data Set Wrangling

In [1]:
import os
import re
import h5py
import pandas as pd
from glob import glob, iglob
import json

In [2]:
# Switch the working directory to the raw-data folder; the paths used in the
# following cells ('MillionSongFullSummary/...', '../interim/') are relative to it.
%cd ../data/raw

/Users/sebastian/git_repos/data_science/hit_predictor/data/raw


In [5]:
# Where results go (relative to the current working directory) and the name of
# the sub-directory that holds the per-chunk checkpoint files.
output_filepath = '../interim/'
temp_dir = 'mapping_temp'
# Highest checkpoint number already on disk; chunks up to this number are
# skipped when the processing loop is (re)run.
skip = 0

# makedirs also creates a missing '../interim/' and is a no-op when the
# checkpoint directory already exists.
os.makedirs(output_filepath + temp_dir, exist_ok=True)

# Raw string with an escaped dot so only names like 'mapping_<number>.csv' match.
checkpoint_name = re.compile(r'mapping_([0-9]+)\.csv$')
for file in iglob(output_filepath + temp_dir + '/*.csv'):
    m = checkpoint_name.search(file)
    if m:
        skip = max(skip, int(m.group(1)))

# glob returns matches in arbitrary order. The resume logic identifies work by
# chunk index, so the file list must be identical on every run; sorting
# guarantees that chunk N always contains the same files.
filenames_all = sorted(glob('MillionSongFullSummary/mapping/**/*.json', recursive=True))

def chunks(l, n):
    """Lazily split l into consecutive slices of at most n items.

    Each yielded slice has n items, except possibly the last one, which holds
    whatever remains. l must support len() and slicing.
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

# pandas >= 1.0 exposes json_normalize at the top level and deprecates the
# pandas.io.json location; prefer the new name and fall back for older versions.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# Process the JSON files in chunks of 10000 and write one checkpoint CSV per
# chunk, so an interrupted run can resume after the last finished chunk.
mapped_songs_dfs = []
for i, filenames in enumerate(chunks(filenames_all, 10000)):
    file_nr = i + 1  # checkpoint files are numbered starting at 1
    if file_nr <= skip:
        print('skip:', file_nr)
        continue
    print('work on:', file_nr)
    for filename in filenames:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filename, encoding='utf-8') as data_file:
            data = json.load(data_file)
        # One row per entry in each song's 'tracks' list, with the song-level
        # fields repeated on every row.
        mapped_song = json_normalize(data=data['response']['songs'], record_path=['tracks'],
                                     record_prefix='tracks.',
                                     meta=['artist_name', 'artist_id', 'id', 'title'])
        # Fix the column set and order (columns missing in the data become NaN).
        mapped_song = mapped_song.reindex(
            ['tracks.catalog', 'tracks.foreign_id', 'tracks.id', 'artist_name', 'id', 'title'],
            axis='columns')
        # Keep only the rows whose catalog is 'spotify'.
        mapped_song = mapped_song.loc[mapped_song['tracks.catalog'] == 'spotify', :]
        mapped_songs_dfs.append(mapped_song)

    mapping_df = pd.concat(mapped_songs_dfs).reset_index(drop=True)
    # Write to a '.part' file first and rename afterwards: a run interrupted
    # mid-write must not leave a truncated 'mapping_<n>.csv' behind that a later
    # run would treat as a finished chunk.
    checkpoint_path = output_filepath + temp_dir + '/mapping_' + str(file_nr) + '.csv'
    partial_path = checkpoint_path + '.part'
    mapping_df.to_csv(partial_path, sep='\t', index=False, encoding='utf-8')
    os.replace(partial_path, checkpoint_path)
    # Start the next chunk with an empty accumulator to keep memory bounded.
    mapped_songs_dfs = []

print('done')

skip: 1
skip: 2
skip: 3
skip: 4
skip: 5
skip: 6
skip: 7
skip: 8
skip: 9
skip: 10
skip: 11
skip: 12
skip: 13
skip: 14
skip: 15
skip: 16
skip: 17
skip: 18
skip: 19
skip: 20
skip: 21
skip: 22
skip: 23
skip: 24
skip: 25
skip: 26
skip: 27
skip: 28
skip: 29
skip: 30
skip: 31
skip: 32
skip: 33
skip: 34
skip: 35
skip: 36
skip: 37
skip: 38
skip: 39
skip: 40
skip: 41
skip: 42
skip: 43
skip: 44
skip: 45
skip: 46
skip: 47
skip: 48
skip: 49
skip: 50
skip: 51
skip: 52
skip: 53
skip: 54
skip: 55
skip: 56
skip: 57
skip: 58
skip: 59
skip: 60
skip: 61
skip: 62
skip: 63
skip: 64
skip: 65
skip: 66
skip: 67
skip: 68
skip: 69
skip: 70
skip: 71
skip: 72
skip: 73
skip: 74
skip: 75
skip: 76
skip: 77
skip: 78
skip: 79
skip: 80
skip: 81
skip: 82
skip: 83
skip: 84
skip: 85
skip: 86
skip: 87
skip: 88
skip: 89
skip: 90
skip: 91
skip: 92
skip: 93
skip: 94
skip: 95
skip: 96
skip: 97
skip: 98
skip: 99
skip: 100
done


In [7]:
def _checkpoint_order(path):
    """Sort key: numeric chunk number first, then the path for non-numeric names."""
    m = re.search(r'mapping_([0-9]+)\.csv$', path)
    return (int(m.group(1)) if m else float('inf'), path)


# glob yields matches in arbitrary order; ordering the checkpoints by chunk
# number makes the row order of the merged file reproducible between runs.
subfile_paths = sorted(iglob(output_filepath + temp_dir + '/mapping_*.csv'), key=_checkpoint_order)
if not subfile_paths:
    # pd.concat would fail with an opaque "No objects to concatenate" error.
    raise FileNotFoundError('No checkpoint files found in ' + output_filepath + temp_dir)

subfiles_mapping = [pd.read_csv(file, sep='\t') for file in subfile_paths]

# Merge all checkpoints into one tab-separated summary file.
mapping_df = pd.concat(subfiles_mapping).reset_index(drop=True)
mapping_df.to_csv(output_filepath + '/mapping_summary.csv', sep='\t', index=False, encoding='utf-8')

In [29]:
# Remove the temporary checkpoint files and their directory - cleaning up.
for file in iglob(output_filepath + temp_dir + '/mapping_*.csv'):
    os.remove(file)

# macOS Finder may have dropped a .DS_Store file here, which would make rmdir
# fail. Only a missing file is expected; any other error (e.g. permissions)
# should surface instead of being silently swallowed.
try:
    os.remove(output_filepath + temp_dir + '/.DS_Store')
except FileNotFoundError:
    print('No .DS_Store file. You\'re all set!')

# rmdir only succeeds when the directory is empty.
os.rmdir(output_filepath + temp_dir)