IMPORTANT: Before running this notebook, run the notebook "Create Mapping File from Echonest Mapping Project" to create 'mapping.csv' (if it is not already available).

Import the dataset and do an initial exploration to spot shifts and possible issues.

In [22]:
import json
import os
from glob import glob, iglob

import h5py
import numpy as np
import pandas as pd

In [23]:
%cd ../data/raw/
%ls

[Errno 2] No such file or directory: '../data/raw/'
/Users/sebastian/git_repos/data_science/hit_predictor/data/raw
[34mMillionSongFullSummary[m[m/ [34mfaq[m[m/


First, we'll look at the summary file.

In [24]:
# Open the Million Song summary file read-only and print its top-level keys.
data = h5py.File('MillionSongFullSummary/msd_summary_file.h5', mode='r')
for key in data:
    print(key)

analysis
metadata
musicbrainz


In [26]:
# Load Metadata
# Read the whole 'metadata/songs' dataset into memory and wrap it in a DataFrame.
# `Dataset.value` was deprecated in h5py 2.x and removed in h5py 3.0; indexing
# with an empty tuple (`[()]`) is the supported way to read the full dataset.
meta_data_songs = pd.DataFrame(data['metadata']['songs'][()])
meta_data_songs.head()

Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
0,b'',4069,0.649822,0.394032,b'ARYZTJS1187B98C555',,b'',,b'357ff05d-848a-44cf-b608-cb34b5701ae5',b'Faster Pussy cat',44895,b'',0,0,b'Monster Ballads X-Mas',633681,0.542899,b'SOQMMHC12AB0180CB8',b'Silent Night',7032331
1,b'',113480,0.439604,0.356992,b'ARMVN3U1187FB3A1EB',,b'',,b'8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9',b'Karkkiautomaatti',-1,b'',0,0,b'Karkuteill\xc3\xa4',145266,0.299877,b'SOVFVAK12A8C1350D9',b'Tanssi vaan',1514808
2,b'',63531,0.643681,0.437504,b'ARGEKB01187FB50750',55.8578,"b'Glasgow, Scotland'",-4.24251,b'3d403d44-36ce-465c-ad43-ae877e65adc4',b'Hudson Mohawke',-1,b'',0,0,b'Butter',625706,0.617871,b'SOGTUKN12AB017F4F1',b'No One Could Ever',6945353
3,b'',65051,0.448501,0.372349,b'ARNWYLR1187B9B2F9C',,b'',,b'12be7648-7094-495f-90e6-df4189d68615',b'Yerba Brava',34000,b'',0,0,b'De Culo',199368,,b'SOBNYVR12A8C13558C',b'Si Vos Quer\xc3\xa9s',2168257
4,b'',158279,0.0,0.0,b'AREQDTE1269FB37231',,b'',,b'',b'Der Mystic',-1,b'',0,0,b'Rene Ablaze Presents Winter Sessions',209038,,b'SOHSBXH12A8C13B0DF',b'Tangle Of Aspens',2264873


In [27]:
# Keep only the identifying columns; copy so later edits do not touch meta_data_songs.
meta_reduced = meta_data_songs[['artist_name', 'artist_mbid', 'title', 'song_id']].copy()

In [28]:
# Transform bytes to strings
byte_str_columns = ['artist_mbid','artist_name','song_id','title']


def _decode_if_bytes(value):
    """Return `value` decoded as UTF-8 if it is a bytes object, otherwise unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


for col in byte_str_columns:
    # Decode element-wise and only touch real bytes values. This keeps the cell
    # idempotent: `.str.decode` applied to already-decoded strings yields NaN,
    # so re-running the cell would silently wipe the column.
    meta_reduced[col] = meta_reduced[col].map(_decode_if_bytes)
    # Treat empty strings as missing values.
    meta_reduced.loc[meta_reduced[col] == '', col] = np.nan

# Test output
meta_reduced.head()

Unnamed: 0,artist_name,artist_mbid,title,song_id
0,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5,Silent Night,SOQMMHC12AB0180CB8
1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Tanssi vaan,SOVFVAK12A8C1350D9
2,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4,No One Could Ever,SOGTUKN12AB017F4F1
3,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615,Si Vos Querés,SOBNYVR12A8C13558C
4,Der Mystic,,Tangle Of Aspens,SOHSBXH12A8C13B0DF


In [43]:
# Load the tab-separated mapping file produced by the notebook
# "Create Mapping File from Echonest Mapping Project" and keep only the first
# row for every (song id, catalog) pair.
mapping_catalog_view = (
    pd.read_csv('../interim/mapping_summary.csv', sep='\t')
    .drop_duplicates(subset=['id', 'tracks.catalog'], keep='first')
)
mapping_catalog_view.head()

Unnamed: 0,tracks.catalog,tracks.foreign_id,tracks.id,artist_name,id,title
0,spotify,spotify:track:4UvblPNa9nwMTkzLtUBxnj,TRNAQME144D1525B11,Jackson Sisters,SOCSZMF12AAF3B3498,Why Do Fools Fall In Love
1,musicbrainz,musicbrainz:track:73ac9287-2056-4ec8-b5f2-5189...,TRXBBYC13B7C4E2FC5,Jackson Sisters,SOCSZMF12AAF3B3498,Why Do Fools Fall In Love
2,spotify,spotify:track:2biysHaswSmcpAM1T554BR,TROAEGA144D0FE9F1A,Journey,SOCSMOE12A8C1459E8,Live and Breathe
4,musicbrainz,musicbrainz:track:00f26e00-4907-4869-91e6-9e5c...,TREIEXB13B7B543E25,Journey,SOCSMOE12A8C1459E8,Live and Breathe
5,spotify,spotify:track:4YkMlmjR7KVwbZa3D3ek8N,TRYQDDR144D154494E,Chamillionaire,SOCBVBO1315CD46DFA,Think I'm Crazy


In [44]:
# Reshape to one row per song id, with one column per catalog holding that
# catalog's foreign id; `id` is turned back into a regular column afterwards.
ids_pivot_df = (
    mapping_catalog_view
    .pivot(
        index='id',
        columns='tracks.catalog',
        values='tracks.foreign_id',
    )
    .reset_index()
)
ids_pivot_df.head()

tracks.catalog,id,musicbrainz,spotify
0,SOAAAAI1313438F93F,,spotify:track:4rmDmhbsQCnotwzoBwIEDY
1,SOAAAAI13134396D63,musicbrainz:track:2ee83483-687e-4fef-90d5-3cb6...,spotify:track:4XfqVEMwPUMHPB96CXcxbU
2,SOAAABI12A8C13615F,,spotify:track:6i9aLGfG1Id1oCcpgmRjmE
3,SOAAABJ12A58A7A848,,spotify:track:5wvw53vrVv7dbEGQalRZtO
4,SOAAABL131712CF706,,spotify:track:1GkGoUh9pa99J1RYNCLxkk


In [46]:
# One row per song id (first occurrence wins), then attach the per-catalog
# foreign ids from the pivoted frame and keep only the columns of interest.
unique_songs_df = mapping_catalog_view.drop_duplicates(subset='id', keep='first')
mapping_df = pd.merge(unique_songs_df, ids_pivot_df, how='left', on='id')[
    ['id', 'tracks.id', 'artist_name', 'title', 'musicbrainz', 'spotify']
]
mapping_df.head()

Unnamed: 0,id,tracks.id,artist_name,title,musicbrainz,spotify
0,SOCSZMF12AAF3B3498,TRNAQME144D1525B11,Jackson Sisters,Why Do Fools Fall In Love,musicbrainz:track:73ac9287-2056-4ec8-b5f2-5189...,spotify:track:4UvblPNa9nwMTkzLtUBxnj
1,SOCSMOE12A8C1459E8,TROAEGA144D0FE9F1A,Journey,Live and Breathe,musicbrainz:track:00f26e00-4907-4869-91e6-9e5c...,spotify:track:2biysHaswSmcpAM1T554BR
2,SOCBVBO1315CD46DFA,TRYQDDR144D154494E,Chamillionaire,Think I'm Crazy,musicbrainz:track:119df7c9-66f2-4a22-9ca7-4fc1...,spotify:track:4YkMlmjR7KVwbZa3D3ek8N
3,SOCSZXL12A6D4F8145,TRCSAFR147B6EB3D98,Moonspell,Opium,musicbrainz:track:041f4e5f-c065-4d22-a858-e64e...,spotify:track:7oZHYB2HXjvgcMfu8tTqBA
4,SOCSBRR12A6D4FBEC2,TRJDLNF144D118CFAF,Oliver Huntemann,37 Grad (Arnaud Rebotini's Blackstrobe Remix),musicbrainz:track:9a631053-a49d-4d47-a306-b5ea...,spotify:track:3CSkXJgKEupR2mYx6CnW8c


In [37]:
# Left-join the song metadata onto the mapping table; a row matches only when
# song id, artist name and title all agree. Unmatched songs keep NaN ids.
meta_reduced_sp_ids = pd.merge(
    meta_reduced,
    mapping_df,
    how='left',
    left_on=['song_id', 'artist_name', 'title'],
    right_on=['id', 'artist_name', 'title'],
)
meta_reduced_sp_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1163102 entries, 0 to 1163101
Data columns (total 8 columns):
artist_name          1163102 non-null object
artist_mbid          1099146 non-null object
title                1163087 non-null object
song_id              1163102 non-null object
tracks.catalog       455026 non-null object
tracks.foreign_id    455026 non-null object
tracks.id            455026 non-null object
id                   455026 non-null object
dtypes: object(8)
memory usage: 79.9+ MB


In [None]:
# Preview the first rows of the merged frame.
meta_reduced_sp_ids.head(n=5)

In [17]:
# Build a frame of unique artists that have a MusicBrainz id: drop rows without
# an artist_mbid, then keep the first row seen for each artist name.
msd_artists_df = (
    meta_reduced_sp_ids[['artist_name', 'artist_mbid']]
    .dropna(subset=['artist_mbid'])
    .drop_duplicates(subset=['artist_name'], keep='first')
)
msd_artists_df.head()

Unnamed: 0,artist_name,artist_mbid
0,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5
1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9
2,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4
3,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615
5,David Montgomery,d087b377-bab7-46c4-bd12-15debebb5d61


In [18]:
# Write the unique-artist table as a tab-separated UTF-8 file (no index column)
# into the interim data directory.
output_dir = '../interim/'
target = 'msd_artists.csv'
msd_artists_df.to_csv(
    output_dir + target,
    index=False,
    sep='\t',
    encoding='utf-8',
)