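IMPORTANT: Before running this notebook, make sure to run the notebook "Create Mapping File from Echonest Mapping Project" first in order to create the mapping file 'mapping.csv' (if it is not already available). Note that the code below reads the mapping from '../interim/mapping_summary.csv'.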

Import the dataset and do an initial exploration to spot shifts and possible issues.

In [1]:
import h5py
import numpy as np
import pandas as pd
from glob import glob, iglob
import json

In [2]:
# Switch the working directory to the raw-data folder. The relative paths used
# in later cells ('MillionSongFullSummary/...', '../interim/...') are resolved
# from this directory.
# NOTE: the path is relative to the current working directory, so this cell is
# not safe to re-run without restarting the kernel.
%cd ../data/raw/
# List the directory contents as a quick sanity check.
%ls

/Users/sebastian/git_repos/data_science/hit_predictor/data/raw
[34mMillionSongFullSummary[m[m/ [34mfaq[m[m/


First we'll look at the summary file

In [3]:
# Open the summary file read-only and print the names of its top-level groups.
# The handle is kept open because later cells read from `data`.
data = h5py.File('MillionSongFullSummary/msd_summary_file.h5', mode='r')
for group_name in data:
    print(group_name)

analysis
metadata
musicbrainz


In [4]:
# Load Metadata
meta_data_songs = pd.DataFrame(data['metadata']['songs'].value)
meta_data_songs.head()



Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
0,b'',4069,0.649822,0.394032,b'ARYZTJS1187B98C555',,b'',,b'357ff05d-848a-44cf-b608-cb34b5701ae5',b'Faster Pussy cat',44895,b'',0,0,b'Monster Ballads X-Mas',633681,0.542899,b'SOQMMHC12AB0180CB8',b'Silent Night',7032331
1,b'',113480,0.439604,0.356992,b'ARMVN3U1187FB3A1EB',,b'',,b'8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9',b'Karkkiautomaatti',-1,b'',0,0,b'Karkuteill\xc3\xa4',145266,0.299877,b'SOVFVAK12A8C1350D9',b'Tanssi vaan',1514808
2,b'',63531,0.643681,0.437504,b'ARGEKB01187FB50750',55.8578,"b'Glasgow, Scotland'",-4.24251,b'3d403d44-36ce-465c-ad43-ae877e65adc4',b'Hudson Mohawke',-1,b'',0,0,b'Butter',625706,0.617871,b'SOGTUKN12AB017F4F1',b'No One Could Ever',6945353
3,b'',65051,0.448501,0.372349,b'ARNWYLR1187B9B2F9C',,b'',,b'12be7648-7094-495f-90e6-df4189d68615',b'Yerba Brava',34000,b'',0,0,b'De Culo',199368,,b'SOBNYVR12A8C13558C',b'Si Vos Quer\xc3\xa9s',2168257
4,b'',158279,0.0,0.0,b'AREQDTE1269FB37231',,b'',,b'',b'Der Mystic',-1,b'',0,0,b'Rene Ablaze Presents Winter Sessions',209038,,b'SOHSBXH12A8C13B0DF',b'Tangle Of Aspens',2264873


In [5]:
# Keep only the identifying columns; .copy() detaches the result from the full table
# so it can be modified in the following cell.
meta_reduced = meta_data_songs[['artist_name', 'artist_mbid', 'title', 'song_id']].copy()

In [6]:
# Decode the byte-string columns to text and mark empty strings as missing.
# NOTE: this updates `meta_reduced` in place.
byte_str_columns = ['artist_mbid', 'artist_name', 'song_id', 'title']
for column in byte_str_columns:
    # Only non-null entries are decoded.
    has_value = meta_reduced[column].notnull()
    meta_reduced.loc[has_value, column] = meta_reduced.loc[has_value, column].str.decode('utf-8')
    # Empty strings carry no information; store them as NaN instead.
    is_empty = meta_reduced[column] == ''
    meta_reduced.loc[is_empty, column] = np.nan

# Preview the decoded frame
meta_reduced.head()

Unnamed: 0,artist_name,artist_mbid,title,song_id
0,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5,Silent Night,SOQMMHC12AB0180CB8
1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Tanssi vaan,SOVFVAK12A8C1350D9
2,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4,No One Could Ever,SOGTUKN12AB017F4F1
3,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615,Si Vos Querés,SOBNYVR12A8C13558C
4,Der Mystic,,Tangle Of Aspens,SOHSBXH12A8C13B0DF


In [7]:
# Load the tab-separated mapping file (see notebook "Create Mapping File from
# Echonest Mapping Project") and keep the first row for every (song id, catalog) pair.
mapping_catalog_view = (
    pd.read_csv('../interim/mapping_summary.csv', sep='\t')
    .drop_duplicates(subset=['id', 'tracks.catalog'], keep='first')
)
mapping_catalog_view.head()

Unnamed: 0,tracks.catalog,tracks.foreign_id,tracks.id,artist_name,id,title
0,spotify,spotify:track:4UvblPNa9nwMTkzLtUBxnj,TRNAQME144D1525B11,Jackson Sisters,SOCSZMF12AAF3B3498,Why Do Fools Fall In Love
1,musicbrainz,musicbrainz:track:73ac9287-2056-4ec8-b5f2-5189...,TRXBBYC13B7C4E2FC5,Jackson Sisters,SOCSZMF12AAF3B3498,Why Do Fools Fall In Love
2,spotify,spotify:track:2biysHaswSmcpAM1T554BR,TROAEGA144D0FE9F1A,Journey,SOCSMOE12A8C1459E8,Live and Breathe
4,musicbrainz,musicbrainz:track:00f26e00-4907-4869-91e6-9e5c...,TREIEXB13B7B543E25,Journey,SOCSMOE12A8C1459E8,Live and Breathe
5,spotify,spotify:track:4YkMlmjR7KVwbZa3D3ek8N,TRYQDDR144D154494E,Chamillionaire,SOCBVBO1315CD46DFA,Think I'm Crazy


In [20]:
# Reshape from long to wide: one row per song id, one column per catalog,
# each cell holding that catalog's foreign id for the song.
ids_pivot_df = (
    mapping_catalog_view
    .pivot(index='id', columns='tracks.catalog', values='tracks.foreign_id')
    .reset_index()
)
ids_pivot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762338 entries, 0 to 762337
Data columns (total 3 columns):
id             762338 non-null object
musicbrainz    418856 non-null object
spotify        750359 non-null object
dtypes: object(3)
memory usage: 17.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 762338 entries, 0 to 3041114
Data columns (total 6 columns):
tracks.catalog       762338 non-null object
tracks.foreign_id    762338 non-null object
tracks.id            762338 non-null object
artist_name          762338 non-null object
id                   762338 non-null object
title                762338 non-null object
dtypes: object(6)
memory usage: 40.7+ MB


In [22]:
# One row per song id (first occurrence wins), then attach the per-catalog
# id columns produced by the pivot.
unique_songs_df = mapping_catalog_view.drop_duplicates(subset='id', keep='first')
mapping_df = pd.merge(unique_songs_df, ids_pivot_df, how='left', on='id')
mapping_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 762338 entries, 0 to 762337
Data columns (total 8 columns):
tracks.catalog       762338 non-null object
tracks.foreign_id    762338 non-null object
tracks.id            762338 non-null object
artist_name          762338 non-null object
id                   762338 non-null object
title                762338 non-null object
musicbrainz          418856 non-null object
spotify              750359 non-null object
dtypes: object(8)
memory usage: 52.3+ MB


In [28]:
# Left-join the mapping onto the MSD songs so that every MSD row is kept.
# Columns present in both frames (artist_name, title) come back with
# _x (MSD side) and _y (mapping side) suffixes.
meta_reduced_sp_ids = pd.merge(
    meta_reduced,
    mapping_df,
    how='left',
    left_on=['song_id'],
    right_on=['id'],
)
matched_columns = ['artist_name_x', 'id', 'title_x', 'musicbrainz', 'spotify']
meta_matched = meta_reduced_sp_ids.loc[:, matched_columns]
meta_matched.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
artist_name_x    1000000 non-null object
id               435081 non-null object
title_x          999985 non-null object
musicbrainz      239801 non-null object
spotify          426992 non-null object
dtypes: object(5)
memory usage: 45.8+ MB


Unnamed: 0,artist_name_x,artist_mbid,title_x,song_id,tracks.catalog,tracks.foreign_id,tracks.id,artist_name_y,id,title_y,musicbrainz,spotify
6,Sasha / Turbulence,d2461c0a-5575-4425-a225-fce0180de3fd,We Have Got Love,SOQVRHI12A6D4FB2D7,spotify,spotify:track:5zvuyMMCl5TQrEefdMSERe,TRLKEIW144D1988CE0,Sasha,SOQVRHI12A6D4FB2D7,We Have Got Love,,spotify:track:5zvuyMMCl5TQrEefdMSERe
9,The Sun Harbor's Chorus-Documentary Recordings,,Mama_ mama can't you see ?,SOJCFMH12A8C13B0C2,spotify,spotify:track:4U2ryP1lJ09IeWA5tBpq3R,TRRWKLR144D14C1B5C,US Marines,SOJCFMH12A8C13B0C2,"Mama, Mama, Can't You See?",musicbrainz:track:f47b1dc9-61ef-4fee-b1b1-4f3b...,spotify:track:4U2ryP1lJ09IeWA5tBpq3R
46,Lil O,8f1b6974-6529-4e27-8ba9-f2ef2aa7fbe3,My Everything [Screwed] (feat. Trae The Truth),SONYUEW12AB018B373,spotify,spotify:track:1hVwAWqaIUMMzobJXB5b4P,TRHQLDI144D0D568EA,Lil' O,SONYUEW12AB018B373,My Everything [Screwed] (feat. Trae The Truth),,spotify:track:1hVwAWqaIUMMzobJXB5b4P
55,Ray Conniff;Billy Butterfield,b84dfb67-c437-4ccc-b512-1073fd4612eb,Heartaches,SOCBSOR12A8C1314DD,spotify,spotify:track:0aHv8wVoW1X0x8APahweZs,TRLSUQZ144D0A8B5FB,Ray Conniff,SOCBSOR12A8C1314DD,Heartaches,,spotify:track:0aHv8wVoW1X0x8APahweZs
66,JOHN DOE,804adff4-9f7b-49d9-9585-6c73fb33e5aa,Suffer,SOUVPJZ12AB018218C,spotify,spotify:track:4kQYePtwvIZ8P36f2hRorh,TRWJWOE144D15D775B,John Doe,SOUVPJZ12AB018218C,Suffer,,spotify:track:4kQYePtwvIZ8P36f2hRorh
68,Tonex,3e9c5302-782c-4bd9-99f0-76ed93c38f3a,Bring It,SOGBBEE12AB017ED6A,spotify,spotify:track:0gIW0Edu0SVRNUJXgZ8v1c,TRIQLXC144D0ACAE68,Tonéx,SOGBBEE12AB017ED6A,Bring It,,spotify:track:0gIW0Edu0SVRNUJXgZ8v1c
78,Kruiz,50eaf061-e265-434a-937f-09289270d08a,Dalny svet (bonus 2003),SODXEOD12AB018EC03,spotify,spotify:track:5H3Zt7j5UKZtzVEP3wJXrM,TRIQEYD144D17674BF,Круиз,SODXEOD12AB018EC03,Dalny svet (bonus 2003),,spotify:track:5H3Zt7j5UKZtzVEP3wJXrM
94,Alice In Videoland,6a5f768c-db43-461a-8e58-658084ebe84d,Stuck On My Vision,SOCKLYJ12A8C14017E,spotify,spotify:track:5QoQvK5Y64k7uyzaLRtrIb,TRBVVOQ144D17D7654,Alice in Videoland,SOCKLYJ12A8C14017E,Stuck On My Vision,musicbrainz:track:0e8e7d1b-b4ec-4659-9de4-b645...,spotify:track:5QoQvK5Y64k7uyzaLRtrIb
102,Ice Cube Featuring Chuck D,1d11e2a1-4531-4d61-a8c7-7b5c6a608fd2,Endangered Species (Tales From The Darkside) (...,SOADQCE12A6D4F64B3,spotify,spotify:track:3nJmja2ziyQLmEUXdgO6Gw,TRETYWE14C5E8DAA89,Ice Cube,SOADQCE12A6D4F64B3,Endangered Species (Tales From The Darkside) (...,,spotify:track:3nJmja2ziyQLmEUXdgO6Gw
153,Zen Cafe,6ca14d5e-3a33-4696-9e82-bdebe070838e,En oo koskaan tavannut,SOIGUQJ12A6D4FAB11,spotify,spotify:track:4IQnn2fg0KbMVQjtMA2ngF,TRMOFBE144D14A3C63,Zen Café,SOIGUQJ12A6D4FAB11,En oo koskaan tavannut,musicbrainz:track:ff905660-c0df-42bf-b0a8-ee1f...,spotify:track:4IQnn2fg0KbMVQjtMA2ngF


In [11]:
# Preview the first five rows of the merged frame
meta_reduced_sp_ids.head(n=5)

Unnamed: 0,artist_name,artist_mbid,title,song_id,id,tracks.id,musicbrainz,spotify
0,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5,Silent Night,SOQMMHC12AB0180CB8,,,,
1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Tanssi vaan,SOVFVAK12A8C1350D9,SOVFVAK12A8C1350D9,TRXYTBQ144D1A0B175,musicbrainz:track:91d584bf-eed8-40cd-88e5-102d...,spotify:track:6DOmOjeTc3btomrfFfPgy8
2,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4,No One Could Ever,SOGTUKN12AB017F4F1,SOGTUKN12AB017F4F1,TRETTUS144D13D0801,musicbrainz:track:537099d9-fb53-41d5-9b31-2d30...,spotify:track:41RpZW2lxAdnqDd2nMBzLQ
3,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615,Si Vos Querés,SOBNYVR12A8C13558C,SOBNYVR12A8C13558C,TRIDANV144D1DF4003,musicbrainz:track:881ff6e5-4949-4924-b9fb-3508...,spotify:track:7z4BZV7eZO1bqVKwAeTmou
4,Der Mystic,,Tangle Of Aspens,SOHSBXH12A8C13B0DF,,,,


In [12]:
# Generate a DataFrame with unique artists that have a MusicBrainz id.
#
# Built from `meta_reduced` instead of the merged `meta_reduced_sp_ids`:
# merging with `mapping_df` (which also has an `artist_name` column) renames
# the MSD column to `artist_name_x`, so selecting `artist_name` on the merged
# frame raises a KeyError when the notebook is run top to bottom.
# Both columns used here come from `meta_reduced`, and the left join neither
# adds nor drops rows (`mapping_df` holds one row per id), so the result is
# the same set of artists.
msd_artists_df = (
    meta_reduced
    .dropna(subset=['artist_mbid'])                          # artists without an mbid are excluded
    .drop_duplicates(subset=['artist_name'], keep='first')   # first occurrence per artist name wins
    .loc[:, ['artist_name', 'artist_mbid']]
)
msd_artists_df.head()

Unnamed: 0,artist_name,artist_mbid
0,Faster Pussy cat,357ff05d-848a-44cf-b608-cb34b5701ae5
1,Karkkiautomaatti,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9
2,Hudson Mohawke,3d403d44-36ce-465c-ad43-ae877e65adc4
3,Yerba Brava,12be7648-7094-495f-90e6-df4189d68615
5,David Montgomery,d087b377-bab7-46c4-bd12-15debebb5d61


In [13]:
# Write the unique-artist table to the interim folder as a tab-separated,
# UTF-8 encoded file without the index column.
output_dir = '../interim/'
target = 'msd_artists.csv'
output_path = output_dir + target
msd_artists_df.to_csv(path_or_buf=output_path, sep='\t', index=False, encoding='utf-8')