In [1]:
import pandas as pd
import numpy as np
import sqlite3
import os

In [2]:
# Load the Echo Nest Taste Profile triplets: a headerless, tab-separated
# file whose three fields are named here user_id, song_id and play_count.
triplet_columns = ['user_id', 'song_id', 'play_count']
taste = pd.read_csv('./data/train_triplets.txt', sep='\t', header=None, names=triplet_columns)

In [3]:
# Preview the first rows of the raw triplets table
taste.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [4]:
# Number of distinct users and distinct songs in the full dataset
tuple(len(taste[column].unique()) for column in ('user_id', 'song_id'))

(1019318, 384546)

In [5]:
# How many triplets (rows) belong to the first user shown in the preview
first_user_rows = taste['user_id'] == 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
taste.loc[first_user_rows].shape[0]

104

In [6]:
# Path to the Million Song Dataset track metadata file; it is opened as a
# SQLite database via sqlite3.connect.
track_meta_path='./data/track_metadata.db'

In [7]:
# Open a connection to the track metadata database.
# NOTE(review): no matching conn.close() appears in the visible cells --
# consider closing the connection once the metadata has been fetched.
conn = sqlite3.connect(track_meta_path)

In [8]:
# Look up the CREATE TABLE statement of the `songs` table to see its columns
schema_query = "SELECT sql FROM sqlite_master WHERE tbl_name = 'songs' AND type = 'table'"
conn.execute(schema_query).fetchall()

[(u'CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, release text, artist_id text, artist_mbid text, artist_name text, duration real, artist_familiarity real, artist_hotttnesss real, year int, track_7digitalid int, shs_perf int, shs_work int)',)]

In [9]:
# Pull (song_id, title, artist_name) for every row of the `songs` table
# into memory as a list of tuples
meta_query = "SELECT song_id, title, artist_name FROM songs"
echonest_meta = conn.execute(meta_query).fetchall()

In [10]:
# Wrap the fetched tuples in a DataFrame; the `title` field is exposed
# under the column name song_title
meta_columns = ['song_id', 'song_title', 'artist_name']
df_meta = pd.DataFrame(echonest_meta, columns=meta_columns)

In [12]:
# Export the song metadata table to CSV, UTF-8 encoded and without the
# DataFrame row index
df_meta.to_csv('./data/echonest_song_meta.csv', index=False, encoding='utf-8')

In [13]:
# Encode user ids as integers: `labels` holds one integer code per row,
# `levels` holds the distinct user_id values those codes refer to
labels, levels = taste['user_id'].factorize()

In [14]:
# One integer code per element of taste['user_id']
labels.shape

(48373586,)

In [15]:
# One entry per distinct user_id
levels.shape

(1019318,)

In [16]:
# Attach the integer user codes as a new column (this mutates `taste` in place)
taste['user_index'] = labels

In [17]:
# Preview the table to confirm the user_index column is present
taste.head()

Unnamed: 0,user_id,song_id,play_count,user_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,0


In [18]:
# Sanity check: count the distinct user_index values
taste['user_index'].unique().shape[0]

1019318

In [19]:
# Encode song ids as integers: `slabels` holds one integer code per row,
# `slevels` holds the distinct song_id values those codes refer to
slabels, slevels = taste['song_id'].factorize()

In [20]:
# Sizes of the per-row song codes and of the distinct song_id values
slabels.shape, slevels.shape

((48373586,), (384546,))

In [21]:
# Attach the integer song codes as a new column (this mutates `taste` in place)
taste['song_index'] = slabels

In [22]:
# Sanity check: count the distinct song_index values
taste['song_index'].unique().shape[0]

384546

In [23]:
# Preview the table to confirm the song_index column is present
taste.head()

Unnamed: 0,user_id,song_id,play_count,user_index,song_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,0,4


In [24]:
# Preview only the integer-encoded columns that will be exported
taste.head()[['user_index', 'song_index', 'play_count']]

Unnamed: 0,user_index,song_index,play_count
0,0,0,1
1,0,1,1
2,0,2,2
3,0,3,1
4,0,4,1


In [25]:
# Write only the integer-encoded triplets (user_index, song_index, play_count)
# to CSV, without the DataFrame row index
encoded_columns = ['user_index', 'song_index', 'play_count']
taste[encoded_columns].to_csv('./data/taste_profile.csv', index=False)

In [28]:
# One row per distinct (song_id, song_index) pair: the lookup table between
# original song ids and their integer codes
song_key_columns = ['song_id', 'song_index']
all_song_ids = taste.loc[:, song_key_columns].drop_duplicates()

In [29]:
# Number of distinct (song_id, song_index) pairs and number of columns
all_song_ids.shape

(384546, 2)

In [30]:
# Preview the song_id -> song_index lookup table
all_song_ids.head()

Unnamed: 0,song_id,song_index
0,SOAKIMP12A8C130995,0
1,SOAPDEY12A81C210A9,1
2,SOBBMDR12A8C13253B,2
3,SOBFNSP12AF72A0E22,3
4,SOBFOVM12A58A7D494,4


In [31]:
# Export the song_id -> song_index lookup table to CSV (row index omitted)
all_song_ids.to_csv('./data/songs_encoder.csv', index=False)

In [32]:
# Prepare subset Taste data.
# `.loc` slices by index label and includes the end label, so this keeps
# labels 0 through 1000000 -- 1,000,001 rows on the default integer index.
subset_columns = ['user_index', 'song_index', 'play_count']
subset_taste = taste.loc[:1000000, subset_columns]

In [33]:
# Row and column count of the subset
subset_taste.shape

(1000001, 3)

In [34]:
# Preview the first rows of the subset
subset_taste.head()

Unnamed: 0,user_index,song_index,play_count
0,0,0,1
1,0,1,1
2,0,2,2
3,0,3,1
4,0,4,1


In [35]:
# Number of distinct users and distinct songs present in the subset
tuple(len(subset_taste[column].unique()) for column in ('user_index', 'song_index'))

(20787, 148039)

In [36]:
# Export the subset of encoded triplets to CSV (row index omitted)
subset_taste.to_csv('./data/subset_taste_profile.csv', index=False)

In [37]:
# Preview the subset again after the export
subset_taste.head()

Unnamed: 0,user_index,song_index,play_count
0,0,0,1
1,0,1,1
2,0,2,2
3,0,3,1
4,0,4,1


In [38]:
# Spot-check: show every subset row belonging to user_index 1356
is_user_1356 = subset_taste['user_index'] == 1356
subset_taste.loc[is_user_1356]

Unnamed: 0,user_index,song_index,play_count
65094,1356,3568,1
65095,1356,13786,1
65096,1356,16511,1
65097,1356,33278,1
65098,1356,23315,1
65099,1356,33279,1
65100,1356,497,1
65101,1356,28251,1
65102,1356,33280,1
65103,1356,2788,1


In [2]:
import pandas as pd
# Load the song_id -> song_index lookup table
song_encode = pd.read_csv('./data/songs_encoder.csv')
# Load the song metadata table (song_id, song_title, artist_name)
song_meta = pd.read_csv('./data/echonest_song_meta.csv')

In [3]:
# Compare sizes: encoded songs vs. metadata rows
song_encode.shape, song_meta.shape

((384546, 2), (1000000, 3))

In [4]:
# Preview the song_id -> song_index lookup table
song_encode.head()

Unnamed: 0,song_id,song_index
0,SOAKIMP12A8C130995,0
1,SOAPDEY12A81C210A9,1
2,SOBBMDR12A8C13253B,2
3,SOBFNSP12AF72A0E22,3
4,SOBFOVM12A58A7D494,4


In [5]:
# Preview the song metadata table
song_meta.head()

Unnamed: 0,song_id,song_title,artist_name
0,SOQMMHC12AB0180CB8,Silent Night,Faster Pussy cat
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkkiautomaatti
2,SOGTUKN12AB017F4F1,No One Could Ever,Hudson Mohawke
3,SOBNYVR12A8C13558C,Si Vos Querés,Yerba Brava
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Der Mystic


In [6]:
# Attach title/artist metadata to every encoded song.
# `song_id` is not a unique key in the metadata (the source `songs` table is
# keyed by track_id), so a song with several metadata rows would make the left
# join emit the same song_index more than once. Keep a single metadata row per
# song_id (the first one encountered) so the result has exactly one row per
# encoded song; songs without metadata keep NaN title/artist.
song_meta_unique = song_meta.drop_duplicates(subset='song_id')
song_encode_meta = pd.merge(song_encode, song_meta_unique, how='left', on='song_id')
# Guard against any fan-out: the join must preserve the number of encoded songs.
assert len(song_encode_meta) == len(song_encode), 'merge changed the number of encoded songs'

In [9]:
# Write the song_index -> (song_title, artist_name) table as UTF-8 CSV;
# the raw song_id column is dropped from the exported copy only
song_index_meta = song_encode_meta.drop('song_id', axis=1)
song_index_meta.to_csv('./data/song_encode_meta.csv', index=False, encoding='utf-8')