### Task: convert song's and user's indexes from string to integer 
#### Input: 
 - train_triplets.txt (original Echonest MSD taste profile)
 - track_metadata.db (original from MSD) or echonest_song_meta.csv

#### Output: 
 - taste_profile.csv (48,373,586 rows)
 - subset_taste_profile.csv (1,000,000 rows)
 - songs_encoder.csv + echonest_song_meta.csv = song_encode_meta.csv (384,546 rows)

In [2]:
import pandas as pd
import numpy as np
import sqlite3
import os

In [None]:
# Read Echonest Taste profile dataset
taste = pd.read_csv('./data/train_triplets.txt', sep='\t', header=None, names=['user_id','song_id','play_count'])

In [3]:
taste.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [4]:
len(taste.user_id.unique()), len(taste.song_id.unique())

(1019318, 384546)

In [5]:
len(taste[taste['user_id']=='b80344d063b5ccb3212f76538f3d9e43d87dca9e'])

104

In [6]:
# path to the Million Song Dataset track meta data
track_meta_path='./data/track_metadata.db'

In [7]:
conn = sqlite3.connect(track_meta_path)

In [8]:
q = "SELECT sql FROM sqlite_master WHERE tbl_name = 'songs' AND type = 'table'"
res = conn.execute(q)
res.fetchall()

[(u'CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, release text, artist_id text, artist_mbid text, artist_name text, duration real, artist_familiarity real, artist_hotttnesss real, year int, track_7digitalid int, shs_perf int, shs_work int)',)]

In [9]:
q = "SELECT song_id, title, artist_name FROM songs"
res = conn.execute(q)
echonest_meta = res.fetchall()

In [10]:
df_meta = pd.DataFrame(echonest_meta, columns=['song_id','song_title','artist_name'])

In [12]:
# Export song metadata to csv
df_meta.to_csv('./data/echonest_song_meta.csv', index=False, encoding='utf-8')

In [13]:
labels, levels = pd.factorize(taste['user_id'])

In [14]:
labels.shape

(48373586,)

In [15]:
levels.shape

(1019318,)

In [16]:
taste['user_index'] = labels

In [17]:
taste.head()

Unnamed: 0,user_id,song_id,play_count,user_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,0


In [18]:
len(taste.user_index.unique())

1019318

In [19]:
slabels, slevels = pd.factorize(taste['song_id'])

In [20]:
slabels.shape, slevels.shape

((48373586,), (384546,))

In [21]:
taste['song_index'] = slabels

In [22]:
len(taste.song_index.unique())

384546

In [23]:
taste.head()

Unnamed: 0,user_id,song_id,play_count,user_index,song_index
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,0,4


In [24]:
taste[['user_index','song_index','play_count']].head()

Unnamed: 0,user_index,song_index,play_count
0,0,0,1
1,0,1,1
2,0,2,2
3,0,3,1
4,0,4,1


In [25]:
# Export taste profile (encoded) to csv file
taste[['user_index','song_index','play_count']].to_csv('./data/taste_profile.csv', index=False)

In [28]:
all_song_ids = taste[['song_id','song_index']].drop_duplicates()

In [29]:
all_song_ids.shape

(384546, 2)

In [30]:
all_song_ids.head()

Unnamed: 0,song_id,song_index
0,SOAKIMP12A8C130995,0
1,SOAPDEY12A81C210A9,1
2,SOBBMDR12A8C13253B,2
3,SOBFNSP12AF72A0E22,3
4,SOBFOVM12A58A7D494,4


In [31]:
# Export song_id song_index link to csv file
all_song_ids.to_csv('./data/songs_encoder.csv', index=False)

In [32]:
# Prepare subset Taste data
#subset_taste = taste.loc[:1000000,['user_index','song_index','play_count']]

### Prepare subset Taste data

In [3]:
taste = pd.read_csv('./data/taste_profile.csv')

In [4]:
taste.shape

(48373586, 3)

In [5]:
taste.head()

Unnamed: 0,user_index,song_index,play_count
0,0,0,1
1,0,1,1
2,0,2,2
3,0,3,1
4,0,4,1


In [6]:
len(taste.user_index.unique()), len(taste.song_index.unique())

(1019318, 384546)

In [7]:
# How many song a user had listened to
playcount_by_user = taste.groupby('user_index')['play_count'].count()

# How many users a song was listened to
playcount_by_song = taste.groupby('song_index')['play_count'].count()

In [8]:
playcount_by_user.shape, playcount_by_song.shape

((1019318,), (384546,))

In [9]:
len(playcount_by_user[playcount_by_user > 10])

968710

In [10]:
len(playcount_by_song[playcount_by_song > 10])

212527

In [11]:
playcount_by_user = playcount_by_user.reset_index()
playcount_by_user.columns = ['user_index','num_songs']
playcount_by_song = playcount_by_song.reset_index()
playcount_by_song.columns = ['song_index', 'num_users']

In [12]:
print "User's number of songs"
print playcount_by_user.num_songs.mean()
print playcount_by_user.num_songs.median()
print playcount_by_user.num_songs.max()
print playcount_by_user.num_songs.min()

print "Song's number of users"
print playcount_by_song.num_users.mean()
print playcount_by_song.num_users.median()
print playcount_by_song.num_users.max()
print playcount_by_song.num_users.min()

User's number of songs
47.4568152431
27.0
4400
10
Song's number of users
125.794016841
13.0
110479
1


In [13]:
print playcount_by_user.num_songs.describe()
print playcount_by_song.num_users.describe()

count    1.019318e+06
mean     4.745682e+01
std      5.781573e+01
min      1.000000e+01
25%      1.600000e+01
50%      2.700000e+01
75%      5.500000e+01
max      4.400000e+03
Name: num_songs, dtype: float64
count    384546.000000
mean        125.794017
std         799.025834
min           1.000000
25%           4.000000
50%          13.000000
75%          52.000000
max      110479.000000
Name: num_users, dtype: float64


In [14]:
taste_data = pd.merge(taste, playcount_by_user, how='left', on='user_index')

In [15]:
taste_data = pd.merge(taste_data, playcount_by_song, how='left', on='song_index')

In [16]:
taste.shape, taste_data.shape

((48373586, 3), (48373586, 5))

In [17]:
taste_data.head()

Unnamed: 0,user_index,song_index,play_count,num_songs,num_users
0,0,0,1,104,2357
1,0,1,1,104,723
2,0,2,2,104,2097
3,0,3,1,104,277
4,0,4,1,104,451


In [18]:
subset_taste = taste_data[(taste_data.num_songs > 100) & (taste_data.num_songs < 10000) & \
                          (taste_data.num_users > 200) & (taste_data.num_songs < 50000)]
print len(subset_taste)
print len(subset_taste.user_index.unique())
print len(subset_taste.song_index.unique())

15229569
110119
41001


In [19]:
subset_taste[['user_index','song_index','play_count']].to_csv('./data/medium_set_taste_profile.csv', index=False)

### Split data into train and test sets

In [20]:
subset_taste = pd.read_csv('./data/subset_taste_profile.csv')

In [21]:
subset_taste.head()

Unnamed: 0,user_index,song_index,play_count
0,12,546,12
1,12,251,1
2,12,547,3
3,12,548,1
4,12,549,1


In [22]:
# We will split data into half-half
subset_taste.shape

(5091004, 3)

In [23]:
subset_taste.user_index.unique()

array([     12,      16,      61, ..., 1019273, 1019291, 1019314])

In [24]:
len(subset_taste.user_index.unique())

25995

In [25]:
train_subset_taste = pd.DataFrame(columns=subset_taste.columns)
test_subset_taste = pd.DataFrame(columns=subset_taste.columns)
for user_id in subset_taste.user_index.unique().tolist():
#for user_id in (12, 16, 61):
    user_taste = subset_taste[subset_taste.user_index==user_id]
    mid = len(user_taste) / 2
    train_subset_taste = pd.concat((train_subset_taste, user_taste.iloc[:mid,:]))
    test_subset_taste = pd.concat((test_subset_taste, user_taste.iloc[mid:,:]))

In [26]:
train_subset_taste.shape, test_subset_taste.shape, subset_taste.shape

((2538996, 3), (2552008, 3), (5091004, 3))

In [27]:
train_subset_taste = train_subset_taste.astype(int)
test_subset_taste = test_subset_taste.astype(int)

In [28]:
print len(train_subset_taste.user_index.unique())
print len(test_subset_taste.user_index.unique())
print len(subset_taste.user_index.unique())

print len(train_subset_taste.song_index.unique())
print len(test_subset_taste.song_index.unique())
print len(subset_taste.song_index.unique())

25995
25995
25995
14057
14098
23254


In [29]:
train_songs = train_subset_taste.song_index.unique().tolist()
test_songs = test_subset_taste.song_index.unique().tolist()

In [30]:
len(set(train_songs) - set(test_songs))

9156

In [31]:
train_subset_taste.to_csv('./data/subset_train_taste_profile.csv', index=False)
test_subset_taste.to_csv('./data/subset_test_taste_profile.csv', index=False)

In [30]:
subset_taste[(subset_taste.user_index==12) & (subset_taste.song_index==708)]

Unnamed: 0,user_index,song_index,play_count
114,12,708,7


In [31]:
subset_taste.head()

Unnamed: 0,user_index,song_index,play_count
0,12,251,1
1,12,547,3
2,12,549,1
3,12,551,1
4,12,552,2


In [38]:
subset_taste[subset_taste['user_index']==1356]

Unnamed: 0,user_index,song_index,play_count
65094,1356,3568,1
65095,1356,13786,1
65096,1356,16511,1
65097,1356,33278,1
65098,1356,23315,1
65099,1356,33279,1
65100,1356,497,1
65101,1356,28251,1
65102,1356,33280,1
65103,1356,2788,1


In [2]:
import pandas as pd
song_encode = pd.read_csv('./data/songs_encoder.csv')
song_meta = pd.read_csv('./data/echonest_song_meta.csv')

In [3]:
song_encode.shape, song_meta.shape

((384546, 2), (1000000, 3))

In [4]:
song_encode.head()

Unnamed: 0,song_id,song_index
0,SOAKIMP12A8C130995,0
1,SOAPDEY12A81C210A9,1
2,SOBBMDR12A8C13253B,2
3,SOBFNSP12AF72A0E22,3
4,SOBFOVM12A58A7D494,4


In [5]:
song_meta.head()

Unnamed: 0,song_id,song_title,artist_name
0,SOQMMHC12AB0180CB8,Silent Night,Faster Pussy cat
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkkiautomaatti
2,SOGTUKN12AB017F4F1,No One Could Ever,Hudson Mohawke
3,SOBNYVR12A8C13558C,Si Vos Querés,Yerba Brava
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Der Mystic


In [6]:
song_encode_meta = pd.merge(song_encode, song_meta, how='left', on='song_id')

In [9]:
song_encode_meta.drop('song_id', axis=1).to_csv('./data/song_encode_meta.csv', index=False, encoding='utf-8')