## Collaborative Filtering - extract only pid, pos, track_uri and artist_uri

In [1]:
import json
import numpy as np
from time import time
import pandas as pd
from pandas.io.json import json_normalize
import os
import concurrent.futures
import functools
import random
import gc
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Directory holding the Million Playlist Dataset slice files (mpd.slice.*.json).
path = "../dataset/spotify_million_playlist_dataset/data/"

In [None]:
# Preallocated object buffer that the loading loop below fills row by row.
# Four columns: pid, pos, track_uri, artist_uri.
# NOTE(review): 66,346,428 is hard-coded; it equals the total row count shown
# by the df.shape output further down, so it presumably only fits the full dataset.
arr = np.empty((66346428,4), dtype = object)

In [None]:
# Playlist-level fields kept from each MPD slice. The other playlist fields
# (name, description, num_artists, num_albums, num_tracks, num_followers,
# duration_ms, collaborative) are deliberately left out.
keys = ['pid', 'tracks']

In [None]:
# Read a random sample of MPD slice files and copy one row per playlist track
# -- (pid, pos, track_uri, artist_uri) -- into the preallocated `arr`.
t0 = time()
samp = 1000
# Keep only real slice files *before* sampling: otherwise stray directory
# entries use up sample slots, and random.sample() raises ValueError when the
# directory holds fewer than `samp` entries.
slice_files = sorted(
    name for name in os.listdir(path)
    if name.startswith("mpd.slice.") and name.endswith(".json")
)
num = 0  # index of the next free row in `arr`
for i, filename in enumerate(random.sample(slice_files, min(samp, len(slice_files)))):
    fullpath = os.path.join(path, filename)
    # Context manager closes the handle even if parsing fails.
    # NOTE(review): assumes the slices are UTF-8 encoded JSON -- confirm.
    with open(fullpath, encoding="utf-8") as f:
        mpd_slice = json.load(f)
    # Flatten playlists -> tracks directly instead of expanding every track
    # list through pandas twice per slice.
    rows = [
        (playlist['pid'], track['pos'], track['track_uri'], track['artist_uri'])
        for playlist in mpd_slice['playlists']
        for track in playlist['tracks']
    ]
    if rows:
        # dtype=object keeps the integers as integers next to the URI strings.
        arr[num:num + len(rows), :] = np.array(rows, dtype=object)
        num += len(rows)

    print(filename,i)
# Drop preallocated rows that were never filled so they do not reach the CSV
# as empty records (a no-op when the buffer was filled completely).
arr = arr[:num]
# Time diff
print(f"Time taken: {(time()-t0)/60}")

In [None]:
gc.collect()

In [None]:
arr.shape

In [None]:
# Wrap the filled buffer in a DataFrame; the columns are named in the next cell.
df = pd.DataFrame(arr)

In [None]:
# Same order in which the loading loop wrote the fields into `arr`.
df.columns=['pid','pos','track_uri', 'artist_uri']

In [None]:
df.tail()

In [None]:
# Persist the flattened playlist/track table; index=None leaves the row index out of the file.
df.to_csv('../data-processed/pid-track-artist.csv', index = None)

## Add binary and scaled rating

In [None]:
# Reload the table written above so this section can be run on its own.
df = pd.read_csv('../data-processed/pid-track-artist.csv')

In [None]:
df.head()

In [None]:
df.dtypes

### Binary scaling

In [None]:
# Every (playlist, track) row gets the same rating of 1.
df['binary_rating'] = 1

### Scale 1 to 10 

In [None]:
df.head()

In [None]:
# Position-based rating on a 1-10 scale: the first track of a playlist (pos 0)
# gets 10 and the track at the highest position gets 1.
pos_max = df.groupby('pid')['pos'].transform('max')
# A playlist whose highest position is 0 would divide 0 by 0 and produce NaN;
# divide by 1 there and pin the rating to 10, as for any other first track.
safe_max = pos_max.where(pos_max > 0, 1)
scaled = 9 * (pos_max - df['pos']) / safe_max + 1
df['pos_rating'] = scaled.where(pos_max > 0, 10.0)

In [None]:
df.head()

In [None]:
# Save only the rating columns; 'pos' and 'artist_uri' are not written to this file.
df[['pid','track_uri','binary_rating','pos_rating']].to_csv('../data-processed/pid-track-ratings.csv', index = None)

## Hold out playlists for testing - to be used for R-precision and NDCG calculation

In [3]:
# NOTE(review): no visible cell writes this exact file name (the ratings cell
# writes 'pid-track-ratings.csv', the split cell 'pid-track-ratings-train-test-tags.csv').
# The cells below need at least the 'pid' and 'pos' columns -- confirm the source file.
df = pd.read_csv('../data-processed/pid-track-ratings_train_test_tags.csv')

In [4]:
# One row per playlist. 'num_tracks' holds the highest 'pos' of the playlist
# (a later cell adds 1 to turn it into a row count).
pids = df.groupby('pid')['pos'].max().reset_index()
pids.columns = ['pid', 'num_tracks']
# Random split: a playlist is 'train' when its uniform draw is <= 0.7.
is_train = np.random.rand(len(pids), 1) <= 0.7
pids['train_test'] = np.where(is_train, 'train', 'test')

In [5]:
pids.head()

Unnamed: 0,pid,num_tracks,train_test
0,0,51,train
1,1,38,train
2,2,63,train
3,3,125,train
4,4,16,train


In [6]:
# pid -> num_tracks lookup. Built straight from the two columns; iterating
# the frame with iterrows() creates a Series per row and is very slow at
# this size.
D_num_tracks = dict(zip(pids['pid'], pids['num_tracks']))

In [7]:
D_num_tracks[0]

51

In [8]:
pids.describe()

Unnamed: 0,pid,num_tracks
count,1000000.0,1000000.0
mean,499999.5,65.346428
std,288675.278933,53.669358
min,0.0,4.0
25%,249999.75,25.0
50%,499999.5,48.0
75%,749999.25,91.0
max,999999.0,375.0


In [9]:
# Candidate hold-out fractions; one of them is drawn for each playlist below.
holdout_array = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# Labels of the two seeding schemes (first n tracks vs. random n tracks).
first_random = ['first', 'random']

In [10]:
# Coin flip per playlist: 1 -> seed with the first n tracks, 0 -> seed with n random tracks.
pids['test_first_random'] = np.random.randint(0, 2, len(pids))

In [11]:
# Draw the hold-out share per playlist as a whole-number percentage.
holdout_pct = np.array([round(random.choice(holdout_array) * 100) for _ in range(len(pids))])
pids['test_hould_out_pct'] = holdout_pct.astype(float)
# Multiply in integers and divide once: with 1 - pct/100 in floating point,
# e.g. 10 * (1 - 0.7) evaluates to 3.0000000000000004 and ceil() returns 4
# instead of 3.
# NOTE(review): 'num_tracks' is the highest 'pos' of the playlist, not the
# track count -- confirm that this is the intended base for the percentage.
pids['seed_tracks_num'] = np.ceil(pids['num_tracks'] * (100 - holdout_pct) / 100)
# Never use more than 100 seed tracks.
pids['seed_tracks_num'] = pids['seed_tracks_num'].clip(upper=100)

In [13]:
pids['train_test'].value_counts()

train    699964
test     300036
Name: train_test, dtype: int64

In [14]:
# pid -> 'train'/'test' lookup, built from the columns directly instead of
# a slow row-by-row iterrows() loop.
D_test_train = dict(zip(pids['pid'], pids['train_test']))

In [15]:
# pid -> seeding-scheme flag lookup, built from the columns directly instead
# of a slow row-by-row iterrows() loop.
D_test_first_random = dict(zip(pids['pid'], pids['test_first_random']))

In [16]:
# pid -> number of seed tracks lookup, built from the columns directly
# instead of a slow row-by-row iterrows() loop.
D_seed_tracks_num = dict(zip(pids['pid'], pids['seed_tracks_num']))

In [17]:
D_test_train[4357], D_test_first_random[1223], D_seed_tracks_num[230]

('test', 1, 10.0)

In [18]:
# Broadcast the per-playlist split attributes onto every track row via the pid lookups.
df['train_test'] = df['pid'].map(D_test_train)
df['test_first_random'] = df['pid'].map(D_test_first_random)
df['seed_tracks_num'] = df['pid'].map(D_seed_tracks_num)

In [19]:
df['train_test'].value_counts()

train    46431809
test     19914619
Name: train_test, dtype: int64

In [20]:
# Per-playlist 'num_tracks' (the highest 'pos' of the playlist) on every track row.
df['num_tracks'] = df['pid'].map(D_num_tracks)

In [21]:
df.head()

Unnamed: 0,pid,pos,track_uri,artist_uri,binary_rating,pos_rating,train_test,test_first_random,seed_tracks_num,num_tracks
0,491000,0,spotify:track:3giQ7393501IRNrd8iHugf,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,10.0,train,1,36.0,177
1,491000,1,spotify:track:3jpcVaeyNjWgjqIxAiWasz,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.949153,train,1,36.0,177
2,491000,2,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.898305,train,1,36.0,177
3,491000,3,spotify:track:7gXpcXwtmEiQzskYJmtGgk,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.847458,train,1,36.0,177
4,491000,4,spotify:track:5wtIWwOtowY2howCZ7Veq2,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.79661,train,1,36.0,177


In [42]:
# Split the track rows into the training set and the two kinds of test
# playlists. Explicit copies are taken because new columns are assigned to
# each part afterwards; that should not depend on whether a filtered frame
# is a view of `df`.
is_test = df.train_test == 'test'
df_train = df[df.train_test == 'train'].copy()
df_test_1 = df[is_test & (df.test_first_random == 1)].copy()  # first x tracks
df_test_2 = df[is_test & (df.test_first_random == 0)].copy()  # random x tracks
# Training playlists keep all of their tracks.
df_train['hold_out'] = 0

In [43]:
df.shape[0]-df_test_1.shape[0]-df_test_2.shape[0]-df_train.shape[0]

0

In [44]:
df_test_1.shape, df_test_2.shape, df_train.shape

((9952949, 10), (9961670, 10), (46431809, 11))

In [45]:
df_train.head()

Unnamed: 0,pid,pos,track_uri,artist_uri,binary_rating,pos_rating,train_test,test_first_random,seed_tracks_num,num_tracks,hold_out
0,491000,0,spotify:track:3giQ7393501IRNrd8iHugf,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,10.0,train,1,36.0,177,0
1,491000,1,spotify:track:3jpcVaeyNjWgjqIxAiWasz,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.949153,train,1,36.0,177,0
2,491000,2,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.898305,train,1,36.0,177,0
3,491000,3,spotify:track:7gXpcXwtmEiQzskYJmtGgk,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.847458,train,1,36.0,177,0
4,491000,4,spotify:track:5wtIWwOtowY2howCZ7Veq2,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.79661,train,1,36.0,177,0


In [46]:
# "First n" playlists: rows whose position is below the seed count stay
# visible (0); every other row is held out (1).
is_seed = df_test_1.pos < df_test_1.seed_tracks_num
df_test_1['hold_out'] = np.where(is_seed, 0, 1)

In [152]:
# Scratch column with position-based flags for the "random n" playlists. The
# real 'hold_out' column is assigned from hold_out_list in a later cell, and
# 'hold_out_' is dropped again afterwards.
df_test_2['hold_out_'] = np.where(df_test_2.pos<df_test_2.seed_tracks_num,0,1)

In [48]:
df_test_1.shape

(9952949, 11)

In [51]:
df_test_2.shape

(9961670, 11)

In [32]:
# pid -> number of seed tracks (same mapping as D_seed_tracks_num), built
# from the columns directly instead of a slow iterrows() loop.
D_seed_tracks = dict(zip(pids['pid'], pids['seed_tracks_num']))

In [91]:
# pid -> number of track rows. 'num_tracks' stores the highest 'pos', so
# one is added. Built from the columns directly instead of a slow
# iterrows() loop.
D_num_rows = dict(zip(pids['pid'], pids['num_tracks'] + 1))

In [121]:
# Row counts restricted to the "random n" test playlists, keyed in the order
# in which the pids first appear in df_test_2.
D_num_rows_sorted = {el: D_num_rows[el] for el in df_test_2.pid.unique()}

In [120]:
# Seed counts restricted to the "random n" test playlists, keyed in the order
# in which the pids first appear in df_test_2.
D_seed_tracks_sorted = {el: D_seed_tracks[el] for el in df_test_2.pid.unique()}

In [151]:
# For every "random n" playlist, build one flag per track row: 0 for a seed
# track, 1 for a held-out track. The flags are shuffled so that the seeds
# land on random positions, and the playlists are concatenated in
# pid.unique() order.
hold_out_list = []
for playlist_id in df_test_2.pid.unique():
    n_seed = int(D_seed_tracks_sorted[playlist_id])
    n_rows = int(D_num_rows_sorted[playlist_id])
    flags = [0] * n_seed + [1] * (n_rows - n_seed)
    random.shuffle(flags)
    hold_out_list.extend(flags)

In [150]:
len(hold_out_list)

9961670

In [153]:
# Positional assignment of the shuffled flags.
# NOTE(review): this is only correct if each playlist's rows are contiguous in
# df_test_2 and appear in the same pid order used to build hold_out_list -- confirm.
df_test_2['hold_out']=hold_out_list

In [165]:
df_test_2.head()

Unnamed: 0,pid,pos,track_uri,artist_uri,binary_rating,pos_rating,train_test,test_first_random,seed_tracks_num,num_tracks,hold_out,hold_out_
178,491001,0,spotify:track:4anqYZt4APNlFwQixpbdZR,spotify:artist:4VmEWwd8y9MCLwexFMdpwt,1,10.0,test,0,5.0,96,1,0
179,491001,1,spotify:track:4voEoczU7Ijborps9XF1n3,spotify:artist:1ZwdS5xdxEREPySFridCfh,1,9.90625,test,0,5.0,96,1,0
180,491001,2,spotify:track:33CdK2WiQfodCIHiXfgoLJ,spotify:artist:1ZwdS5xdxEREPySFridCfh,1,9.8125,test,0,5.0,96,1,0
181,491001,3,spotify:track:7iaw359G2XT14uTfV9feip,spotify:artist:6vXTefBL93Dj5IqAWq6OTv,1,9.71875,test,0,5.0,96,1,0
182,491001,4,spotify:track:64XdaHjuyOQmVRdqn7aCgB,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,1,9.625,test,0,5.0,96,1,0


In [166]:
# Remove the position-based scratch column; 'hold_out' now carries the shuffled flags.
df_test_2.drop(columns=['hold_out_'], inplace=True)

In [168]:
df_test_2.head()

Unnamed: 0,pid,pos,track_uri,artist_uri,binary_rating,pos_rating,train_test,test_first_random,seed_tracks_num,num_tracks,hold_out
178,491001,0,spotify:track:4anqYZt4APNlFwQixpbdZR,spotify:artist:4VmEWwd8y9MCLwexFMdpwt,1,10.0,test,0,5.0,96,1
179,491001,1,spotify:track:4voEoczU7Ijborps9XF1n3,spotify:artist:1ZwdS5xdxEREPySFridCfh,1,9.90625,test,0,5.0,96,1
180,491001,2,spotify:track:33CdK2WiQfodCIHiXfgoLJ,spotify:artist:1ZwdS5xdxEREPySFridCfh,1,9.8125,test,0,5.0,96,1
181,491001,3,spotify:track:7iaw359G2XT14uTfV9feip,spotify:artist:6vXTefBL93Dj5IqAWq6OTv,1,9.71875,test,0,5.0,96,1
182,491001,4,spotify:track:64XdaHjuyOQmVRdqn7aCgB,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,1,9.625,test,0,5.0,96,1


In [172]:
# Stack train rows and both kinds of test rows back into one frame.
# DataFrame.append is deprecated and removed in pandas 2; pd.concat gives
# the same result (row order and original index labels are kept).
df2 = pd.concat([df_train, df_test_1, df_test_2])

In [173]:
df2.shape, df.shape

((66346428, 11), (66346428, 10))

In [182]:
# Sanity check for the "first n" playlists: held-out rows per playlist next to
# the playlist's row count. Columns are selected with a list; selecting them
# with a bare tuple is deprecated and raises in pandas 2.
temp = df_test_1.groupby('pid')[['hold_out', 'num_tracks']].agg({'hold_out':'sum','num_tracks':'max'})
# 'num_tracks' stores the highest 'pos'; add one to get the number of rows.
temp['num_tracks'] = temp['num_tracks'] + 1

  temp = df_test_1.groupby('pid')['hold_out','num_tracks'].agg({'hold_out':'sum','num_tracks':'max'})


In [183]:
temp[temp['hold_out']>=temp['num_tracks']]

Unnamed: 0_level_0,hold_out,num_tracks
pid,Unnamed: 1_level_1,Unnamed: 2_level_1


In [184]:
temp[temp['hold_out']==0]

Unnamed: 0_level_0,hold_out,num_tracks
pid,Unnamed: 1_level_1,Unnamed: 2_level_1


In [176]:
df2.head()

Unnamed: 0,pid,pos,track_uri,artist_uri,binary_rating,pos_rating,train_test,test_first_random,seed_tracks_num,num_tracks,hold_out
0,491000,0,spotify:track:3giQ7393501IRNrd8iHugf,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,10.0,train,1,36.0,177,0
1,491000,1,spotify:track:3jpcVaeyNjWgjqIxAiWasz,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.949153,train,1,36.0,177,0
2,491000,2,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.898305,train,1,36.0,177,0
3,491000,3,spotify:track:7gXpcXwtmEiQzskYJmtGgk,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.847458,train,1,36.0,177,0
4,491000,4,spotify:track:5wtIWwOtowY2howCZ7Veq2,spotify:artist:2ptmyXoL7poH6Zq62h1QT9,1,9.79661,train,1,36.0,177,0


In [174]:
# Full table with the split and hold-out tags; index=None leaves the row index out of the file.
df2.to_csv('../data-processed/pid-track-ratings-train-test-tags.csv', index = None)

# Save Full data - binary rating - train set

In [175]:
# Training data for the binary-rating model: every row that is not held out
# (all rows of train playlists plus the seed rows of test playlists).
df2[df2['hold_out'] == 0][['pid','track_uri','binary_rating']].to_csv('../data-processed/pid-track-binary-rating-train-data.csv', index = None)

In [197]:
# Ground truth for evaluation: all rows of the test playlists. A single .loc
# avoids building a full-width intermediate frame, and the explicit copy
# makes the later column assignment independent of df2.
evaluation_pids_ground_truth = df2.loc[
    df2.train_test == 'test',
    ['pid', 'pos', 'track_uri', 'hold_out', 'test_first_random'],
].copy()

In [199]:
# Human-readable label for the seeding scheme flag.
evaluation_pids_ground_truth['seed_pattern'] = evaluation_pids_ground_truth['test_first_random'].map({1:'first n', 0:'random n'})

In [203]:
# Written without the numeric 'test_first_random' flag; 'seed_pattern' replaces it.
evaluation_pids_ground_truth[['pid','pos','track_uri','hold_out','seed_pattern']].to_csv('../data-processed/evaluation-pids-ground-truth.csv', index = None)

In [3]:
# NOTE(review): this reads from '../data-processed/full-data/', while the cell that
# writes this file name saves to '../data-processed/' -- confirm which location is current.
df2 = pd.read_csv('../data-processed/full-data/pid-track-ratings-train-test-tags.csv')
df2.head()

# Save Full data - pos rating - train set

In [6]:
# Training data for the position-rating model: every row that is not held out.
df2[df2['hold_out'] == 0][['pid','track_uri','pos_rating']].to_csv('../data-processed/full-data/pid-track-pos-rating-train-data.csv', index = None)