In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

subset100 = pd.read_csv("../raw_data/track_meta_100subset_new.csv")

### Train-val-test split

In [5]:
# Train-val-test split (20%)
train, test = train_test_split(subset100, test_size=0.2, random_state=42, stratify = subset100['Playlistid'])
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify = train['Playlistid'])

In [6]:
test.head()

Unnamed: 0,Playlistid,Trackid,Artist_Name,Track_Name,Album_Name,Track_Duration,Artist_uri,Track_uri,Album_uri,acousticness,...,loudness,mode,speechiness,tempo,time_signature,valence,Playlist,Album,Track,Artist
557,38828,35,Bastille,Pompeii,Bad Blood,214147,spotify:artist:7EQ0qTo7fWT7DPxmxtSYEc,spotify:track:3gbBpTdY8lnQwqxNCcf795,spotify:album:64fQ94AVziavTPdnkCS6Nj,0.0755,...,-6.383,1,0.0407,127.435,4,0.571,tb,55,63,44
556,38828,34,Britney Spears,Womanizer,Circus (Deluxe Version),224400,spotify:artist:26dSoYclwsYLMAKD3tpOr4,spotify:track:4fixebDZAVToLbUCuEloa2,spotify:album:2tve5DGwub1TtbX1khPX5j,0.073,...,-5.226,1,0.0622,139.0,4,0.235,tb,55,63,44
2414,229646,7,Soft Cell,Tainted Love,Non-Stop Erotic Cabaret,153762,spotify:artist:6aq8T2RcspxVOGgMrTzjWc,spotify:track:0cGG2EouYCEEC3xfa0tDFV,spotify:album:3KFWViJ1wIHAdOVLFTVzjD,0.462,...,-8.284,0,0.0378,144.435,4,0.623,Throwback,121,135,91
1771,186672,28,Imagine Dragons,Radioactive,Night Visions,186813,spotify:artist:53XhwfbYqKCa1cC15pYq2q,spotify:track:6Ep6BzIOB9tz3P4sWqiiAB,spotify:album:1vAEF8F0HoRFGiYOEeJXHW,0.119,...,-3.698,1,0.059,136.249,4,0.21,campfire,30,34,29
516,37634,17,LANY,WHERE THE HELL ARE MY FRIENDS,WHERE THE HELL ARE MY FRIENDS,216180,spotify:artist:49tQo2QULno7gxHutgccqF,spotify:track:4TA2nSix6i8K2VV9wt6rUn,spotify:album:34ySll9UQXpSngEI0NJbFO,0.0652,...,-3.811,1,0.0344,127.994,4,0.472,not sure,16,23,13


### kNN Collaborative Filtering

In [7]:
# Create Binary Sparse Matrix
co_mat = pd.crosstab(train.Playlistid, train.Track_uri)
co_mat = co_mat.clip(upper=1)
assert np.max(co_mat.describe().loc['max']) == 1

co_mat_sparse = csr_matrix(co_mat)

In [8]:
# Train kNN model
col_filter = NearestNeighbors(metric='cosine', algorithm='brute')
col_filter.fit(co_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Making Predictions

In [9]:
def nholdout(playlist_id, df):
    '''Pass in a playlist id to get number of songs held out in val/test set'''
    
    return len(df[df.Playlistid == playlist_id].Track_uri)

def kpredict(knnmodel, playlist_id, df):
    '''for a playlist id, generate list of 15*k predictions where k is num holdouts''' 
    
    k = nholdout(playlist_id, df)*15 # number of holdouts
    ref_songs = co_mat.columns.values[co_mat.loc[playlist_id] == 1] # songs already in playlist
    dist, ind = knnmodel.kneighbors(np.array(co_mat.loc[playlist_id]).reshape(1, -1), n_neighbors = 99)
    rec_ind = co_mat.index[ind[0]] # recommended playlists
    
    n_pred = 0
    pred = []
    for i in rec_ind:
        new_songs = co_mat.columns.values[co_mat.loc[i] == 1] # potential recommendations
        for song in new_songs:
            if song not in ref_songs: # only getting songs not already in target playlist
                pred.append(song)
                n_pred += 1
                if n_pred == k:
                    break
        if n_pred == k:
            break
    
    return pred

In [14]:
### Prediction Example
pi = 430 # target playlist index
kpreds = kpredict(col_filter, pi, val) # list of predictions

In [23]:
val_set = val[val.Playlistid == pi]
val_set = val_set['Track_uri'] # ground truth

## Metrics

In [24]:
def r_precision(prediction, val_set):
# prediction should be a list of predictions
# val_set should be pandas Series of ground truths
    score = np.sum(val_set.isin(prediction))/val_set.shape[0]
    return score

In [25]:
### Example Usage
r_precision(kpreds, val_set)

0.0

In [26]:
### NDCG Code Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [28]:
### Example Usage
# Generate binary relevance array
r = np.zeros(len(kpreds))
for i, p in enumerate(kpreds):
    if p in val_set:
        r[i] = 1

ndcg_at_k(r, len(r))

0.0

## Baseline Model Performance

In [57]:
rps = []
ndcgs = []
for pid in co_mat.index:
    ps = kpredict(col_filter, pid, val) # predictions
    vs = val[val.Playlistid == pid].Track_uri # ground truth
    rps.append(r_precision(ps, vs))
    
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    ndcgs.append(ndcg_at_k(r, len(r)))
    

In [58]:
avg_rp = np.mean(rps)
avg_ndcg = np.mean(ndcgs)
print('Avg. R-Precision: ', avg_rp)
print('Avg. NDCG: ', avg_ndcg)
print('Total Sum: ', np.mean([avg_rp, avg_ndcg]))

Avg. R-Precision:  0.07702539127539126
Avg. NDCG:  0.08034624710411524
Total Sum:  0.07868581918975326
