## ALS Implicit Collaborative Filtering - pos ratings

https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

In [1]:
import import_ipynb

In [2]:
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision
from time import time

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


In [3]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit
import json

from sklearn.preprocessing import MinMaxScaler
from scipy.sparse.linalg import spsolve

# Recommendation and evaluation functions

In [4]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

def similar_items(seed_track, top_n):
    """
    Return the track_uris the ALS model ranks as most similar to a seed track.

    input:
        seed_track - track_uri of the seed; must be a key of D_track_id
        top_n - number of results requested from the model
    output:
        list of track_uris, in the order the model returns them
        NOTE(review): the seed itself is presumably one of the results (the
        sibling similar_items_with_description announces it as the first
        one) - confirm before relying on top_n distinct neighbours.
    """
    track_id = D_track_id[seed_track]

    # Use implicit to get similar items; the result is consumed below as
    # (track_uri_id, score) pairs.
    similar = model.similar_items(track_id, top_n)

    # Map ids back to uris through the prebuilt reverse lookup rather than
    # scanning the whole `data` frame once per result.
    return [D_track_id_to_uri[idx] for idx, _score in similar]

In [5]:
#-------------------------------------
# FIND SIMILAR ITEMS WITH DESCRIPTION
#-------------------------------------

def similar_items_with_description(seed_track, top_n):
    """
    Return the descriptions of the tracks most similar to a seed track.

    input:
        seed_track - track_uri of the seed; must be a key of D_track_id
        top_n - number of neighbours wanted; top_n + 1 results are requested
                from the model because the seed is expected to come back too
    output:
        list of D_desc entries (one per returned track), in model order
    side effect:
        prints a notice that the first returned track is the seed
    """
    print('CF ALS pos - first track returned is the seed track')

    track_id = D_track_id[seed_track]

    # Use implicit to get similar items; one extra slot is requested for the
    # seed track itself.
    similar = model.similar_items(track_id, top_n + 1)

    # Keyed by track_uri so a uri can only contribute one description.
    described = {}
    for idx, _score in similar:
        # Reverse lookup instead of filtering the whole `data` frame per result.
        track_uri = D_track_id_to_uri[idx]
        described[track_uri] = D_desc[track_uri]
    return list(described.values())

In [6]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def create_recs(pid,N):
    """
    Return the track_uris the ALS model recommends for a playlist.

    input:
        pid - passed unchanged to model.recommend as the row of
              sparse_user_item
              NOTE(review): the rows of sparse_user_item are pid_id category
              codes, so this is only right when pid equals its code - confirm.
        N - number of recommendations to request
    output:
        list of track_uris, in the order the model returns them
    """
    # Use the implicit recommender; the result is consumed as
    # (track_uri_id, score) pairs and only the ids are needed here.
    recommended = model.recommend(pid, sparse_user_item, N=N)

    # Reverse lookup instead of filtering the whole `data` frame per result.
    return [D_track_id_to_uri[idx] for idx, _score in recommended]

In [7]:
#----------------------------------------------
# CREATE SEED TRACKS FROM A PID
#----------------------------------------------

def get_seed_tracks(pid):
    """
    List the descriptions of the distinct tracks stored in `data` for a playlist.

    input:
        pid - value compared against the `pid` column of `data`
    output:
        list of D_desc entries, one per distinct track_uri of the playlist,
        in order of first appearance
    side effect:
        prints which pid the seed tracks come from
    """
    pid_id = pid
    print(f'Seed tracks from pid {pid_id}')

    # Rows belonging to this playlist, reduced to their distinct track_uris.
    playlist_rows = data[data.pid == pid_id]
    seed_uris = playlist_rows.track_uri.unique()

    # unique() has already removed repeats, so every uri yields one entry.
    return [D_desc[uri] for uri in seed_uris]

In [8]:
#----------------------------------------------
# CREATE USER RECOMMENDATIONS WITH DESCRIPTION
#----------------------------------------------

def create_recs_with_description(pid,N):
    """
    Return the descriptions of the tracks the ALS model recommends for a playlist.

    input:
        pid - passed unchanged to model.recommend as the row of
              sparse_user_item
              NOTE(review): the rows of sparse_user_item are pid_id category
              codes, so this is only right when pid equals its code - confirm.
        N - number of recommendations to request
    output:
        list of D_desc entries, in the order the model returns the tracks
    side effect:
        prints which pid the recommendations are for
    """
    pid_id = pid
    print(f'Recommendations for {pid_id}')

    # Use the implicit recommender; results are consumed as
    # (track_uri_id, score) pairs.
    recommended = model.recommend(pid_id, sparse_user_item, N=N)

    # Keyed by track_uri so a uri can only contribute one description.
    R = {}
    for idx, _score in recommended:
        # One dictionary lookup replaces two identical scans of `data`.
        track_uri = D_track_id_to_uri[idx]
        R[track_uri] = D_desc[track_uri]

    return list(R.values())

In [9]:
#----------------------------------------------------
# CREATE USER RECOMMENDATIONS WITH DICTIONARY OUTPUT
#---------------------------------------------------
def create_recs_dictionary_output(pid,N):
    """
    Recommend tracks for a playlist and return them with their scores.

    input:
        pid - passed unchanged to model.recommend as the row of
              sparse_user_item
              NOTE(review): the rows of sparse_user_item are pid_id category
              codes, so this is only right when pid equals its code - confirm.
        N - number of recommendations to request
    output:
        recommendation dictionary {track_uri: score}, in model order
    """
    # Use the implicit recommender; each result is an (id, score) pair.
    ranked = model.recommend(pid, sparse_user_item, N=N)

    # Translate track ids to uris while keeping the model's score.
    return {
        D_track_id_to_uri[item_id]: item_score
        for item_id, item_score in ranked
    }

In [10]:
#----------------------------------
# GET RECOMMENDATIONS AND EVALUATE
#----------------------------------

def als_predict_and_evaluate_top_n(pid, top_n=100):
    """
    Recommend top_n tracks for a playlist and score them against its hold-out.

    input:
        pid - playlist id
        top_n - number of recommendations to request
    return (a 5-tuple, in this order)
        L_pred - list of the top_n predicted track_uris
        ground_truth - second column (track_uri) of the ev_set_arr rows whose
                       first column equals pid
        R_Prec - R_Precision of the first len(ground_truth) predictions
                 against ground_truth
        NDCG - element [1] of what nDCG returns for the hit list
        res - list of 0/1 flags, one per prediction: 1 if it is in ground_truth
    """
    L_pred = list(create_recs_dictionary_output(pid, top_n))

    ground_truth = ev_set_arr[ev_set_arr[:, 0] == pid][:, 1]

    R_Prec = R_Precision(L_pred[:len(ground_truth)], ground_truth)

    # Build the lookup once: testing `in` on the array itself rescans the
    # whole hold-out for every prediction.
    held_out = set(ground_truth)
    res = [int(el in held_out) for el in L_pred]

    NDCG = nDCG(res)[1]

    return L_pred, ground_truth, R_Prec, NDCG, res


In [11]:
#-----------------------------------
# SAVE R-PRECISION AND NDCG BY PID
#-----------------------------------

def save_als_res_k_n(n = 10, top_n=20):
    """
    Evaluate the model on randomly sampled evaluation playlists and save the
    per-playlist metrics as CSV.

    n = number of random playlists to predict, sampled without replacement
        from evaluation_pids (random.sample raises ValueError if n exceeds
        the number of available pids)
    top_n = number of recommendations requested per playlist

    Writes ../evaluation/ALS_pos_topn_{top_n}_{n}.csv and returns the same
    data as a DataFrame with columns pid, R-Precision, nDCG, rating, model.
    Prints the loop index and elapsed seconds every 500 playlists, and the
    total elapsed seconds at the end.
    """
    time0 = time()
    RES = {}
    ep = random.sample(evaluation_pids, n)
    for i, pid in enumerate(ep):
        predictions = als_predict_and_evaluate_top_n(pid, top_n)
        # Positions 2 and 3 of the result are R-Precision and nDCG.
        RES[pid] = [predictions[2], predictions[3]]
        if i % 500 == 0:
            print(i)
            print(time() - time0)

    # Naming the columns at construction keeps the frame well-formed even
    # when nothing was sampled (n=0), where renaming after a transpose fails.
    df = pd.DataFrame(
        [[pid, *metrics] for pid, metrics in RES.items()],
        columns=['pid', 'R-Precision', 'nDCG'],
    )
    df['rating'] = 'pos'
    df['model'] = 'ALS'
    df.to_csv(f'../evaluation/ALS_pos_topn_{top_n}_{n}.csv', index = None)
    print(time() - time0)
    return df

In [12]:
def search_track_artist(name, entity):
    """
    Case-insensitive substring search over the track descriptions in D_desc.

    input:
        name - text to look for
        entity - 'track' searches element 0 of each description,
                 'artist' searches element 1; any other value matches nothing
    output:
        list of [track_uri, description] pairs, in D_desc order
    """
    # Choose which element of the description is compared.
    if entity == 'track':
        pos = 0
    elif entity == 'artist':
        pos = 1
    else:
        return []

    return [
        [uri, desc]
        for uri, desc in D_desc.items()
        if name.lower() in desc[pos].lower()
    ]

# Load data

In [13]:
# Location of the training CSV that is read into `raw_data` below.
file_path = '../data-processed/full-data/pid-track-pos-rating-train-data.csv'

In [14]:
# Read the training CSV at file_path into a DataFrame.
raw_data = pd.read_csv(file_path)
# raw_data.head()

In [15]:
# Give the three columns the names the rest of the notebook relies on.
raw_data.columns = ['pid', 'track_uri', 'rating']
# `data` is a second name for the same DataFrame, not a copy: columns added
# to `data` later also appear on `raw_data`.
data = raw_data

In [16]:
# Create numeric pid_id and track_uri_id columns: convert pid and track_uri
# to pandas categoricals and use their integer category codes as ids.
data['pid'] = data['pid'].astype("category")
data['track_uri'] = data['track_uri'].astype("category")
data['pid_id'] = data['pid'].cat.codes
data['track_uri_id'] = data['track_uri'].cat.codes
# data.head()

In [17]:
# Lookup {track_uri: track_uri_id}. The id is the category code of the uri,
# so all rows of a group carry the same value and min() just picks it.
D_track_id = data.groupby('track_uri')['track_uri_id'].min().to_dict()

In [18]:
# Reverse lookup {track_uri_id: track_uri}, built by inverting D_track_id.
D_track_id_to_uri = {track_id: uri for uri, track_id in D_track_id.items()}

# Build ALS model

In [19]:
# The implicit library expects data as an item-user matrix, so we
# create two matrices: one for fitting the model (item-user, rows are
# track_uri_id and columns are pid_id) and one for recommendations
# (user-item, rows are pid_id and columns are track_uri_id).
# Both hold the `rating` column cast to float.
sparse_item_user = sparse.csr_matrix((data['rating'].astype(float), (data['track_uri_id'], data['pid_id'])))
sparse_user_item = sparse.csr_matrix((data['rating'].astype(float), (data['pid_id'], data['track_uri_id'])))

In [20]:
# Initialize the ALS model (factors=20, regularization=0.1, iterations=20).
# It is only constructed here; fitting happens in a later cell.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)



In [21]:
# Calculate the confidence matrix by multiplying the item-user ratings by
# our alpha value; this scaled matrix is what the model is fitted on.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

In [22]:
# Fit the model on the alpha-scaled item-user confidence matrix.
model.fit(data_conf)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




# Load dictionary with tracks

In [23]:
# Load the track descriptions: a JSON object keyed by track_uri.
# The file is read explicitly as UTF-8 so non-ASCII names do not depend on
# the platform's default encoding.
with open('../data-processed/full-data/track_descriptions.json', encoding='utf-8') as json_file:
    D_desc = json.load(json_file)

# Display one entry as a sanity check.
D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['Lose Control (feat. Ciara & Fat Man Scoop)', 'Missy Elliott', 'The Cookbook']

# Evaluation set

In [24]:
# Ground truth for the evaluation playlists.
evaluation_set = pd.read_csv('../data-processed/full-data/evaluation-pids-ground-truth.csv')

# Keep only the held-out rows and the three columns used downstream.
ev_set = evaluation_set[evaluation_set['hold_out'] == 1][['pid','track_uri','hold_out']]
# Drop rows with missing values. Indexing with a boolean DataFrame only
# blanks individual cells and keeps every row, so it cannot filter anything.
ev_set = ev_set.dropna()

# Array form used by the evaluation function: column 0 is pid, column 1 is
# track_uri.
ev_set_arr = ev_set.to_numpy()

# Distinct playlists that have at least one held-out track.
evaluation_pids = list(ev_set.pid.unique())

# ev_set.head()

# `~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEMO TIME~~~~~~~~~~~~~~~~~~~~~~~~~~~`

 <img src="tenor.gif">

# Search for track or artist

In [25]:
# Search the track descriptions for an artist name and show the first
# results_to_print matches as [track_uri, description] pairs.
track_or_artist = 'the cure'
entity = 'artist'
results_to_print = 5
search_track_artist(track_or_artist, entity)[0:results_to_print]

[['spotify:track:4NnWuGQujzWUEg0uZokO5M',
  ['Just Like Heaven', 'The Cure', 'Kiss Me, Kiss Me, Kiss Me']],
 ['spotify:track:4QlzkaRHtU8gAdwqjWmO8n',
  ["Friday I'm In Love", 'The Cure', 'Wish']],
 ['spotify:track:0X5C4WjQNubRysTkHOubz3',
  ['Lovesong', 'The Cure', 'Disintegration']],
 ['spotify:track:1QFh8OH1e78dGd3VyJZCAC',
  ["Boys Don't Cry", 'The Cure', 'Three Imaginary Boys']],
 ['spotify:track:0T6kwiueP62ten2KLLmQS4',
  ['A Forest', 'The Cure', 'Seventeen Seconds']]]

# Find similar tracks

In [26]:
# Descriptions of the tracks most similar to the given seed uri; 15
# neighbours are asked for, so 16 results are requested including the seed.
similar_items_with_description('spotify:track:4QlzkaRHtU8gAdwqjWmO8n',15)

CF ALS pos - first track returned is the seed track


[["Friday I'm In Love", 'The Cure', 'Wish'],
 ['Just Like Heaven', 'The Cure', 'Kiss Me, Kiss Me, Kiss Me'],
 ['Blister in the Sun - 2002 Remastered Version',
  'Violent Femmes',
  'Violent Femmes'],
 ['I Melt With You', 'Modern English', 'Pillow Lips'],
 ['Love It When You Call', 'The Feeling', 'Twelve Stops and Home'],
 ["What's Up?", '4 Non Blondes', 'Bigger, Better, Faster, More !'],
 ['When The Stars Go Blue - feat. Bono Disclab Remix',
  'The Corrs',
  'Dreams - The Ultimate Corrs Collection'],
 ['Lovesong', 'The Cure', 'Disintegration'],
 ['Everybody Wants To Rule The World',
  'Tears For Fears',
  'Songs From The Big Chair'],
 ["Don't Dream It's Over", 'Crowded House', 'Crowded House'],
 ['There She Goes', "The La's", "The La's"],
 ['Birdhouse In Your Soul', 'They Might Be Giants', 'Flood'],
 ['I Melt With You (7" Mix)', 'Modern English', 'After the Snow'],
 ['Crash', 'The Primitives', 'Lovely'],
 ['Dreams', 'The Cranberries', "Everybody Else Is Doing It, So Why Can't We?"],
 [

# Create a playlist continuation 

In [27]:
# Pick one evaluation playlist at random and list its seed tracks.
inp = random.sample(evaluation_pids,1)[0]
get_seed_tracks(inp)

Seed tracks from pid 59701


[["Sarah's Song", 'Ricky Hil', 'SYLDD'],
 ['I Know', 'Big Sean', 'Dark Sky Paradise'],
 ['Swimming Pools (Drank) - Extended Version',
  'Kendrick Lamar',
  'good kid, m.A.A.d city'],
 ['Alright', 'Logic', 'Under Pressure'],
 ['Low Life', 'Future', 'EVOL'],
 ['Already', 'Kodak Black', 'Institution'],
 ['False Alarm', 'The Weeknd', 'Starboy'],
 ['Starboy', 'The Weeknd', 'Starboy'],
 ['Murder', 'ShredGang', 'Shred Gang'],
 ['You Got Me', 'G-Eazy', "When It's Dark Out"],
 ['All Me', 'Drake', 'Nothing Was The Same'],
 ['No Role Modelz', 'J. Cole', '2014 Forest Hills Drive'],
 ['Wet Dreamz', 'J. Cole', '2014 Forest Hills Drive'],
 ['A Tale of 2 Citiez', 'J. Cole', '2014 Forest Hills Drive'],
 ['Turnt (feat. Sweezee Don)', 'Dame Dot', '3rd World'],
 ['Forbidden Fruit', 'J. Cole', 'Born Sinner'],
 ['Sidewalks', 'The Weeknd', 'Starboy'],
 ['I Feel It Coming', 'The Weeknd', 'Starboy'],
 ['Broken', 'Lund', 'Broken'],
 ['Clubhouse', 'Mac Miller', 'GO:OD AM'],
 ['Bad and Boujee (feat. Lil Uzi Vert)

In [28]:
# Recommend 20 tracks for the playlist held in `inp`.
create_recs_with_description(inp, 20)

Recommendations for 59701


[['White Iverson', 'Post Malone', 'Stoney'],
 ["Don't", 'Bryson Tiller', 'T R A P S O U L'],
 ['Fake Love', 'Drake', 'More Life'],
 ['Go Flex', 'Post Malone', 'Stoney'],
 ['V. 3005', 'Childish Gambino', 'because the internet'],
 ['Chill Bill', 'Rob $tone', 'Chill Bill'],
 ['What They Want', 'Russ', "There's Really A Wolf"],
 ['Exchange', 'Bryson Tiller', 'T R A P S O U L'],
 ['Father Stretch My Hands Pt. 1', 'Kanye West', 'The Life Of Pablo'],
 ['No Problem (feat. Lil Wayne & 2 Chainz)',
  'Chance The Rapper',
  'Coloring Book'],
 ['Caroline', 'Aminé', 'Good For You'],
 ['goosebumps', 'Travis Scott', 'Birds In The Trap Sing McKnight'],
 ['Deja Vu', 'J. Cole', '4 Your Eyez Only'],
 ['G.O.M.D.', 'J. Cole', '2014 Forest Hills Drive'],
 ['One Night', 'Lil Yachty', 'Lil Boat'],
 ['IV. sweatpants', 'Childish Gambino', 'because the internet'],
 ['Broccoli (feat. Lil Yachty)', 'DRAM', 'Big Baby DRAM'],
 ['Antidote', 'Travis Scott', 'Rodeo'],
 ['Congratulations', 'Post Malone', 'Stoney'],
 ['Bo

# Save evaluation

In [29]:
# df = save_als_res_k_n(10000,500)
# df.describe()