## ALS Implicit Collaborative Filtering - binary ratings

https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

In [3]:
import import_ipynb

In [4]:
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision
from time import time

In [5]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit

from sklearn.preprocessing import MinMaxScaler
from scipy.sparse.linalg import spsolve

# Recommendation and evaluation functions

In [6]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

def similar_items(seed_track, top_n):
    """
    Return the track URIs the ALS model ranks as most similar to a seed track.

    input:
        seed_track - track_uri of the seed; must be a key of D_track_id
        top_n - number of results requested from the model
    output:
        list of track_uris, in the order returned by model.similar_items

    NOTE(review): exactly top_n results are requested here, whereas
    similar_items_with_description requests top_n + 1 and announces that the
    first hit is the seed itself. If that also holds here, the list is the
    seed plus top_n - 1 other tracks -- confirm this is intended.
    """
    seed_id = D_track_id[seed_track]

    # Every result is unpacked as an (item id, score) pair; only the id is
    # used. The id -> URI translation is a dict lookup rather than a boolean
    # mask over the whole `data` frame for each returned item.
    return [
        D_track_id_to_uri[idx]
        for idx, _score in model.similar_items(seed_id, top_n)
    ]

In [7]:
#-------------------------------------
# FIND SIMILAR ITEMS WITH DESCRIPTION
#-------------------------------------

def similar_items_with_description(seed_track, top_n):
    """
    Return the D_desc entries of the tracks most similar to a seed track.

    input:
        seed_track - track_uri of the seed; must be a key of D_track_id
        top_n - number of similar tracks wanted in addition to the seed
    output:
        list of D_desc values (one per track), in model order; top_n + 1
        results are requested because the seed is expected to come first
    """
    print('CF ALS binary - first track returned is the seed track')

    seed_id = D_track_id[seed_track]

    # One extra result is requested to make room for the seed track itself.
    similar = model.similar_items(seed_id, top_n + 1)

    # Keyed by URI (insertion-ordered) so a repeated id cannot produce a
    # duplicate row; ids are translated with a dict lookup instead of a scan
    # over the `data` frame.
    described = {}
    for idx, _score in similar:
        uri = D_track_id_to_uri[idx]
        described[uri] = D_desc[uri]
    return list(described.values())

In [8]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def create_recs(pid,N):
    """
    Recommend tracks for one playlist.

    input:
        pid - passed to model.recommend as the row of sparse_user_item
        N - number of recommendations to request
    return:
        list of recommended track_uris, in the order returned by the model

    NOTE(review): the rows of sparse_user_item are built from `pid_id`
    (category codes of `pid`), yet the raw `pid` is used as the row index
    here. The two coincide only when the pids in the training data are
    exactly 0..n-1 -- confirm.
    """
    recommended = model.recommend(pid, sparse_user_item, N=N)

    # Each recommendation is unpacked as an (item id, score) pair; scores are
    # not part of the return value. Ids are translated with a dict lookup
    # instead of a scan over the `data` frame.
    return [D_track_id_to_uri[idx] for idx, _score in recommended]

In [9]:
#----------------------------------------------
# CREATE USER RECOMMENDATIONS WITH DESCRIPTION
#----------------------------------------------

def create_recs_with_description(pid,N):
    """
    Describe a playlist's current tracks and the tracks recommended for it.

    input:
        pid - playlist id; used both to filter `data.pid` and as the row
              passed to model.recommend
        N - number of recommendations to request
    return:
        (input_descriptions, recommended_descriptions) - two lists of D_desc
        values: the unique tracks of the playlist found in `data`, and the
        model's recommendations in model order

    NOTE(review): the rows of sparse_user_item are built from `pid_id`
    (category codes of `pid`), while the raw `pid` is used as the row index
    here. The two coincide only when the pids in the training data are
    exactly 0..n-1 -- confirm.
    """
    # Tracks already in the playlist according to the training data.
    playlist_uris = data[data.pid == pid].track_uri.unique()
    current = {uri: D_desc[uri] for uri in playlist_uris}

    recommended = model.recommend(pid, sparse_user_item, N=N)

    # Translate each recommended id once through the id -> URI dict instead
    # of masking the whole `data` frame twice per recommendation.
    suggested = {}
    for idx, _score in recommended:
        uri = D_track_id_to_uri[idx]
        suggested[uri] = D_desc[uri]

    return list(current.values()), list(suggested.values())

In [10]:
#----------------------------------------------------
# CREATE USER RECOMMENDATIONS WITH DICTIONARY OUTPUT
#---------------------------------------------------

def create_recs_dictionary_output(pid,N):
    """
    Recommend tracks for one playlist and keep the model scores.

    input:
        pid - passed to model.recommend as the row of sparse_user_item
        N - number of recommendations to request
    output:
        recommendation dictionary {track_uri: score}, inserted in the order
        the model returned the items
    """
    ranked = model.recommend(pid, sparse_user_item, N=N)

    # Each entry is an (item id, score) pair; ids become URIs via the
    # module-level id -> URI dictionary.
    return {
        D_track_id_to_uri[item_id]: item_score
        for item_id, item_score in ranked
    }

In [11]:
#----------------------------------
# GET RECOMMENDATIONS AND EVALUATE
#----------------------------------

def als_predict_and_evaluate_top_n(pid, top_n=100):
    """
    Recommend top_n tracks for a playlist and score them against its hold-out.

    input:
        pid - playlist id to evaluate
        top_n - number of recommendations to generate
    return (5-tuple):
        L_pred - predicted track_uris, in model order
        ground_truth - second column of the ev_set_arr rows whose first
                       column equals pid
        R_Prec - R_Precision of the first len(ground_truth) predictions
        NDCG - element [1] of the value returned by nDCG(res)
        res - list of 0/1 flags, 1 where the prediction is in ground_truth

    NOTE(review): ev_set_arr is only assigned in a commented-out cell of the
    visible notebook; it must exist before this function is called.
    """
    L_pred = list(create_recs_dictionary_output(pid, top_n))

    ground_truth = ev_set_arr[ev_set_arr[:, 0] == pid][:, 1]

    R_Prec = R_Precision(L_pred[:len(ground_truth)], ground_truth)

    # Set membership avoids an element-wise array comparison per prediction.
    # Assumes the hold-out entries are hashable (track URIs) -- TODO confirm.
    held_out = set(ground_truth)
    res = [int(uri in held_out) for uri in L_pred]

    NDCG = nDCG(res)[1]

    return L_pred, ground_truth, R_Prec, NDCG, res

In [12]:
#-----------------------------------
# SAVE R-PRECISION AND NDCG BY PID
#-----------------------------------

def save_als_res_k_n(n = 10, top_n=20):
    """
    Evaluate the model on a random sample of playlists and save the scores.

    input:
        n = number of playlists sampled (without replacement) from
            evaluation_pids
        top_n = number of recommendations generated per playlist
    return:
        DataFrame with columns pid, R-Precision, nDCG, rating, model; the
        same table is written to ../evaluation/ALS_binary_topn_{top_n}_{n}.csv

    Progress (iteration count and elapsed seconds) is printed every 500
    playlists, and the total elapsed time at the end.

    NOTE(review): evaluation_pids is only assigned in a commented-out cell of
    the visible notebook; it must exist before this function is called.
    """
    time0 = time()
    RES = {}
    sampled_pids = random.sample(evaluation_pids, n)
    for i, pid in enumerate(sampled_pids):
        predictions = als_predict_and_evaluate_top_n(pid, top_n)
        # Positions 2 and 3 of the result are R-Precision and nDCG.
        RES[pid] = [predictions[2], predictions[3]]
        if i % 500 == 0:
            print(i)
            print(time() - time0)
    # One column per pid -> transpose to one row per pid, then expose the
    # pid index as a regular column.
    df = pd.DataFrame(RES).transpose().reset_index()
    df.columns = ['pid', 'R-Precision', 'nDCG']
    df['rating'] = 'binary'
    df['model'] = 'ALS'
    # `index` is a boolean flag; False states the intent explicitly.
    df.to_csv(f'../evaluation/ALS_binary_topn_{top_n}_{n}.csv', index=False)
    print(time() - time0)
    return df

In [13]:
#------------------------------------------------------------------------
# SEARCH FOR AN ARTIST OR TRACK BASED ON THE NAME OF AN ARTIST OR TRACKS
#------------------------------------------------------------------------

def search_track_artist(name, entity):
    """
    Case-insensitive substring search over the track descriptions in D_desc.

    input:
        name - text to look for
        entity - 'track' matches against element 0 of each description,
                 'artist' against element 1
    return:
        list of [track_uri, description] pairs whose chosen field contains
        name; an empty list for any other entity value
    """
    if entity == 'track':
        field = 0
    elif entity == 'artist':
        field = 1
    else:
        return []

    return [
        [uri, description]
        for uri, description in D_desc.items()
        if name.lower() in description[field].lower()
    ]

# Load data

In [14]:
# Training interactions (relative to the notebook's working directory).
file_path = '../data-processed/full-data/pid-track-binary-rating-train-data.csv'

In [15]:
# Read the training CSV into a DataFrame; its three columns are renamed
# to pid / track_uri / rating in the next cell.
raw_data = pd.read_csv(file_path)
# raw_data.head()

In [16]:
# Normalise the column names used throughout the notebook.
raw_data.columns = ['pid', 'track_uri', 'rating']
# `data` is an alias, not a copy: columns added to `data` later also
# appear on `raw_data`.
data = raw_data

In [17]:
# Create numeric ids for playlists (pid_id) and tracks (track_uri_id):
# both columns are converted to pandas categoricals and the category codes
# are stored as the ids used to index the sparse matrices.
data['pid'] = data['pid'].astype("category")
data['track_uri'] = data['track_uri'].astype("category")
data['pid_id'] = data['pid'].cat.codes
data['track_uri_id'] = data['track_uri'].cat.codes
# data.head()

In [18]:
# track_uri -> track_uri_id lookup. Every row of a given track_uri carries the
# same category code, so min() simply picks that code.
D_track_id = data.groupby('track_uri')['track_uri_id'].min().to_dict()

In [19]:
# Inverse lookup: track_uri_id -> track_uri.
D_track_id_to_uri = {track_id: uri for uri, track_id in D_track_id.items()}

# Build ALS model

In [20]:
# The implicit library expects data as an item-user matrix, so we
# create two matrices: one for fitting the model (item-user)
# and one for recommendations (user-item).
# Rows/columns are the category codes track_uri_id and pid_id.
# NOTE(review): scipy sums duplicate (row, col) entries when building from
# coordinate triples, so repeated pid/track rows in the CSV would give
# ratings above 1 -- confirm the input is de-duplicated.
sparse_item_user = sparse.csr_matrix((data['rating'].astype(float), (data['track_uri_id'], data['pid_id'])))
sparse_user_item = sparse.csr_matrix((data['rating'].astype(float), (data['pid_id'], data['track_uri_id'])))

In [21]:
# Initialize the ALS model (20 latent factors, 20 iterations); fitting
# happens in a later cell, on the confidence-weighted item-user matrix.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)



In [22]:
# Calculate the confidence matrix by scaling every stored rating in the
# item-user matrix by alpha, then cast to double precision for fitting.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

In [23]:
# Fit the model on the confidence-weighted item-user matrix.
# NOTE(review): the notebook passes an item-user matrix and later unpacks
# results as (id, score) pairs, which presumably matches an older implicit
# API -- confirm the pinned implicit version before upgrading.
model.fit(data_conf)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




# Load dictionary with tracks

In [24]:
import json

# track_uri -> description lookup; the sample output below shows each value
# as [track name, artist, album].
# The encoding is given explicitly: the descriptions contain non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
with open('../data-processed/full-data/track_descriptions.json', encoding='utf-8') as json_file:
    D_desc = json.load(json_file)

# Sanity check: display one known entry.
D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['Lose Control (feat. Ciara & Fat Man Scoop)', 'Missy Elliott', 'The Cookbook']

# Find similar tracks

<img src="tenor.gif">

In [25]:
# Look up track URIs by name: search text, which field to search, and how
# many matches to display.
track_or_artist = 'John Lennon'
# entity: 'track' or 'artist'
entity = 'artist'
results_to_print = 10
search_track_artist(track_or_artist, entity)[0:results_to_print]

[['spotify:track:3zJw3rugfpVrmBeDDnUYzy',
  ['Happy Xmas (War Is Over) - 2010 Digital Remaster',
   'John Lennon',
   'Power To The People - The Hits']],
 ['spotify:track:2pNwQBjJppt8v3sZojH1aj',
  ['Oh Yoko! - 2010 - Remaster', 'John Lennon', 'Imagine']],
 ['spotify:track:6Ux4atjtjSCvtW27UCFj7d',
  ['Dear Yoko - 2010 - Remaster',
   'John Lennon',
   'Double Fantasy Stripped Down']],
 ['spotify:track:7pKfPomDEeI4TPT6EOYjn9',
  ['Imagine - 2010 - Remaster', 'John Lennon', 'Imagine']],
 ['spotify:track:36bDXIjlYQKTxESFmAPEK3',
  ['Imagine', 'John Lennon', 'Jazz Do It']],
 ['spotify:track:31QuJZfFiMk1uOawow8ejS',
  ['Nobody Told Me - 2010 - Remaster', 'John Lennon', 'Milk And Honey']],
 ['spotify:track:0GGxVTb0UwDwdaKNjBdCn3',
  ['Woman - 2010 - Remaster', 'John Lennon', 'Double Fantasy Stripped Down']],
 ['spotify:track:3ntrdR24dLkKrzSGRv1FlH',
  ['Stand By Me - 2010 - Remaster', 'John Lennon', "Rock 'N' Roll"]],
 ['spotify:track:5mAuSPm8ZygJViGqQxKd48',
  ['Oh Yoko', 'John Lennon', 'Ru

In [26]:
# Similar tracks for one of the URIs found above ('Imagine', John Lennon);
# 15 neighbours are requested in addition to the seed.
similar_items_with_description('spotify:track:36bDXIjlYQKTxESFmAPEK3',15)

CF ALS binary - first track returned is the seed track


[['Imagine', 'John Lennon', 'Jazz Do It'],
 ["Maybe I'm Amazed - 2013 Remaster", 'Paul McCartney', 'Wings Over America'],
 ['Maybe I’m Amazed - Remastered 2011', 'Paul McCartney', 'McCartney'],
 ['Lucy in the Sky with Diamonds',
  'Bombilates',
  'Popular Song Covers - Vol. 6'],
 ['Time In A Bottle', 'Jim Croce', "You Don't Mess Around With Jim"],
 ['The Sounds of Silence - Acoustic Version',
  'Simon & Garfunkel',
  'Wednesday Morning, 3 A.M.'],
 ['Hotel California', 'The Desperados', 'Acoustic Moondance'],
 ['People Get Ready', 'Jeff Beck', 'Flash'],
 ['Border Song', 'Elton John', 'Elton John'],
 ['The Killing of Georgie (Part I and II) - 2009 Remastered Version',
  'Rod Stewart',
  'A Night On The Town [Deluxe Edition]'],
 ['Let It Be', 'Paul McCartney', 'Back In The World'],
 ['Ticket To Ride', 'The British Invasions', 'A Tribute To The Beatles'],
 ['Hey Jude', 'The Apples', 'Best Of The 60s'],
 ['Bridge Over Troubled Water',
  'Simon & Garfunkel',
  'Bridge Over Troubled Water'],


# Create a playlist continuation

In [27]:
#test_pids = pd.read_csv('../data-processed/full-data/pid-track-ratings-train-test-tags.csv')
#test_pids = test_pids[test_pids['hold_out'] == 1]
#pd.DataFrame(list(test_pids['pid'].unique()),columns=['ground_truth_pid']).to_csv('../data-processed/full-data/ground_truth_pids.csv')

In [64]:
# Show the first 10 tracks already in the example playlist (element [0] of
# the returned pair).
print('Input')
inp = 491031
create_recs_with_description(inp, 20)[0][:10]

Input


[['Pursuit Of Happiness (nightmare)',
  'Kid Cudi',
  'Man On The Moon: The End Of Day'],
 ['Loyal', 'Chris Brown', 'X (Deluxe Version)'],
 ['No Hands (feat. Roscoe Dash and Wale) - Explicit Album Version',
  'Waka Flocka Flame',
  'Flockaveli'],
 ['Flex (Ooh, Ooh, Ooh)', 'Rich Homie Quan', 'Flex (Ooh, Ooh, Ooh)'],
 ['Lose Yourself - Soundtrack Version', 'Eminem', 'Curtain Call'],
 ['Young, Wild & Free (feat. Bruno Mars) - feat. Bruno Mars',
  'Snoop Dogg',
  'Mac and Devin Go To High School (Music From and Inspired By The Movie)'],
 ['Black And Yellow', 'Wiz Khalifa', 'Rolling Papers'],
 ['True Colors (feat. Nicki Minaj)', 'Wiz Khalifa', 'Blacc Hollywood'],
 ['Cudi Zone', 'Kid Cudi', 'Man On The Moon: The End Of Day'],
 ["Day 'N' Nite (nightmare)", 'Kid Cudi', 'Man On The Moon: The End Of Day']]

In [65]:
# Show the 20 recommended tracks for the same playlist (element [1] of the
# returned pair); this calls the model again rather than reusing the result.
print('output')
create_recs_with_description(inp, 20)[1]

output


[["I Don't Fuck With You", 'Big Sean', 'Dark Sky Paradise'],
 ['Ni**as In Paris', 'JAY Z', 'Top Five'],
 ['Trap Queen', 'Fetty Wap', 'Fetty Wap'],
 ['Jumpman', 'Drake', 'What A Time To Be Alive'],
 ["F**kin' Problems", 'A$AP Rocky', 'LONG.LIVE.A$AP (Deluxe Version)'],
 ['679 (feat. Remy Boyz)', 'Fetty Wap', '679 (feat. Remy Boyz)'],
 ['0 To 100 / The Catch Up', 'Drake', '0 To 100 / The Catch Up'],
 ['No Type', 'Rae Sremmurd', 'SremmLife'],
 ['Back To Back', 'Drake', 'Back To Back'],
 ['I Mean It', 'G-Eazy', 'These Things Happen'],
 ['Forever', 'Drake', 'Relapse: Refill'],
 ['m.A.A.d city', 'Kendrick Lamar', 'good kid, m.A.A.d city'],
 ['Come Get Her', 'Rae Sremmurd', 'SremmLife'],
 ['6 Foot 7 Foot', 'Lil Wayne', 'Tha Carter IV'],
 ['Swimming Pools (Drank)', 'Kendrick Lamar', 'Swimming Pools (Drank)'],
 ['We Dem Boyz', 'Wiz Khalifa', 'Blacc Hollywood'],
 ['All Me', 'Drake', 'Nothing Was The Same'],
 ['Panda', 'Desiigner', 'New English'],
 ['Started From the Bottom', 'Drake', 'Nothing Wa

# Evaluation

In [27]:
# evaluation_set = pd.read_csv('../data-processed/full-data/evaluation-pids-ground-truth.csv')
# evaluation_set.head()

In [28]:
# ev_set = evaluation_set[evaluation_set['hold_out'] == 1][['pid','track_uri','hold_out']]
# ev_set = ev_set[ev_set.isnull()==False]

# ev_set_arr = ev_set.to_numpy()

# evaluation_pids = list(ev_set.pid.unique())

# # ev_set.head()

### Save evaluation

In [29]:
# df = save_als_res_k_n(10000,500)
# df.describe()