# Conten Based filtering using audio features, album_uri and artist_uri

In [1]:
import import_ipynb

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import scipy.sparse as sps

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from category_encoders import TargetEncoder
from time import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#own functions
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision


%matplotlib inline

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


# Define function to look up similar items 

In [None]:
with open('../data-processed/full-data/track_descriptions.json') as json_file:
    D_desc = json.load(json_file)
    
D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

In [3]:
def similar_items(track_uri, top_n, similarity_matrix):
    #print('seed track is not included')
    D_rec={}
    #time0 = time()
    idx = D_track_uri_to_id[track_uri]
    similarity_array = cosine_similarity(similarity_matrix[idx:idx+1,:], similarity_matrix)
    recc_tracks = np.fliplr(similarity_array.argsort())
    for el in recc_tracks[0][1:top_n+1]:
        D_rec[D_track_id_to_uri[el]]=similarity_array[0][el]
    #print(f'{time()-time0:0.2f}sec')
    return D_rec

In [4]:
def similar_items_with_description(track_uri, top_n, similarity_matrix):
    print('CB audio features - first tracks returned is the seed track')
    D_rec={}
    time0 = time()
    idx = D_track_uri_to_id[track_uri]
    similarity_array = cosine_similarity(similarity_matrix[idx:idx+1,:], similarity_matrix)
    recc_tracks = np.fliplr(similarity_array.argsort())
    for el in recc_tracks[0][0:top_n+1]:
        D_rec[D_track_id_to_uri[el]]=D_desc[D_track_id_to_uri[el]]
    print(f'{time()-time0:0.2f}sec')
    return list(D_rec.values())

In [5]:
def similar_items_with_description_external(track_uri, top_n):
    print('CB audio features - first tracks returned is the seed track')
    D_rec={}
    time0 = time()
    idx = D_track_uri_to_id[track_uri]
    similarity_array = cosine_similarity(X_transformed[idx:idx+1,:], X_transformed)
    recc_tracks = np.fliplr(similarity_array.argsort())
    for el in recc_tracks[0][0:top_n+1]:
        D_rec[D_track_id_to_uri[el]]=D_desc[D_track_id_to_uri[el]]
    print(f'{time()-time0:0.2f}sec')
    return list(D_rec.values())

In [6]:
import operator

def predict_and_evaluate_top_n(pid,top_n):
    """
    return
    (1) top_n predicted track_ids
    (2) ground_truth : track_ids in the hold_out
    (3) R_Prec
    
    """
    train_array_track_ids = track_id_array[M[pid].toarray()[0].astype(bool)]
    
    D_pred={}
    
    topn_n_by_track = int(top_n/len(train_array_track_ids))*2
    
    for el in train_array_track_ids:
        D_pred.update(similar_items(D_track_id_to_uri[el],topn_n_by_track,X_transformed))
    
    D_pred = dict(sorted(D_pred.items(), key=operator.itemgetter(1),reverse=True))
    ground_truth = ev_set_arr[ev_set_arr[:,0]==pid][:,2]
    L_pred = list(D_pred.keys())[:top_n]
    R_Prec = R_Precision(L_pred[:len(ground_truth)],ground_truth)
    res = [int(el in list(ground_truth)) for el in L_pred]
    NDCG = nDCG(res)[1]
    return L_pred, ground_truth, R_Prec, NDCG, res

# Data transformation, PCA and merging 

In [7]:
df_merged = pd.read_csv('../data-processed/transformation-matrices/cb_df_merged.csv')

In [8]:
df_merged.shape

(1447059, 1036)

In [9]:
df_merged.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,pca_genre_991,pca_genre_992,pca_genre_993,pca_genre_994,pca_genre_995,pca_genre_996,pca_genre_997,pca_genre_998,pca_genre_999,pca_genre_artist_uri
0,0.89725,-0.616004,0.488537,-0.337567,-1.377756,-0.53584,0.830192,-0.62609,-0.608544,-0.995276,...,-0.012959,-0.027508,0.003682,-0.013497,-0.018047,-0.006893,-0.026525,0.010478,-0.015831,spotify:artist:1vSHzGHsVOCrgPSCmKNimP
1,-2.218729,-0.54082,-0.073421,-0.5745,0.725818,-0.000277,0.974099,1.33416,-0.42732,-1.493465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.347035,1.15832,0.769516,0.740272,0.725818,-0.223356,-1.001042,1.983828,0.805424,0.197414,...,-0.015915,-0.00928,-0.010366,0.016623,-0.00398,-0.020958,0.030079,-0.038151,0.051228,spotify:artist:7kxOVclB0zQamtBR0syCrg
3,1.292844,-0.13859,-0.3544,0.443262,0.725818,-0.411715,0.993851,-0.629341,-0.777124,0.434471,...,0.023769,-0.015961,0.020897,0.018381,0.010438,0.001869,-0.006275,0.01234,-0.005278,spotify:artist:2sxmKe3CUrWnx7eoXMhOlW
4,0.962279,0.320027,-0.635379,0.310843,-1.377756,0.110828,-0.558085,-0.629734,-0.532682,-1.522356,...,0.000838,0.002465,0.000721,0.002336,-0.001504,-0.000857,0.000536,0.00137,0.000915,spotify:artist:6PyeXqjH8OMGnt1IOhWgrQ


In [13]:
df_merged = df_merged.fillna(0)

In [14]:
X_transformed = df_merged.drop(columns=['artist_uri','album_uri']).to_numpy()

MemoryError: Unable to allocate 11.1 GiB for an array with shape (1034, 1447059) and data type object

In [None]:
X_transformed.shape

# Find similar tracks

<img src="tenor.gif">

In [None]:
similar_items_with_description('spotify:track:3ZOEytgrvLwQaqXreDs2Jx',10,X_transformed)