# WIP: Data prep: Content-based filtering using audio features, album_uri and artist_uri

In [2]:
import import_ipynb

In [3]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import scipy.sparse as sps

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from category_encoders import TargetEncoder
from time import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#own functions
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision


%matplotlib inline

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


# Data transformation, PCA and merging 

In [3]:
# Track URI -> descriptive strings; the sample lookup below returns
# [track title, artist name, album name].
# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform's default encoding (titles and artist names can contain non-ASCII text).
with open('../data-processed/full-data/track_descriptions.json', encoding='utf-8') as json_file:
    D_desc = json.load(json_file)

D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['Lose Control (feat. Ciara & Fat Man Scoop)', 'Missy Elliott', 'The Cookbook']

In [4]:
# Track URI -> [artist_uri, album_uri] (see the sample lookup below).
# Decode explicitly as UTF-8 (the JSON standard encoding) rather than relying on
# the platform's default encoding.
with open('../data-processed/full-data/track_artist_album.json', encoding='utf-8') as json_file:
    D_album_artist = json.load(json_file)

D_album_artist['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['spotify:artist:2wIVse2owClT7go1WT98tk',
 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K']

In [5]:
# Audio features per track, keyed by the Spotify track `uri` column.
path = '../data-processed/full-data/audio-features-combined.csv'
data = pd.read_csv(path)

In [6]:
# Expose the positional row index as an explicit integer `track_id` column.
# NOTE: this cell is not idempotent -- re-running it without reloading `data`
# inserts a second index column that is also renamed to `track_id`.
data = data.reset_index()
data = data.rename(columns = {'index':'track_id'})

# D_album_artist maps track URI -> [artist_uri, album_uri]. Map over the `uri`
# column directly instead of a row-wise DataFrame.apply(axis=1), which builds a
# Series per row and is far slower on millions of rows. A URI that is missing
# from the dict still raises KeyError, as before.
data['artist_uri'] = data['uri'].map(lambda uri: D_album_artist[uri][0])
data['album_uri'] = data['uri'].map(lambda uri: D_album_artist[uri][1])

In [7]:
# data.head()

In [8]:
# Lookup tables in both directions between Spotify track URIs and integer track ids.
# Taking the per-group minimum picks one value deterministically if a key repeats.
D_track_uri_to_id = (
    data
    .groupby('uri')['track_id']
    .min()
    .to_dict()
)
D_track_id_to_uri = (
    data
    .groupby('track_id')['uri']
    .min()
    .to_dict()
)

In [9]:
len(D_track_uri_to_id), len(D_track_id_to_uri)

(2262190, 2262190)

In [10]:
data.head()

Unnamed: 0,track_id,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_uri,album_uri
0,0,spotify:track:5OSaZxhNj307YpTh7Qp8Xy,0.716,0.421,7,-11.56,0,0.0275,0.649,0.00207,0.0936,0.207,99.078,378440,4,spotify:artist:1vSHzGHsVOCrgPSCmKNimP,spotify:album:0oA1WetYmmrxkyjxz92yJg
1,1,spotify:track:4zytmsfZ7HtdXj3VLECcu2,0.141,0.441,5,-12.893,1,0.0892,0.7,0.687,0.128,0.0725,76.044,331000,4,spotify:artist:6gHYtzSvIIWvoUY2q2V3Rj,spotify:album:3c1gpGKfahKOFXtM7eIcFO
2,2,spotify:track:6lOWUS7iBVEw6ieJqxH17B,0.799,0.893,8,-5.496,1,0.0635,1.8e-05,0.914,0.362,0.529,128.009,469397,4,spotify:artist:7kxOVclB0zQamtBR0syCrg,spotify:album:0mX83KZvGWqBOvI4nIZ23H
3,3,spotify:track:3OSL6hJ9DoRGwr9OSEVrRS,0.789,0.548,4,-7.167,1,0.0418,0.707,0.000934,0.0616,0.593,117.713,153893,4,spotify:artist:2sxmKe3CUrWnx7eoXMhOlW,spotify:album:4hXCM8vqLJnlFcuHoH3zVP
4,4,spotify:track:6x0bgGOKckFsesVf7yPWJq,0.728,0.67,3,-7.912,0,0.102,0.157,0.000797,0.108,0.0647,119.963,196000,4,spotify:artist:6PyeXqjH8OMGnt1IOhWgrQ,spotify:album:1pjNAADvPDurRS42fqxN4k


In [11]:
# Artist-level genre matrix: an `artist_uri` column followed by one numeric column per genre.
genres = pd.read_csv('../data-processed/full-data/genres_by_artist.csv')

In [12]:
##
genres.head()

Unnamed: 0,artist_uri,21st century classical,432hz,8-bit,8d,a cappella,aarhus indie,aberdeen indie,abstract,abstract beats,...,zim hip hop,zim urban groove,zimdancehall,zither,zolo,zouglou,zouk,zouk riddim,zurich indie,zydeco
0,spotify:artist:0001ZVMPt41Vwzt1zsmuzp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,spotify:artist:0001wHqxbF2YYRQxGdbyER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,spotify:artist:000Dq0VqTZpxOP6jQMscVL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,spotify:artist:000spuc3oKgwYmfg5IE26s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,spotify:artist:000xagx3GkcunHTFdB4ly0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# data_genres = data.merge(genres, how='left', left_on='artist_uri', right_on='artist_uri')

In [14]:
# data_genres.shape

# PCA on genres

In [15]:
# Drop the first column (`artist_uri`) so only the genre columns are fed to PCA.
X_genres = genres.iloc[:,1:]

In [16]:
X_genres.shape

(122142, 4967)

In [17]:
X_genres.values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Compress the ~5k sparse genre columns into 800 principal components
# (the printed ratio below is the share of variance those components retain).
# Fix the seed: for an input of this size scikit-learn may select the randomized
# SVD solver, which is stochastic, so without `random_state` the components (and
# every file derived from them) change from run to run.
pca = PCA(n_components=800, random_state=42)
pca = pca.fit(X_genres.values)
print(sum(pca.explained_variance_ratio_))

0.6067027958450439


In [19]:
# Project the genre matrix onto the fitted components. Pass the raw ndarray, as
# was done for fit(): handing a DataFrame to an estimator fitted on an ndarray
# triggers scikit-learn's "X has feature names" warning. The result is unchanged.
X_genres_pca = pca.transform(X_genres.values)

In [20]:
# Wrap the projected matrix in a DataFrame; columns default to the integer component indices.
genres_pca_df = pd.DataFrame(X_genres_pca)

In [21]:
genres_pca_df.shape

(122142, 800)

In [22]:
genres_pca_df.shape, genres.shape

((122142, 800), (122142, 4968))

In [23]:
# Re-attach the artist key. Assignment aligns on the index; both frames carry the
# default integer index and PCA preserved the row order of `genres`.
genres_pca_df['artist_uri'] = genres['artist_uri']

In [24]:
# Prefix every column label with 'pca_genre_'. This includes the key column,
# so 'artist_uri' becomes 'pca_genre_artist_uri'.
new_cols = [f'pca_genre_{label}' for label in genres_pca_df.columns]

In [25]:
# Apply the prefixed labels held in `new_cols` to the frame.
genres_pca_df.columns = new_cols

In [26]:
genres_pca_df.head()

Unnamed: 0,pca_genre_0,pca_genre_1,pca_genre_2,pca_genre_3,pca_genre_4,pca_genre_5,pca_genre_6,pca_genre_7,pca_genre_8,pca_genre_9,...,pca_genre_791,pca_genre_792,pca_genre_793,pca_genre_794,pca_genre_795,pca_genre_796,pca_genre_797,pca_genre_798,pca_genre_799,pca_genre_artist_uri
0,-0.01428,-0.01822,0.000345,-0.002941,-0.004096,-0.003106,-0.01127,-0.014823,-0.002936,-0.004451,...,-0.000863,0.000508,0.004983,-0.019387,0.001351,-0.010258,0.016658,0.003399,-0.0012,spotify:artist:0001ZVMPt41Vwzt1zsmuzp
1,-0.015368,-0.019184,0.000772,-0.005146,-0.004233,-0.001388,-0.01045,-0.009148,-0.001779,-0.003654,...,-0.001587,-0.000826,0.018523,-0.03401,0.013823,0.005342,0.033496,-0.010388,0.002776,spotify:artist:0001wHqxbF2YYRQxGdbyER
2,-0.018196,-2.4e-05,0.033606,-0.030348,0.004702,0.003801,-0.006495,-0.00346,-0.000118,-0.003463,...,-0.007864,-0.018153,-0.000497,-0.002984,-0.000509,-0.012474,0.002996,-0.017271,-0.003976,spotify:artist:000Dq0VqTZpxOP6jQMscVL
3,-0.014593,-0.018131,0.000744,-0.004695,-0.003859,-0.001276,-0.009498,-0.008311,-0.001632,-0.003259,...,-0.003955,-0.000681,-0.024163,0.001024,-0.008494,-0.000973,0.007799,-0.029993,0.020919,spotify:artist:000spuc3oKgwYmfg5IE26s
4,-0.014374,-0.017789,0.000655,-0.004391,-0.003389,-0.001136,-0.009248,-0.007998,-0.00162,-0.003173,...,0.000559,0.001915,0.002472,-0.001066,0.000481,0.00035,0.000676,0.001083,0.001606,spotify:artist:000xagx3GkcunHTFdB4ly0


In [27]:
# Persist the artist-level PCA genre features without writing the row index.
genres_pca_df.to_csv(
    '../data-processed/transformation-matrices/cb_genres_pca_df.csv',
    index=False,
)

# Plot data

In [28]:
# data.iloc[:,1:-1].hist(figsize=(14,9))
# uncomment
# plt.show()

In [29]:
# fig, ax = plt.subplots(ncols=4, nrows=3, figsize=(14,9))

# column = data.columns[2:14]

#uncomment
# for i in range(3):
#     for j in range(4):
#         sns.boxplot(data=data[column[i*4+j]], palette="Set1", ax=ax[i,j]).set_title(column[i*4+j])

# Standardize data 

In [30]:
# Select the 11 audio-feature columns (danceability ... tempo) by position;
# track_id and uri come before this slice, duration_ms and later columns after it.
X = data.iloc[:,2:13]

In [31]:
#column orders
data.iloc[:,2:13].columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')

In [32]:
# Standardize each audio feature with StandardScaler's defaults
# (subtract the column mean, divide by the column standard deviation).
scaler = StandardScaler()
# fit() returns the fitted estimator, so `transformer` refers to the same object as `scaler`.
transformer = scaler.fit(X)
X_transformed = transformer.transform(X)

In [33]:
# uncomment
# pd.DataFrame(X_transformed).hist(figsize=(14,9))
# plt.show()

# Target encode album_uri and artist_uri

In [34]:
X_transformed.shape

(2262190, 11)

In [35]:
# Re-wrap the standardized matrix in a DataFrame, restoring the original
# audio-feature column names (positions 2..12 of `data`).
df_X_transformed = pd.DataFrame(
    data=X_transformed,
    columns=data.columns[2:13],
)

In [36]:
df_X_transformed.shape

(2262190, 11)

In [37]:
df_X_transformed.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')

# Encode album_uri

In [38]:
# Audio features used as targets when encoding album_uri, in column order.
cols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

In [39]:
# Carry the album key alongside the standardized features.
df_X_transformed['album_uri'] = data.album_uri
# (n_rows, 1) array of album URIs: the single categorical column handed to the encoder.
# NOTE: this rebinds `X`, which previously held the audio-feature frame.
X = data['album_uri'].values.reshape(-1,1)

In [40]:
# Target-encode album_uri once per audio feature: for every feature in `cols`,
# the encoder is re-fitted with that standardized feature as the target and the
# encoded album column is stored as `album_uri_<feature>_enc`.
# NOTE(review): presumably the encoded value is a smoothed per-album mean of the
# feature under TargetEncoder's defaults -- confirm against category_encoders docs.
enc = TargetEncoder()
for col in cols:
    # Column vector of the current feature, used as the encoding target.
    y = df_X_transformed[col].values.reshape(-1,1)
    df_X_transformed[f'album_uri_{col}_enc'] = enc.fit_transform(X, y)

In [41]:
# df_X_transformed.head()

# Encode artist_uri

In [42]:
# Audio features used as targets when encoding artist_uri, in column order.
cols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

In [43]:
# Carry the artist key alongside the standardized features.
df_X_transformed['artist_uri'] = data.artist_uri
# (n_rows, 1) array of artist URIs: the single categorical column handed to the encoder.
# NOTE: this rebinds `X` again, replacing the album URI array.
X = data['artist_uri'].values.reshape(-1,1)

In [44]:
# Target-encode artist_uri once per audio feature: for every feature in `cols`,
# the encoder is re-fitted with that standardized feature as the target and the
# encoded artist column is stored as `artist_uri_<feature>_enc`.
# NOTE(review): presumably the encoded value is a smoothed per-artist mean of the
# feature under TargetEncoder's defaults -- confirm against category_encoders docs.
enc = TargetEncoder()
for col in cols:
    # Column vector of the current feature, used as the encoding target.
    y = df_X_transformed[col].values.reshape(-1,1)
    df_X_transformed[f'artist_uri_{col}_enc'] = enc.fit_transform(X, y)

In [45]:
# df_X_transformed.head()

In [46]:
#X_transformed = df_X_transformed.drop(columns=['artist_uri','album_uri']).to_numpy()

In [47]:
df_X_transformed.shape

(2262190, 35)

In [48]:
df_X_transformed.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artist_uri_energy_enc,artist_uri_key_enc,artist_uri_loudness_enc,artist_uri_mode_enc,artist_uri_speechiness_enc,artist_uri_acousticness_enc,artist_uri_instrumentalness_enc,artist_uri_liveness_enc,artist_uri_valence_enc,artist_uri_tempo_enc
0,0.89725,-0.616004,0.488537,-0.337567,-1.377756,-0.53584,0.830192,-0.62609,-0.608544,-0.995276,...,-0.417327,0.417911,-0.252375,-0.851086,-0.514105,0.431692,-0.535989,-0.594633,0.096855,-0.277456
1,-2.218729,-0.54082,-0.073421,-0.5745,0.725818,-0.000277,0.974099,1.33416,-0.42732,-1.493465,...,-1.044545,0.307907,-1.042904,0.425307,-0.391563,0.997275,1.700898,-0.114881,-1.420069,-1.051906
2,1.347035,1.15832,0.769516,0.740272,0.725818,-0.223356,-1.001042,1.983828,0.805424,0.197414,...,0.938618,0.277803,0.257273,-0.442834,-0.028102,-0.930045,1.338008,-0.334937,-0.178686,0.186234
3,1.292844,-0.13859,-0.3544,0.443262,0.725818,-0.411715,0.993851,-0.629341,-0.777124,0.434471,...,0.324209,0.174777,0.478093,0.611874,-0.373967,-0.203324,-0.55263,0.055331,0.641752,0.161622
4,0.962279,0.320027,-0.635379,0.310843,-1.377756,0.110828,-0.558085,-0.629734,-0.532682,-1.522356,...,0.344419,0.095155,0.490695,-0.956923,0.753063,-0.51073,-0.624991,-0.266135,-0.455287,0.179417


In [49]:
# Persist the standardized + target-encoded track features without the row index.
df_X_transformed.to_csv(
    '../data-processed/transformation-matrices/cb_df_X_transformed.csv',
    index=False,
)

# Add Genres PCA

In [50]:
df_X_transformed.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'album_uri', 'album_uri_danceability_enc', 'album_uri_energy_enc',
       'album_uri_key_enc', 'album_uri_loudness_enc', 'album_uri_mode_enc',
       'album_uri_speechiness_enc', 'album_uri_acousticness_enc',
       'album_uri_instrumentalness_enc', 'album_uri_liveness_enc',
       'album_uri_valence_enc', 'album_uri_tempo_enc', 'artist_uri',
       'artist_uri_danceability_enc', 'artist_uri_energy_enc',
       'artist_uri_key_enc', 'artist_uri_loudness_enc', 'artist_uri_mode_enc',
       'artist_uri_speechiness_enc', 'artist_uri_acousticness_enc',
       'artist_uri_instrumentalness_enc', 'artist_uri_liveness_enc',
       'artist_uri_valence_enc', 'artist_uri_tempo_enc'],
      dtype='object')

In [51]:
df_X_transformed

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artist_uri_energy_enc,artist_uri_key_enc,artist_uri_loudness_enc,artist_uri_mode_enc,artist_uri_speechiness_enc,artist_uri_acousticness_enc,artist_uri_instrumentalness_enc,artist_uri_liveness_enc,artist_uri_valence_enc,artist_uri_tempo_enc
0,0.897250,-0.616004,0.488537,-0.337567,-1.377756,-0.535840,0.830192,-0.626090,-0.608544,-0.995276,...,-0.417327,0.417911,-0.252375,-0.851086,-0.514105,0.431692,-0.535989,-0.594633,0.096855,-0.277456
1,-2.218729,-0.540820,-0.073421,-0.574500,0.725818,-0.000277,0.974099,1.334160,-0.427320,-1.493465,...,-1.044545,0.307907,-1.042904,0.425307,-0.391563,0.997275,1.700898,-0.114881,-1.420069,-1.051906
2,1.347035,1.158320,0.769516,0.740272,0.725818,-0.223356,-1.001042,1.983828,0.805424,0.197414,...,0.938618,0.277803,0.257273,-0.442834,-0.028102,-0.930045,1.338008,-0.334937,-0.178686,0.186234
3,1.292844,-0.138590,-0.354400,0.443262,0.725818,-0.411715,0.993851,-0.629341,-0.777124,0.434471,...,0.324209,0.174777,0.478093,0.611874,-0.373967,-0.203324,-0.552630,0.055331,0.641752,0.161622
4,0.962279,0.320027,-0.635379,0.310843,-1.377756,0.110828,-0.558085,-0.629734,-0.532682,-1.522356,...,0.344419,0.095155,0.490695,-0.956923,0.753063,-0.510730,-0.624991,-0.266135,-0.455287,0.179417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262185,1.016470,-0.142349,-0.916358,-0.065263,-1.377756,1.013559,0.333573,1.855039,0.268074,0.734495,...,-0.285667,-0.143666,0.237857,-0.325969,1.037483,0.261660,-0.406749,-0.257554,0.065922,-0.496326
2262186,-0.219083,-0.104758,-1.197337,0.785242,0.725818,-0.330122,-0.716100,1.723388,-0.881960,0.697455,...,0.270450,0.001725,0.183347,-0.130288,-0.321674,-0.505742,1.356075,-0.104536,0.468690,0.391139
2262187,-0.408751,0.132069,1.612454,0.811014,0.725818,-0.317101,-0.244876,-0.632015,-0.584837,-0.147058,...,0.622563,-0.494828,0.859717,0.515397,-0.387015,-0.153716,-0.631937,-0.379491,0.669592,-0.084814
2262188,0.658810,0.534300,-0.073421,0.563062,0.725818,-0.445567,0.432332,-0.628838,-0.727077,0.423359,...,-0.052247,-0.126105,0.240723,0.397135,-0.492738,-0.031547,-0.489509,-0.433329,-0.169433,0.119670


In [4]:
# Reload the track-level features written earlier in this notebook.
# NOTE(review): the execution counter restarts here, so this section was presumably
# run in a fresh kernel that only needs the saved CSVs -- confirm.
df_X_transformed = pd.read_csv('../data-processed/transformation-matrices/cb_df_X_transformed.csv')

In [9]:
df_X_transformed.shape

(2262190, 35)

In [5]:
# Reload the artist-level PCA genre features (800 components plus the
# `pca_genre_artist_uri` key column).
genres_pca_df = pd.read_csv('../data-processed/transformation-matrices/cb_genres_pca_df.csv')

In [10]:
genres_pca_df.shape

(122142, 801)

In [6]:
# Attach each track's artist-level genre components. A left join keeps every
# track; artists absent from the genre table yield NaN in the joined columns.
df_merged = df_X_transformed.merge(
    right=genres_pca_df,
    how='left',
    left_on='artist_uri',
    right_on='pca_genre_artist_uri',
)

In [7]:
df_merged.shape

(2262190, 836)

In [8]:
df_merged.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,pca_genre_791,pca_genre_792,pca_genre_793,pca_genre_794,pca_genre_795,pca_genre_796,pca_genre_797,pca_genre_798,pca_genre_799,pca_genre_artist_uri
0,0.89725,-0.616004,0.488537,-0.337567,-1.377756,-0.53584,0.830192,-0.62609,-0.608544,-0.995276,...,0.014805,0.020956,-0.012763,0.016099,-0.013033,0.071684,0.053541,-0.013224,-0.023828,spotify:artist:1vSHzGHsVOCrgPSCmKNimP
1,-2.218729,-0.54082,-0.073421,-0.5745,0.725818,-0.000277,0.974099,1.33416,-0.42732,-1.493465,...,,,,,,,,,,
2,1.347035,1.15832,0.769516,0.740272,0.725818,-0.223356,-1.001042,1.983828,0.805424,0.197414,...,-0.000283,-0.008057,0.009964,0.006024,-0.013305,0.009614,0.005862,0.033953,0.007343,spotify:artist:7kxOVclB0zQamtBR0syCrg
3,1.292844,-0.13859,-0.3544,0.443262,0.725818,-0.411715,0.993851,-0.629341,-0.777124,0.434471,...,-0.008368,-0.000212,0.00213,-0.014916,-0.015205,-0.000168,0.002519,-0.008507,-0.013943,spotify:artist:2sxmKe3CUrWnx7eoXMhOlW
4,0.962279,0.320027,-0.635379,0.310843,-1.377756,0.110828,-0.558085,-0.629734,-0.532682,-1.522356,...,-0.000859,0.001006,-0.002154,0.003383,0.000224,-0.001266,0.00158,-0.00322,0.003751,spotify:artist:6PyeXqjH8OMGnt1IOhWgrQ


In [11]:
# Tracks whose artist has no row in the genre table come out of the left merge as
# NaN; treat them as all-zero genre components. fillna(0) applies to every column,
# so the string key column `pca_genre_artist_uri` also receives 0 for those rows.
df_merged = df_merged.fillna(0)

In [12]:
# Persist the merged track + genre feature table without the row index.
df_merged.to_csv(
    '../data-processed/transformation-matrices/cb_df_merged.csv',
    index=False,
)

In [None]:
# Build the numeric feature matrix for similarity computations.
# All three identifier columns must go: besides `artist_uri` and `album_uri`, the
# merge also brought in `pca_genre_artist_uri` (the renamed join key). Leaving it
# in would turn the result into an object array containing URI strings.
# dtype=float makes any remaining non-numeric column fail loudly here rather than
# silently producing an object array.
X_transformed = (
    df_merged
    .drop(columns=['artist_uri', 'album_uri', 'pca_genre_artist_uri'])
    .to_numpy(dtype=float)
)