In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import warnings
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import os
import time
from sklearn.decomposition import NMF, TruncatedSVD

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

First We will load the two files into Data Frames to reference throughout

In [3]:
songData = pd.read_csv("Data/Music Info.csv")
userData = pd.read_csv("Data/User Listening History.csv")

We want to see a few entries from each dataframe and check the shape of the overall structures.

In [5]:
display(songData.head(5))
display(userData.head(5))
print("Song Data shape: {}".format(songData.shape))
print("user Data shape: {}".format(userData.shape))

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


Unnamed: 0,track_id,user_id,playcount
0,TRIRLYL128F42539D1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
1,TRFUPBA128F934F7E1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
2,TRLQPQJ128F42AA94F,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
3,TRTUCUY128F92E1D24,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
4,TRHDDQG12903CB53EE,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1


Song Data shape: (50683, 21)
user Data shape: (9711301, 3)


First, we will start by updating the song dataframe into something that can be processed mathematically. We will begin by creating a function to split the tags and create a new column for each of them. This is because most of the tags listed are similar to genres.

In [7]:
# split tags into list
def splitFunc(x):
    return str(x).split(", ")
temp = songData['tags'].apply(splitFunc)
print(temp)

0        [rock, alternative, indie, alternative_rock, i...
1        [rock, alternative, indie, pop, alternative_ro...
2        [rock, alternative, alternative_rock, 90s, gru...
3        [rock, alternative, indie, alternative_rock, i...
4        [rock, alternative, indie, alternative_rock, i...
                               ...                        
50678                                                [nan]
50679            [rock, alternative_rock, japanese, cover]
50680                    [metal, metalcore, post_hardcore]
50681                                                [nan]
50682                                     [rock, japanese]
Name: tags, Length: 50683, dtype: object


Below we will get all the unique tags so that we can create a new column for each and know how many colums we are adding.

In [9]:
uniqueTags = set()
for tags in temp:
    for tag in tags:
        uniqueTags.add(tag.lower())
print(uniqueTags)

{'symphonic_metal', 'hardcore', 'experimental', 'death_metal', 'instrumental', 'progressive_rock', 'rnb', 'metal', 'emo', 'thrash_metal', 'country', 'downtempo', 'classic_rock', '90s', 'rock', 'progressive_metal', 'beautiful', '70s', 'punk_rock', 'cover', 'hard_rock', 'doom_metal', '80s', 'alternative', 'polish', 'oldies', 'folk', 'japanese', 'classical', 'post_hardcore', '60s', 'dark_ambient', 'heavy_metal', 'house', '00s', 'nan', 'mellow', 'blues_rock', 'pop', 'post_punk', 'trance', 'german', 'jazz', 'gothic_metal', 'melodic_death_metal', 'techno', 'funk', 'grindcore', 'metalcore', 'rap', 'american', 'reggae', 'guitar', 'chill', 'female_vocalists', 'punk', 'french', 'britpop', 'psychedelic_rock', 'grunge', 'lounge', 'chillout', 'noise', 'ska', 'indie', 'indie_rock', 'avant_garde', 'male_vocalists', 'indie_pop', 'new_age', 'synthpop', 'piano', 'pop_rock', 'trip_hop', 'dance', 'screamo', 'nu_metal', 'british', 'black_metal', 'electro', 'ambient', 'love', 'new_wave', 'blues', 'alternati

If a song has the tag, put a 1 in the associated column

In [11]:
tempSongs = songData.copy()
for row in range(0, len(temp)):
    for column in temp[row]:
        tempSongs.loc[tempSongs.index[row], column] = 1
display(tempSongs)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,new_age,black_metal,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,,,,,,,,,,
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,,,,,,,,,,
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,,,,,,,,,,
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,,,,,,,,,,
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50678,TRQYCFV128F9322F50,Ryusei Rocket,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/d2668a5a3e0b1fda...,0tt1RdeJX1RyuU4hMEZ19T,,,2008,273440,0.438,...,,,,,,,,,,1.0
50679,TRHQCSH128F42724B7,Colors Of The Wind,ACIDMAN,https://p.scdn.co/mp3-preview/8e22a7052ef3ecf7...,3wkdfXGf5JYErW4b35zP2h,"rock, alternative_rock, japanese, cover",,2004,275133,0.351,...,,,,,1.0,,,,,
50680,TRZRODK128F92D68D7,The Revelation,coldrain,https://p.scdn.co/mp3-preview/4b51a813f67e3853...,1gXMORZRGA40PE9rDE9cja,"metal, metalcore, post_hardcore",,2014,254826,0.434,...,,,,,,,,,,
50681,TRGLMEM128F9322F63,Koi no Dependence,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/5a61e031df174666...,1dxMDGvIYHFYgRvmw1uMHG,,,2008,243293,0.513,...,,,,,,,,,,1.0


fill all the entries where a song did not have a tag with 0's

In [13]:
# cell to get rid of NAN values in our newly created columns
tempSongs[list(uniqueTags)] = tempSongs[list(uniqueTags)].fillna(0)

In [14]:
'''# cell for checking if values are updated correctly
newTemp = tempSongs.columns
for column in tempSongs.columns:
    print("{} : {}".format(column, tempSongs.loc[tempSongs.index[0], column]))'''

'# cell for checking if values are updated correctly\nnewTemp = tempSongs.columns\nfor column in tempSongs.columns:\n    print("{} : {}".format(column, tempSongs.loc[tempSongs.index[0], column]))'

once our new columns are added, we can get rid of the tags column

In [16]:
tempSongs = tempSongs.drop(columns=['tags'])

This cell is a note that we are keeping the nan column for now because songs with nan tags may be similar, but it may end up getting dropped in the future

Next, lets check to see which columns have na values and how many there are.

In [19]:
# check for NA values in each column
for column in tempSongs.columns:
    NAcount = tempSongs[column].isna().sum()
    if(NAcount > 0):
        print("{} : {}".format(column, NAcount))
print("Songs DF shape: {}".format(tempSongs.shape))

genre : 28335
Songs DF shape: (50683, 121)


We are now going to check to see what the unique values are in the genre column and if they overlap with the tags we already saw

In [21]:
uniqueGenres = set(str(x).lower() for x in tempSongs['genre'].unique())

print("Unique Genres: {}".format(sorted(uniqueGenres)))
print("Unique Tags: {}".format(sorted(uniqueTags)))


Unique Genres: ['blues', 'country', 'electronic', 'folk', 'jazz', 'latin', 'metal', 'nan', 'new age', 'pop', 'punk', 'rap', 'reggae', 'rnb', 'rock', 'world']
Unique Tags: ['00s', '60s', '70s', '80s', '90s', 'acoustic', 'alternative', 'alternative_rock', 'ambient', 'american', 'avant_garde', 'beautiful', 'black_metal', 'blues', 'blues_rock', 'british', 'britpop', 'chill', 'chillout', 'classic_rock', 'classical', 'country', 'cover', 'dance', 'dark_ambient', 'death_metal', 'doom_metal', 'downtempo', 'drum_and_bass', 'electro', 'electronic', 'emo', 'experimental', 'female_vocalists', 'folk', 'french', 'funk', 'german', 'gothic', 'gothic_metal', 'grindcore', 'grunge', 'guitar', 'hard_rock', 'hardcore', 'heavy_metal', 'hip_hop', 'house', 'idm', 'indie', 'indie_pop', 'indie_rock', 'industrial', 'instrumental', 'j_pop', 'japanese', 'jazz', 'lounge', 'love', 'male_vocalists', 'mellow', 'melodic_death_metal', 'metal', 'metalcore', 'nan', 'new_age', 'new_wave', 'noise', 'nu_metal', 'oldies', 'pia

In [22]:
uniqueGenres- uniqueTags

{'latin', 'new age', 'world'}

Because more than half of the genre column are null and they are just repeats of tags when applicable, we will ensure that the tag column is set to 1 or add a new column and delete the genre column after. Note that the 'new age' genre is already a tag called 'new_age' 

In [24]:
for row in range(0, tempSongs.shape[0]):
    if (str(tempSongs.loc[tempSongs.index[row], 'genre']).lower() == 'new age'):
        tempSongs.loc[tempSongs.index[row], 'new_age'] = 1
    else:
        tempSongs.loc[tempSongs.index[row], str(tempSongs.loc[tempSongs.index[row], 'genre']).lower()] = 1

In [25]:
print("Songs DF shape: {}".format(tempSongs.shape))

Songs DF shape: (50683, 123)


In [26]:
display(tempSongs)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,genre,year,duration_ms,danceability,energy,...,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan,latin,world
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,,2004,222200,0.355,0.918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,,2006,258613,0.409,0.892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,RnB,1991,218920,0.508,0.826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,,2004,237026,0.279,0.664,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,RnB,2008,238640,0.515,0.430,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50678,TRQYCFV128F9322F50,Ryusei Rocket,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/d2668a5a3e0b1fda...,0tt1RdeJX1RyuU4hMEZ19T,,2008,273440,0.438,0.933,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
50679,TRHQCSH128F42724B7,Colors Of The Wind,ACIDMAN,https://p.scdn.co/mp3-preview/8e22a7052ef3ecf7...,3wkdfXGf5JYErW4b35zP2h,,2004,275133,0.351,0.693,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,,
50680,TRZRODK128F92D68D7,The Revelation,coldrain,https://p.scdn.co/mp3-preview/4b51a813f67e3853...,1gXMORZRGA40PE9rDE9cja,,2014,254826,0.434,0.975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
50681,TRGLMEM128F9322F63,Koi no Dependence,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/5a61e031df174666...,1dxMDGvIYHFYgRvmw1uMHG,,2008,243293,0.513,0.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,


Make sure to fill in the na values from our newly added columns with 0

In [28]:
tempSongs[['latin','world']] = tempSongs[['latin','world']].fillna(0)

In [29]:
display(tempSongs)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,genre,year,duration_ms,danceability,energy,...,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan,latin,world
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,,2004,222200,0.355,0.918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,,2006,258613,0.409,0.892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,RnB,1991,218920,0.508,0.826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,,2004,237026,0.279,0.664,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,RnB,2008,238640,0.515,0.430,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50678,TRQYCFV128F9322F50,Ryusei Rocket,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/d2668a5a3e0b1fda...,0tt1RdeJX1RyuU4hMEZ19T,,2008,273440,0.438,0.933,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50679,TRHQCSH128F42724B7,Colors Of The Wind,ACIDMAN,https://p.scdn.co/mp3-preview/8e22a7052ef3ecf7...,3wkdfXGf5JYErW4b35zP2h,,2004,275133,0.351,0.693,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50680,TRZRODK128F92D68D7,The Revelation,coldrain,https://p.scdn.co/mp3-preview/4b51a813f67e3853...,1gXMORZRGA40PE9rDE9cja,,2014,254826,0.434,0.975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50681,TRGLMEM128F9322F63,Koi no Dependence,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/5a61e031df174666...,1dxMDGvIYHFYgRvmw1uMHG,,2008,243293,0.513,0.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
tempSongs = tempSongs.drop(columns=['genre'])

Because milliseconds can vary largely and most users do not care about the difference of milliseconds in songs, we will convert it to a more meaningful measure such as seconds and rename the column

In [32]:
def MStoS(x):
    return round(x/1000)

tempSongs['duration_ms'] = tempSongs['duration_ms'].apply(MStoS)
tempSongs = tempSongs.rename(columns={"duration_ms": "duration_seconds"})

In [33]:
"""for column in sorted(tempSongs.columns):
    print(column)
# this is a statement to show all the song columns and is not needed anymore"""

'for column in sorted(tempSongs.columns):\n    print(column)\n# this is a statement to show all the song columns and is not needed anymore'

In [34]:
display(tempSongs)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,year,duration_seconds,danceability,energy,key,...,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan,latin,world
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,2004,222,0.355,0.918,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,2006,259,0.409,0.892,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,1991,219,0.508,0.826,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,2004,237,0.279,0.664,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,2008,239,0.515,0.430,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50678,TRQYCFV128F9322F50,Ryusei Rocket,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/d2668a5a3e0b1fda...,0tt1RdeJX1RyuU4hMEZ19T,2008,273,0.438,0.933,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50679,TRHQCSH128F42724B7,Colors Of The Wind,ACIDMAN,https://p.scdn.co/mp3-preview/8e22a7052ef3ecf7...,3wkdfXGf5JYErW4b35zP2h,2004,275,0.351,0.693,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50680,TRZRODK128F92D68D7,The Revelation,coldrain,https://p.scdn.co/mp3-preview/4b51a813f67e3853...,1gXMORZRGA40PE9rDE9cja,2014,255,0.434,0.975,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50681,TRGLMEM128F9322F63,Koi no Dependence,アンティック-珈琲店-,https://p.scdn.co/mp3-preview/5a61e031df174666...,1dxMDGvIYHFYgRvmw1uMHG,2008,243,0.513,0.902,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
print(len(tempSongs['artist'].unique()))
''' Too many artists to create a column for each, will base the recomendation system off similarity of music rather than artist
'''

8317


' Too many artists to create a column for each, will base the recomendation system off similarity of music rather than artist\n'

After checking how many unique artists there are for music in our dataframe, it will be too much to create a new column for every single one. We will use the assumption however that music produced by the same artist will have similar features elsewhere such as year, all the tag/genre columns, and many of the song descriptions such as energy and danceability.

In [37]:
print(len(userData['user_id'].unique()))

962037


In [38]:
# create new DF where row index is the user_id, and column index is the track_id

#combined = pd.DataFrame(index = sorted(userData['user_id'].unique()), columns = sorted(tempSongs['track_id'].unique()), dtype=np.uint16)
#display(combined)




The code above generates a dataFrame that is too large to store in memory, therefore we will use coo_matrix for a sparse matrix after reducing the number of users. If we have a new user, we would just have to create a vector for them that includes their own song choices.

In [40]:
UsersPlaycount = userData.groupby(['user_id'])['playcount'].sum()
type(UsersPlaycount)

pandas.core.series.Series

In [41]:
print(UsersPlaycount)

user_id
00000b722001882066dff9d2da8a775658053ea0     1
00001638d6189236866af9bbf309ae6c2347ffdc     1
0000175652312d12576d9e6b84f600caa24c4715     1
00001cf0dce3fb22b0df0f3a1d9cd21e38385372    13
0000267bde1b3a70ea75cf2b2d216cb828e3202b     9
                                            ..
ffffdc6c89988cd6119067769162948eacf8b670    44
ffffe07df4bb5fd929efe42c5728f3a0c1621277     3
fffff3e690fcda840b716ce7249d8935ff3323fc     8
fffff67d54a40927c93d03bd6c816b034b59f087     8
fffff9534445f481b6ab91c345500083d2ce4df1    37
Name: playcount, Length: 962037, dtype: int64


We will use an arbitrary cut-off of 30 songs listened to to determine who to include in our dataframe. This does not mean that they need to listen to 30 different songs, but could include the same song 30 times.

In [43]:
userplaycountminimum = 30
UsersPlaycount = UsersPlaycount[UsersPlaycount >= userplaycountminimum]
print(UsersPlaycount)

user_id
0000f88f8d76a238c251450913b0d070e4a77d19     79
000138e252eea35fd73aaf66a9b34102b695a9c8     37
00020e8ba3f9041deed64ec9c60b26ff6bf41c66     55
00023f6ad10cd247d187b461e6b00b7bf3ebc568    131
00028f3cff4872bff3e9985cfa32e01a8d54e374    124
                                           ... 
ffffbfdc713a612d581f22218e45031b5ce13a1c    113
ffffcfeb0c1b66bd212ea58d918c7dc62fb9c3a5     78
ffffd330940a2a40754ec0383391f55c6129f48b     76
ffffdc6c89988cd6119067769162948eacf8b670     44
fffff9534445f481b6ab91c345500083d2ce4df1     37
Name: playcount, Length: 246177, dtype: int64


As we can see, we reduced our unique users by about 700 thousand by applying this filter. This will make our dataframe smaller and able to run on RAM

In [45]:
uniqueUsersBefore = len(userData['user_id'].unique())
userData = userData[userData['user_id'].isin(UsersPlaycount.keys())]

In [46]:
"""
# this cell is just to display an individual's userData
userData[userData['user_id'] == 'fffff9534445f481b6ab91c345500083d2ce4df1']
"""

"\n# this cell is just to display an individual's userData\nuserData[userData['user_id'] == 'fffff9534445f481b6ab91c345500083d2ce4df1']\n"

We can now create our combined matrix where the rows are the users and the columns are the songs. We will use user_id for users and track_id for songs and store the values as 16bit integers.

In [48]:
combinedMatrix = coo_matrix((len(userData['user_id'].unique()), len(tempSongs['track_id'].unique())), dtype=np.uint16).toarray()

We wil create some dictionaries to store the index for each user and movie in the combined matrix as well as dictionaries to reverse-map the index to the value

In [50]:
id2index = {value : index for index, value in enumerate(sorted(userData['user_id'].unique()))}
# id2index['000138e252eea35fd73aaf66a9b34102b695a9c8'] == 1

In [51]:
song2index = {value : index for index, value in enumerate(sorted(tempSongs['track_id'].unique()))}

In [52]:
index2song = dict((v,k) for k,v in song2index.items())
index2user = dict((v,k) for k,v in id2index.items())
trackIDtoTitle = tempSongs.set_index('track_id')['name'].to_dict()

In [53]:
#song2index['TRAAAED128E0783FAB'] == 1

In [54]:
# tempSongs[tempSongs['track_id'] == 'TRAAADT12903CCC339']

Next we will fill the combined matrix with the percentage that the user listens to an individual song. This will cause it to have a greater weight towards songs that fill up the user's playcount, but when the user begins to listen to many songs an almost equal amount, a single song being listened to one more time than another will not have as large of an impact on reccomendations.

In [56]:
for index, row in userData.iterrows():
    combinedMatrix[id2index[row['user_id']]][song2index[row['track_id']]] = round(row['playcount'] / UsersPlaycount[row['user_id']]) * 100
# dividing by the total user playcount to bring the values closer together

In [57]:
# this is a check to make sure values were updated properly, should equal 20 if worked properly or different value if we scale
# print(combinedMatrix[id2index['fffff9534445f481b6ab91c345500083d2ce4df1']][song2index['TRLHPYP128F4259AB7']])

In [58]:
newTemp = tempSongs.sort_values(by='track_id')
display(newTemp)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,year,duration_seconds,danceability,energy,key,...,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan,latin,world
9639,TRAAADT12903CCC339,Andalucia,Pink Martini,https://p.scdn.co/mp3-preview/37d9c700532305fc...,16uYYyMYpiJsUC9JzZX4Zk,1997,219,0.714,0.521,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9086,TRAAAED128E0783FAB,It's About Time,Jamie Cullum,https://p.scdn.co/mp3-preview/9bb7b58f848a742b...,00ZCW9lUa2bqwpF6rhtaMI,2004,247,0.565,0.507,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6113,TRAAAHD128F42635A5,I'll Be Waiting,Adele,https://p.scdn.co/mp3-preview/5f1c0da2922b0952...,3QEYmDt1FwOkFBFD74cEzZ,2011,242,0.548,0.843,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4625,TRAAAQO12903CD8E1C,Take Time,The Books,https://p.scdn.co/mp3-preview/1de501d7c5968bca...,46lwuNGtO8mKkcFwhzGib1,2003,217,0.720,0.643,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17366,TRAAAZF12903CCCF6B,Break My Stride,Matthew Wilder,https://p.scdn.co/mp3-preview/06ba33dcbe01b551...,0euwOdZrYl2lqo6tJl1zL5,2009,182,0.908,0.695,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37697,TRZZYMU128E0792400,Flutter Girl,Chris Cornell,https://p.scdn.co/mp3-preview/086e8821cd2cf926...,22KU2R34xEWwt8WqeMFTHD,1999,265,0.588,0.835,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37421,TRZZZCL128F428BB80,The Ship of Pills and Needed Things,I Am Ghost,https://p.scdn.co/mp3-preview/331a9b39765bade1...,3bmZLuWJbeZ863SoRuRSwW,2006,222,0.172,0.948,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40155,TRZZZCN128F9317A03,Coils,Robert Rich,https://p.scdn.co/mp3-preview/767efe836ec7a099...,2eBnWoWwDD4nZ00thUVEFQ,2001,365,0.487,0.601,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18144,TRZZZHL128F9329CFB,Day Five: Voices,Ayreon,https://p.scdn.co/mp3-preview/9f62f3a9d9a232a6...,3FgA6RYkNKdHjmYyyTMSJh,2004,430,0.485,0.452,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


We can now drop columns that contain bad values such as links, name of song, track_id (because we already have it stored using the index), and artist name since we did not want to one-hot encode all the unique values.

In [60]:
newTemp = newTemp.drop(columns=['track_id', 'name', 'artist', 'spotify_preview_url', 'spotify_id'])
display(newTemp)

Unnamed: 0,year,duration_seconds,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,death_metal,classical,japanese,melodic_death_metal,grindcore,j_pop,polish,nan,latin,world
9639,1997,219,0.714,0.521,2,-9.828,1,0.0363,0.72700,0.825000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9086,2004,247,0.565,0.507,9,-8.339,1,0.0541,0.70200,0.000066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6113,2011,242,0.548,0.843,2,-2.674,1,0.0374,0.04670,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4625,2003,217,0.720,0.643,10,-7.805,1,0.1000,0.58000,0.072800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17366,2009,182,0.908,0.695,2,-7.680,0,0.0448,0.11800,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37697,1999,265,0.588,0.835,9,-5.497,1,0.0356,0.12100,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37421,2006,222,0.172,0.948,0,-4.194,1,0.1800,0.00173,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40155,2001,365,0.487,0.601,11,-12.319,1,0.0327,0.94700,0.889000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18144,2004,430,0.485,0.452,2,-10.250,1,0.0274,0.12500,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


We should also have a function that will display the rows for the songs that we recomend to the user. In this case we will use the entire row, however in production it would either play the next song, show the link, or show the artist name and song name rather than the entire row

In [62]:
def showPredictions(recomendations):
    for recomendation in recomendations:
        display(songData[songData['name'] == trackIDtoTitle[index2song[recomendation]]])

We will also need a function that that fastly calculate the jaccard similarity between a matrix1 and a matrix2 where matrix 1 is a subset of matrix2. Matrix1 will be a small amount of movies compared to matrix2 which will be all movies.

In [64]:
# this code was obtained as a prompt from chatgpt to make finding the jaccard similarity faster
def jaccard_sim_vectorized(matrix1, matrix2):
    matrix1 = matrix1.values
    matrix2 = matrix2.values
    intersection = np.dot(matrix1, matrix2.T)
    union = np.add(
        np.sum(matrix1, axis=1)[:, None],
        np.sum(matrix2, axis=1)[None, :]
    ) - intersection
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity_matrix = np.divide(intersection, union)
        similarity_matrix[union == 0] = 0
    return similarity_matrix

We will need a large matrix to store the data where it is of size (movies, movies) which is about (50000,500000) and will have to store 32bit float values. This is a large amount of space that can not be done with only RAM, and so we will take advantage of numpy being able to map memory to a disk and read it fast. We will also use cosine similarity for continuous data and take tha jaccard similarity for our tags columns and average the 2 together in order to produce a suedo-similarity that we can use. The weight of each similarity may change as we update the algorithm.

In [66]:
# method for using np.memmap to store cosine similarity matrix found using google and it's AI answer

songSimilarityFile = "songSimilarity.dat"
similarity_shape = (tempSongs.shape[0], tempSongs.shape[0])
similarity_dtype = np.float32
if os.path.exists(songSimilarityFile): #because we have ran this code already and it takes a bit, we will load it in since it is already saved to memory
    similarity_memmap = np.memmap(songSimilarityFile, dtype=similarity_dtype, mode='r', shape=similarity_shape)
    #display(similarity_memmap)
else:
    similarity_memmap = np.memmap(songSimilarityFile, dtype=similarity_dtype, mode='w+', shape=similarity_shape)

    chunk_size = 1000
    for row in range(0, tempSongs.shape[0], chunk_size):
        S = cosine_similarity(newTemp.iloc[row: row + chunk_size, 0:14], newTemp.iloc[:,0:14])
        J = jaccard_sim_vectorized(newTemp.iloc[row: row + chunk_size, 14:], newTemp.iloc[:, 14:])
        retval = (S+J)/2
        similarity_memmap[row:row + chunk_size] = retval

    similarity_memmap.flush()
    del similarity_memmap
    similarity_memmap = np.memmap(songSimilarityFile, dtype=similarity_dtype, mode='r', shape=similarity_shape)
    #display(similarity_memmap)

We need a function that will compute the dot product from the userID to the similarity matrix and return the top scores (recomended songs). In this case, the algorithm can still recomend the most listened to song, however we can change this by simply excluding from our search and enforcing that it can't be one of their top N songs.

In [68]:
# this method uses songs that the user has listened to and provides songs that are similar to those songs.
def predictSongsFromUser(userID, n):
    recItems = np.dot(combinedMatrix[id2index[userID]], similarity_memmap)
    retval = np.argpartition(recItems, -n)[-n:]
    return retval

predictions = predictSongsFromUser('fffff9534445f481b6ab91c345500083d2ce4df1', 4)
showPredictions(predictions)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13890,TRCKGWO12903CBC4DD,Real Wild Child,Iggy Pop,https://p.scdn.co/mp3-preview/5f2f01f78b362af9...,00D2x2aCgoqo1AsnmcpdSI,"rock, punk, 80s, punk_rock, new_wave, post_punk",,2014,213520,0.338,...,9,-12.014,1,0.0549,0.000922,0.257,0.946,0.352,147.492,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
28480,TRAICJY128F92C3FF2,My Perfect Cousin,The Undertones,https://p.scdn.co/mp3-preview/ee4e6cfbfb9ea6c1...,1DxNV3CnFJdsUSRoHAbDAw,"punk, 80s, punk_rock, new_wave, post_punk",,1980,156093,0.514,...,9,-10.525,1,0.0487,0.00448,0.00243,0.0594,0.372,158.661,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
28221,TRPCPCW128F4269098,Second Skin,The Chameleons,https://p.scdn.co/mp3-preview/3145aa54ba8ea9a1...,0E26LBCuwJKKzt9dZdF0g0,"80s, new_wave, post_punk",Punk,2006,483266,0.446,...,4,-7.817,0,0.134,0.000709,0.203,0.871,0.15,127.917,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13741,TRLHPYP128F4259AB7,Hitsville U.K.,The Clash,https://p.scdn.co/mp3-preview/458093f031b22914...,0e6aQYSW4RjoVf2lSlnuBV,"punk, 80s, punk_rock, new_wave, post_punk, ska",Punk,1991,263200,0.618,...,0,-5.709,1,0.0409,0.0375,3e-06,0.214,0.666,99.807,4


for the next part of our recomendation system, we will test using matrix factorization to see if we can produce similar results by finding latent features. To do this, we will need to make a sparse matrix otherwise the algorithm would require too much RAM. We will try different numbers of components as well as Non-negative matrix factorization vs TruncatedSVD and see how each performs.

In [70]:
sparseCombined = csr_matrix(combinedMatrix)

In [71]:
matrixFactor = NMF(n_components=50, init='nndsvd', random_state=42)
W = matrixFactor.fit_transform(sparseCombined)
H = matrixFactor.components_

In [72]:
print("W shape: {}".format(W.shape))
print("H shape: {}".format(H.shape))
# this cell is to show that W is the size of (users, latentFeatures) and H is size (latentFeatures, movies)

W shape: (246177, 50)
H shape: (50, 50683)


In [73]:
def predictFromLatent(userID, n):
    recItems = np.dot(W[id2index[userID]], H)
    retval = np.argpartition(recItems, -n)[-n:]
    return retval

predictions = predictFromLatent('fffff9534445f481b6ab91c345500083d2ce4df1', 4)
showPredictions(predictions)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
7042,TRYRZFF128F933BBBB,I Never Told You,Colbie Caillat,https://p.scdn.co/mp3-preview/eecb5e9728c6493a...,0APq1pwGxJFTBw9mHwwnKx,"pop, female_vocalists, singer_songwriter, acou...",Pop,2009,235053,0.501,...,3,-5.195,1,0.029,0.172,0.0,0.13,0.374,144.203,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13741,TRLHPYP128F4259AB7,Hitsville U.K.,The Clash,https://p.scdn.co/mp3-preview/458093f031b22914...,0e6aQYSW4RjoVf2lSlnuBV,"punk, 80s, punk_rock, new_wave, post_punk, ska",Punk,1991,263200,0.618,...,0,-5.709,1,0.0409,0.0375,3e-06,0.214,0.666,99.807,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
15267,TRRVJCK12903CD2DCB,U Smile,Justin Bieber,https://p.scdn.co/mp3-preview/b9b8a73b110ac5a4...,0KDJBhhe2OYnnoJtbtXy1f,"pop, hard_rock, black_metal, death_metal, heav...",Pop,2010,196906,0.705,...,1,-5.252,1,0.03,0.516,2e-06,0.117,0.419,112.505,3


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
98,TRFOVTO128F4228CC3,Fortunate Son,Creedence Clearwater Revival,https://p.scdn.co/mp3-preview/3136857b0f683af8...,02W3Xt0zd4cXTQ923Cx9Q3,"rock, classic_rock, 60s",Rock,2005,138052,0.631,...,0,-3.905,1,0.0381,0.0562,0.26,0.201,0.45,132.498,4


In [74]:
matrixFactor2 = NMF(n_components=100, init='nndsvd', random_state = 42)
W = matrixFactor2.fit_transform(sparseCombined)
H = matrixFactor2.components_

predictions = predictFromLatent('fffff9534445f481b6ab91c345500083d2ce4df1', 4)
showPredictions(predictions)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
2498,TRMYZFA128F146DE4E,Rollin' & Scratchin',Daft Punk,https://p.scdn.co/mp3-preview/3b6b140e0b84d3c0...,3Wi8fiEg2RkoPaC3PGIQEv,"electronic, dance, 90s, house, techno, electro...",,2006,448626,0.823,...,1,-4.862,1,0.0417,0.0011,0.639,0.0524,0.207,130.038,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
16090,TRYEGSH12903CD2DCE,Overboard,Justin Bieber,https://p.scdn.co/mp3-preview/79a899f597b18c3f...,0cfsbkanGUO3yzXCxA0iuF,"pop, black_metal, death_metal, industrial, thr...",,2010,251186,0.48,...,5,-5.214,1,0.0501,0.203,0.0,0.118,0.468,173.907,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
38759,TRBZKDX128F4255A4A,Por Besarte,Lu,https://p.scdn.co/mp3-preview/f295fcc4a4baec43...,0JCLAFZZENDUGXVZrV9Mvy,"pop, guitar",Pop,2011,217400,0.443,...,6,-4.433,0,0.0422,0.193,0.0,0.0717,0.525,176.06,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13741,TRLHPYP128F4259AB7,Hitsville U.K.,The Clash,https://p.scdn.co/mp3-preview/458093f031b22914...,0e6aQYSW4RjoVf2lSlnuBV,"punk, 80s, punk_rock, new_wave, post_punk, ska",Punk,1991,263200,0.618,...,0,-5.709,1,0.0409,0.0375,3e-06,0.214,0.666,99.807,4


In [75]:
matrixFactor3 = TruncatedSVD(n_components=100, random_state = 42)
W = matrixFactor2.fit_transform(sparseCombined)
H = matrixFactor2.components_

predictions = predictFromLatent('fffff9534445f481b6ab91c345500083d2ce4df1', 4)
showPredictions(predictions)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
2498,TRMYZFA128F146DE4E,Rollin' & Scratchin',Daft Punk,https://p.scdn.co/mp3-preview/3b6b140e0b84d3c0...,3Wi8fiEg2RkoPaC3PGIQEv,"electronic, dance, 90s, house, techno, electro...",,2006,448626,0.823,...,1,-4.862,1,0.0417,0.0011,0.639,0.0524,0.207,130.038,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
16090,TRYEGSH12903CD2DCE,Overboard,Justin Bieber,https://p.scdn.co/mp3-preview/79a899f597b18c3f...,0cfsbkanGUO3yzXCxA0iuF,"pop, black_metal, death_metal, industrial, thr...",,2010,251186,0.48,...,5,-5.214,1,0.0501,0.203,0.0,0.118,0.468,173.907,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
38759,TRBZKDX128F4255A4A,Por Besarte,Lu,https://p.scdn.co/mp3-preview/f295fcc4a4baec43...,0JCLAFZZENDUGXVZrV9Mvy,"pop, guitar",Pop,2011,217400,0.443,...,6,-4.433,0,0.0422,0.193,0.0,0.0717,0.525,176.06,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
13741,TRLHPYP128F4259AB7,Hitsville U.K.,The Clash,https://p.scdn.co/mp3-preview/458093f031b22914...,0e6aQYSW4RjoVf2lSlnuBV,"punk, 80s, punk_rock, new_wave, post_punk, ska",Punk,1991,263200,0.618,...,0,-5.709,1,0.0409,0.0375,3e-06,0.214,0.666,99.807,4


hitsville U.K is selected whether we choose 50 latent features or 100 as 
well as in the song-song similarity matrix approach.
Lets check some of the other songs features to see their features compared to hitsville U.K

It appears from a visual inspection that most of the songs recomended by matrix factorization are not similar to Hitsville U.K and provide mostly vastly different results. This could imply that other people that listen to hitsville U.K also listen to the other music and that there are similarities between these songs that we can not see directly. Our earlier algorithm though with similarities between songs is intuitive and we can see that the songs appear to sound similar by subjective inspection.

A way to verify if our models are working correctly would be to attempt to reccomend the songs to users. If they skip the song then they do not like it and our algorithm incorrectly reccomended a song. If they decide to listen to the song or even relisten to it, then our algorithm is working properly. We can take the feedback that we learn from this approach to fine tune our model further, however in this case it was static data obtained through kaggle and therefore we can not measure our model's effectiveness.

Lastly, the algorithm could be fine tuned by changing the song similarity metrics used. In our case we simply used a cosine similarity function and jaccard similarity function, however there might be a better way to measure weighted distances between certain columns that someone who studies music might know that would allow us a better performing algorithm.

In [80]:
# example using Mr brightside as the only song listened to because I like the song
Myself = np.zeros((tempSongs.shape[0]), dtype=np.int16)
Myself[song2index['TRIOREW128F424EAF0']] = 50 # setting Mr brightside as listened to 50 times
recItems = np.dot(Myself, similarity_memmap)
retval = np.argpartition(recItems, -4)[-4:]
showPredictions(retval)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1091,TRLVRWW128F4293DDD,The Bucket,Kings of Leon,https://p.scdn.co/mp3-preview/1d06d8ee7914a860...,0hpE9ebmDck3MzARkvPvZr,"rock, alternative, indie, alternative_rock, in...",,2004,175546,0.498,...,0,-6.04,1,0.0328,0.0837,4.6e-05,0.194,0.18,139.832,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
367,TRGRGQX128F9313DF4,Notion,Kings of Leon,https://p.scdn.co/mp3-preview/7cf9be1d6efd05fa...,0sy2eLJB1o7Fa8kAc7gVzW,"rock, alternative, indie, alternative_rock, in...",,2009,181493,0.451,...,4,-5.071,1,0.0445,0.00585,1.5e-05,0.121,0.402,142.485,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1875,TRFUCYR128F92DC67F,California Waiting,Kings of Leon,https://p.scdn.co/mp3-preview/83cf2e57ec31237d...,0txCPGqVRdntoHGc1ZcF6t,"rock, alternative, indie, alternative_rock, in...",,2003,208840,0.544,...,2,-4.079,1,0.0352,0.00639,7.1e-05,0.359,0.558,148.857,4


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
