In [1]:

import numpy as np
import pandas as pd

#######################################

## Last.fm dataset

#######################################

data = pd.read_csv('user_artists.csv')

users  = data['userID'].unique()
artists = data['artistID'].unique()

#print('users: ', len(users)  )
#print('artists: ', len(artists)  )
print(data.head())

#######################################

data_ratings = pd.read_csv('user_artists.csv')
data_artists  = pd.read_csv('artists.csv')

#print(data_artists.head())

######################################

ratings_mat = np.zeros(shape=(17632, 1892), dtype=np.uint8)

#print(ratings_mat.shape)

#####################################



   userID  artistID  weight
0       2        51   13883
1       2        52   11690
2       2        53   11351
3       2        54   10300
4       2        55    8983


In [2]:
ratings_array   = data_ratings.weight.values
artistIds_array = data_ratings.artistID.values
userIds_array   = data_ratings.userID.values

################################################
# map real values to ref values for artist id and user id

artists_uniques_dict_ref_real = {}
artists_uniques_dict_real_ref = {}
artists_uniques = np.unique(  artistIds_array   )
for idx, res in enumerate(artists_uniques):
    artists_uniques_dict_ref_real[idx] = res
    artists_uniques_dict_real_ref[res] = idx
    

users_uniques_dict_ref_real = {}
users_uniques_dict_real_ref = {}
users_uniques = np.unique(  userIds_array   )
for idx, res in enumerate(users_uniques):
    users_uniques_dict_ref_real[idx] = res
    users_uniques_dict_real_ref[res] = idx
      

#print(  artists_uniques_dict_ref_real  )        

################################################

for i in range(   len(ratings_array)    ):
    u = userIds_array[i]
    a = artistIds_array[i]
    
    ref_u = users_uniques_dict_real_ref[u]
    ref_a = artists_uniques_dict_real_ref[a]
    
    ratings_mat[ref_a, ref_u] = ratings_array[i]

################################################


print(  ratings_mat  )


[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0 152]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]


In [3]:
U, S, V = np.linalg.svd(  ratings_mat  )

print(U.shape)
print(S.shape)
print(V.shape)


(17632, 17632)
(1892,)
(1892, 1892)


In [4]:
## goal: to recommend similar music artists

###################################################

## function to calculate the cosine similarity
## it sorts by most similar, and returns the top n


def top_cosine_similarity(data, artist_id, top_n):
    
    index = artist_id
        
    artist_row = data[index, : ]    
    
    similarity = np.dot(artist_row, data.T)   
                     
    ## returns indeces sorted from smallest to the biggest
    sort_indeces = np.argsort(similarity)
                     
    return sort_indeces[:top_n]
                           

###################################################
# sample maps -> Kanye West(325), RHCP(214), Queen(950), Rihanna(282), Disturbed(969), Katy Perry(294), Linkin Park(317),
#                Boston(2448), Michael Jackson(157), Eminem(469), Paramore(492)


def search():
   
    k = 60   ## principal components   
    sliced_matrix = U[:, :k]
    
    try:
        
        val = int(input("1) Search by ArtistID\n2) Search by ArtistName\n"))

        if(val == 1):
            artist_id = int(input("Enter artistID: "))
        elif(val == 2):
            name = str(input("Enter Artist Name: "))
            artist_id = int(data_artists[data_artists['name'] == name].index[0])


        top_indeces = top_cosine_similarity(sliced_matrix, artist_id, top_n=10)

        real_my_id = artists_uniques_dict_ref_real[artist_id]

   ##################### Output ###################### 
    
        print("You searched for similar artists to ",
                '\033[1m' + data_artists[data_artists.id == real_my_id].name.values[0] + "\n")

        print(f"{'Artist':.<20}{'Real ID':.>5}" + '\033[0m')

        for ref in top_indeces:
            real_artist_id = artists_uniques_dict_ref_real[ref] 
            print(f"{data_artists[data_artists.id == real_artist_id].name.values[0]:.<22}{real_artist_id:.>5}")
    except:
        print("The value you entered is invalid or does not exist.")
        search()
        




In [14]:
search()

1) Search by ArtistID
2) Search by ArtistName
2
Enter Artist Name: Led Zepplin
The value you entered is invalid or does not exist.
1) Search by ArtistID
2) Search by ArtistName
2
Enter Artist Name: Led Zeplin
The value you entered is invalid or does not exist.
1) Search by ArtistID
2) Search by ArtistName
2
Enter Artist Name: Metallica
You searched for similar artists to  [1mMetallica

Artist..............Real ID[0m
Franz Ferdinand........1090
Kings of Leon...........228
Placebo.................173
Blur....................203
Ramones................1513
Evanescence.............378
Linkin Park.............377
Michael Jackson.........157
Nightwish...............930
Within Temptation.......518


In [6]:
############### testing ######################
'''
def lookup(): ## I used this lookup function 
    val = int(input("Search by: 1) id 2) name: "))
    if(val == 1):
         # get name given id
        try:
            index = int(input("Give me the REF ID, I'll give you the NAME: "))
            print(index, '-->', data_artists.name[index], '\n')
        except:
            print("Oops! Could not find REF ID.")
    elif(val == 2):
        # get id given name
        try:
            
            name = str(input("Give me the NAME, I'll give you the REF ID: "))
            print(name, '-->', data_artists[data_artists['name'] == name].index[0])
        except:
            print("Oops! Could not find NAME.")
    else:
        print("Invalid selection")
        lookup()
    

lookup()
'''

'\ndef lookup(): ## I used this lookup function \n    val = int(input("Search by: 1) id 2) name: "))\n    if(val == 1):\n         # get name given id\n        try:\n            index = int(input("Give me the REF ID, I\'ll give you the NAME: "))\n            print(index, \'-->\', data_artists.name[index], \'\n\')\n        except:\n            print("Oops! Could not find REF ID.")\n    elif(val == 2):\n        # get id given name\n        try:\n            \n            name = str(input("Give me the NAME, I\'ll give you the REF ID: "))\n            print(name, \'-->\', data_artists[data_artists[\'name\'] == name].index[0])\n        except:\n            print("Oops! Could not find NAME.")\n    else:\n        print("Invalid selection")\n        lookup()\n    \n\nlookup()\n'

In [7]:

################# About the dataset ##################

# subset of a 2011 Last.fm dataset

# last.fm is a service that can be connected to your music streaming service of choice to track activity

# artists.csv (artistID, artistName | 2 x 17000 )
# user_artists.csv (userID, artistID, weight | 3 x 93000)
# weight is like 'rating', but can be considered the number of times a user listened to that artist

# best results were found with 50 - 60 principal components

######################################################
