In [12]:
import pandas as pd

In [22]:
# Load the tab-separated (userID, artistID, weight) listening records and
# order them by user so that each user's rows are contiguous.
user = pd.read_csv('./data/user_artists_gp6.dat', delimiter='\t').sort_values('userID')
# Only the last expression of a cell is displayed; the former intermediate
# filter on userID == 4 was evaluated and thrown away, so it has been removed.
user.head(20)

Unnamed: 0,userID,artistID,weight
3831,2,61,3923
29764,2,55,8983
17043,2,93,1407
40565,2,100,1315
45098,2,74,2547
46039,2,90,1471
12182,2,89,1519
22763,2,57,5955
35680,2,97,1337
13510,2,84,1740


In [14]:
# Artist metadata (first track, similar artists and up to five music tags per
# artist); the first column of the ';'-separated file is used as the row index.
artists_path = './data/artists_type.csv'
artists = pd.read_csv(artists_path, sep=';', index_col=0)
artists.head()

Unnamed: 0,artist_id,first_music,similar_artists1,similar_artists2,similar_artists3,music_tag1,music_tag2,music_tag3,music_tag4,music_tag5
1,15353,Cara mia,0,0,0,pop,swedish,male vocalists,dance,melodifestivalen
2,16249,SAMBA DE 恋して,0,0,0,jazz,japanese,instrumental,j-jazz,samurai jazz
3,14476,You Can't Teach An Old Dog New Tricks,0,0,0,blues,blues rock,folk,singer-songwriter,country
4,13505,Yo era ninya,9064,0,0,jewish,world,sephardic,turkish,ladino
5,17630,Elektro Hexe,0,0,0,,,,,


In [15]:
# Ids of the 500 artists with the largest total listening weight over all users.
artists_weight = (
    user
    .groupby('artistID')
    .sum()
    .drop('userID', axis=1)
    .sort_values('weight', ascending=False)
    .head(500)
    .index
)
# Displayed in increasing id order for readability (artists_weight itself keeps
# the by-weight order).
artists_weight.sort_values()

Index([    7,    30,    45,    51,    52,    54,    55,    56,    58,    59,
       ...
        6410,  6836,  7091,  7759,  8292,  8308,  9363, 14185, 14987, 15075],
      dtype='int64', name='artistID', length=500)

In [16]:
# Metadata of the most listened artists, indexed by artist id.
# All columns are kept on purpose: get_artists_by_tag selects the tag columns
# by position (from column 4 onwards) and drops the other columns by name.
is_top_artist = artists['artist_id'].isin(artists_weight)
top500 = artists.loc[is_top_artist].set_index('artist_id')
top500.sort_values('artist_id')

Unnamed: 0_level_0,first_music,similar_artists1,similar_artists2,similar_artists3,music_tag1,music_tag2,music_tag3,music_tag4,music_tag5
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7,The Beautiful People,7680,6257,0,industrial metal,industrial,metal,rock,alternative
30,Military Fashion Show,4317,0,3183,synthpop,ebm,electronic,futurepop,industrial
45,Shut Me Up,10024,0,0,industrial,industrial jungle pussy punk,punk,electronic,alternative
51,Invisible,2562,2556,2559,new wave,80s,pop,synth pop,rock
52,Post Houmous,0,0,7492,trip-hop,chillout,lush,electronic,downtempo
...,...,...,...,...,...,...,...,...,...
8292,Anun,9940,1908,0,soundtrack,japanese,instrumental,anime,anime ost
8308,Je te promets,0,3312,0,chanson francaise,french,rock,rock francais,johnny hallyday
9363,Here comes the King,2045,0,9364,instrumental,soundtrack,trailer music,epic,orchestral
14987,I BELIEVE IN,15003,14992,14997,christian,gospel,true story--years ago dear abby wrote of a man...,true story--years ago dear abby wrote of a man...,richard dixon radio


In [17]:
def get_fav_tag(userid):
    """
    This function takes a user id and returns the tags of every artist
    this user listened to

    Arg:
        arg1: id of the user (anything accepted by int())

    Return:
        A pandas Series holding one entry per tag cell of the listened
        artists (duplicates and missing values are kept, nothing is counted here)
    """
    # Ids of the artists listened to by this user, most listened first
    listened_rows = user.loc[user['userID'] == int(userid)]
    listened_ids = listened_rows.sort_values('weight', ascending=False)['artistID'].to_list()

    # Rows of 'artists' for those ids, without the columns that are not tags
    non_tag_columns = ['first_music', 'similar_artists1', 'similar_artists2', 'similar_artists3']
    listened_artists = artists.loc[artists['artist_id'].isin(listened_ids)]
    tag_frame = listened_artists.drop(non_tag_columns, axis=1)

    # Skip the first remaining column (the artist id) and flatten the tag
    # cells row by row into a single Series
    tag_cells = tag_frame.iloc[:, 1:].to_numpy()
    return pd.Series(tag_cells.flatten())

In [18]:
def get_artists_by_tag(tags, userid, n_reco=5):
    """
    This function recommends artists that the user doesn't listen to yet
    and whose tags all match the user's tags

    Arg:
        arg1: pandas Series of the user's tags, one entry per occurrence
              (for instance the value returned by get_fav_tag)
        arg2: id of the user (anything accepted by int())
        arg3: maximum number of artists to return (default: 5)

    Return:
        A DataFrame of rows taken from 'top500', restricted to the tag
        columns plus two computed columns: 'score' (sum, over the artist's
        tags, of how often each tag occurs in 'tags') and 'score_percent'
        (score divided by the summed occurrences of the 5 most frequent
        tags, in percent, rounded). Rows are sorted by decreasing score and
        limited to n_reco. The frame is empty when no artist matches.
    """
    # Number of occurrences of each tag (missing values are ignored by
    # value_counts), most frequent first
    d_tags = tags.value_counts()
    # Reference value for score_percent: total of the 5 highest counts
    # (presumably 5 because every artist has five tag columns - TODO confirm)
    sum_tags = d_tags.iloc[:5].sum()

    # We keep the artists of top500 whose tag columns (from position 4 onwards)
    # ALL belong to the user's tags
    known_tags = d_tags.index.tolist()
    artists_with_tags = top500[top500.iloc[:, 4:].isin(known_tags).all(axis=1)]

    # Ids of the artists this user already listened to (order is irrelevant
    # for a membership test, so no sorting is needed)
    v_artists_listened = user.loc[user['userID'] == int(userid), 'artistID'].to_list()

    # We drop the artists already listened to and keep only the tag columns
    artists_with_tags = artists_with_tags[~artists_with_tags.index.isin(v_artists_listened)].drop(['first_music', 'similar_artists1', 'similar_artists2', 'similar_artists3'], axis=1)

    # Score = sum of the occurrence counts of the artist's tags.
    # Computed column by column with Series.map: a row-wise apply returns a
    # DataFrame (not a Series) when there is no candidate row, which made the
    # 'score' assignment raise for users without any matching artist.
    score = pd.Series(0, index=artists_with_tags.index, dtype='int64')
    for tag_column in artists_with_tags.columns:
        score = score + artists_with_tags[tag_column].map(d_tags)
    artists_with_tags['score'] = score
    artists_with_tags['score_percent'] = (artists_with_tags['score'] / sum_tags * 100).round(0)

    # Best matches first, limited to the requested number of recommendations
    artists_with_tags = artists_with_tags.sort_values('score', ascending=False)
    v_reco = artists_with_tags.head(n_reco)
    return v_reco

In [20]:
# The id can also be asked interactively:
#userid = input("Enter your userID: ")
userid = 2

# `value in series` tests the index labels of the Series, not its values, so
# the membership test is done by comparing against the userID values instead.
if (user['userID'] == int(userid)).any():
    # Tags of the artists the user listened to, then the best matching
    # artists the user has not listened to yet.
    tags = get_fav_tag(userid)
    v_reco = get_artists_by_tag(tags, userid)
    print(v_reco)
else:
    print("userID not in database")

           music_tag1        music_tag2        music_tag3  music_tag4  \
artist_id                                                               
1001         synthpop        electronic               pop         80s   
535               pop             dance  female vocalists     british   
1014              80s          synthpop               pop    new wave   
3502       electronic  female vocalists             dance         pop   
789               pop             dance  female vocalists  australian   

            music_tag5  score  score_percent  
artist_id                                     
1001          new wave     48           91.0  
535         electronic     48           91.0  
1014        electronic     48           91.0  
3502       electronica     46           87.0  
789         electronic     45           85.0  
