In [None]:
import pandas as pd
from math import sqrt
import numpy as np


In [None]:
# Load the artist catalogue and the per-user listening records.
# read_table uses a tab separator by default.
music_df = pd.read_table('artists.dat')
ratings_df = pd.read_table('user_artists.dat')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
music_df.info()
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17632 entries, 0 to 17631
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          17632 non-null  int64 
 1   name        17632 non-null  object
 2   url         17632 non-null  object
 3   pictureURL  17188 non-null  object
dtypes: int64(1), object(3)
memory usage: 551.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92834 entries, 0 to 92833
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   userID    92834 non-null  int64
 1   artistID  92834 non-null  int64
 2   weight    92834 non-null  int64
dtypes: int64(3)
memory usage: 2.1 MB
None


In [None]:
# Ratings supplied by the user we want recommendations for: one record per
# artist, holding the artist name and the rating given to that artist.
userInput = [
    {'name': artist, 'rating': score}
    for artist, score in (
        ('Daft Punk', 5),
        ('Kanye West', 1),
        ('Chris Brown', 1),
        ('Jonas Brothers', 5),
        ('Coldplay', 4.5),
    )
]

# Tabular view of the same records (columns: name, rating)
inputMusics = pd.DataFrame(userInput)
print(inputMusics)

             name  rating
0       Daft Punk     5.0
1      Kanye West     1.0
2     Chris Brown     1.0
3  Jonas Brothers     5.0
4        Coldplay     4.5


In [None]:
# Catalogue rows whose name matches one of the artists the user rated
inputId = music_df[music_df['name'].isin(inputMusics['name'].tolist())]

# Merge on the shared column(s) so every rating gets its catalogue id
inputMusics = pd.merge(inputId, inputMusics)

# Drop by keyword: the positional axis form, drop('pictureURL', 1), is
# deprecated and is rejected by pandas 2.x.
inputMusics = inputMusics.drop(columns='pictureURL') #we don't really need this at the moment

# Keep only the columns used by the rest of the analysis
inputMusics = inputMusics[['id','name','rating']]
print(inputMusics)

    id            name  rating
0   56       Daft Punk     5.0
1   65        Coldplay     4.5
2  321  Jonas Brothers     5.0
3  327     Chris Brown     1.0
4  331      Kanye West     1.0


  inputMusics = inputMusics.drop('pictureURL', 1) #we don't really need this at the moment


In [None]:
# Keep only the listening records that concern an artist the input user rated
ratedArtistIds = inputMusics['id'].tolist()
concernsRatedArtist = ratings_df['artistID'].isin(ratedArtistIds)
userSubset = ratings_df[concernsRatedArtist]

# Number of records per rated artist
print(userSubset.groupby('artistID').count())

          userID  weight
artistID                
56           151     151
65           369     369
321           82      82
327           79      79
331          134     134


In [None]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#Group by the scalar label rather than a one-element list: with ['userID'] newer pandas
#yields tuple keys such as (905,) when the groups are iterated, instead of the plain userID.
userSubsetGroup = userSubset.groupby('userID')

def take_5_elem(x):
    """Return the length of the second item of ``x``.

    Used as a sort key over ``(key, group)`` pairs so that the pairs can be
    ordered by the size of their group.
    """
    group_rows = x[1]
    return len(group_rows)
    

#Order the (userID, group) pairs so that users sharing the most artists with the input come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=take_5_elem, reverse=True)

#Keep only the first 100 pairs and show the leading five
userSubsetGroup = userSubsetGroup[:100]
print(userSubsetGroup[:5])

[(905,        userID  artistID  weight
41002     905        65     369
41018     905       321     112
41019     905       327     115
41021     905       331     761), (1053,        userID  artistID  weight
47391    1053        56     144
47392    1053        65     221
47407    1053       327     180
47409    1053       331     448), (7,      userID  artistID  weight
287       7       321     674
293       7       327     587
297       7       331     516), (43,       userID  artistID  weight
1965      43        56    2691
1968      43        65     635
1986      43       331     133), (120,       userID  artistID  weight
5665     120        65      48
5679     120       327     144
5680     120       331      26)]


In [None]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the input once, before the loop: it does not depend on the current user, and
#sorting by id lines its ratings up with each user's group sorted by artistID below
inputMusics = inputMusics.sort_values(by='id')

#For every user group in our subset
for name, group in userSubsetGroup:

    #Sort the current user group so its values pair up with the sorted input
    group = group.sort_values(by='artistID')

    #Get the N for the formula
    nRatings = float(len(group))

    #Get the input ratings for the artists that both have in common
    temp_df = inputMusics[inputMusics['id'].isin(group['artistID'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group weights in a list format
    tempGroupList = group['weight'].tolist()

    #Each sum is needed more than once below, so compute it a single time
    sumRatings = sum(tempRatingList)
    sumWeights = sum(tempGroupList)

    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum(i**2 for i in tempRatingList) - pow(sumRatings, 2)/nRatings
    Syy = sum(i**2 for i in tempGroupList) - pow(sumWeights, 2)/nRatings
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumRatings*sumWeights/nRatings

    #Only divide when both spreads are strictly positive, else 0 correlation.
    #A "!= 0" test would let a tiny negative rounding residue through, which either
    #makes sqrt() raise ValueError or yields a huge meaningless coefficient.
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [None]:
# One row per user: the dictionary keys (user ids) start out as the index
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')

# Remember the user ids and the row count before the index is replaced
similarityUserIds = pearsonDF.index
similarityRowCount = len(pearsonDF)

# Name the coefficient column, switch to a plain 0..n-1 index and
# expose the user ids as a regular column
pearsonDF.columns = ['similarityIndex']
pearsonDF.index = range(similarityRowCount)
pearsonDF['userID'] = similarityUserIds
print(pearsonDF.head())

   similarityIndex  userID
0        -0.403332     905
1        -0.573968    1053
2         0.893732       7
3         0.734427      43
4        -0.340440     120


In [None]:
# Rank users from most to least similar and keep the first 50
usersBySimilarity = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = usersBySimilarity[:50]
print(topUsers.head())

    similarityIndex  userID
50              1.0     413
23              1.0       2
79              1.0    1012
58              1.0     496
29              1.0     100


In [None]:

# Attach every listening record of the top users to their similarity score
topUsersRating = pd.merge(topUsers, ratings_df, on='userID', how='inner')
print(topUsersRating.head(100))

     similarityIndex  userID  artistID  weight
95               1.0       2        96    1342
96               1.0       2        97    1337
97               1.0       2        98    1332
98               1.0       2        99    1330
99               1.0       2       100    1315
100              1.0    1012         7     149
101              1.0    1012        45     547
102              1.0    1012        56      99
103              1.0    1012       173     134
104              1.0    1012       190    1442


In [None]:
#Weight every record by the similarity of the user it comes from, so that the
#summed value really is a similarity-weighted rating rather than a plain sum of weights
weightedTopUsersRating = topUsersRating.assign(
    weightedRating=topUsersRating['similarityIndex'] * topUsersRating['weight']
)

#Applies a sum to the top users' records after grouping them by artistID
tempTopUsersRating = weightedTopUsersRating.groupby('artistID')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
artistID                                         
6                         1.0                  95
7                         2.0                 546
9                         2.0                 263
16                        1.0                  88
30                        2.0                6918


In [None]:

#A similarity sum of zero makes the division below produce inf (which then sorts to
#the top of the ranking), and a negative sum flips the sign of the score, so only
#artists with a strictly positive similarity sum are scored
scorableArtists = tempTopUsersRating[tempTopUsersRating['sum_similarityIndex'] > 0]

#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = scorableArtists['sum_weightedRating']/scorableArtists['sum_similarityIndex']
recommendation_df['artistID'] = scorableArtists.index
print(recommendation_df.head(10))

          weighted average recommendation score  artistID
artistID                                                 
6                                          95.0         6
7                                         273.0         7
9                                         131.5         9
16                                         88.0        16
30                                       3459.0        30
39                                         89.0        39
45                                        547.0        45
46                                         50.0        46
51                                       7005.5        51
52                                      11690.0        52


In [None]:
# Highest recommendation score first
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)
print(recommendation_df)

          weighted average recommendation score  artistID
artistID                                                 
6619                                        inf      6619
11934                                       inf     11934
13296                                       inf     13296
13292                                       inf     13292
9391                                        inf      9391
...                                         ...       ...
13724                                  5.000000     13724
8555                                   5.000000      8555
3176                                   4.579945      3176
4753                                   4.579945      4753
16048                                  4.579945     16048

[1145 rows x 2 columns]


In [None]:
# Catalogue rows for the artists that received a recommendation score
hasScore = music_df['id'].isin(recommendation_df['artistID'])

# We don't want to recommend an artist that is already part of the user's input
alreadyRated = music_df['id'].isin(userSubset['artistID'])

recommended_artist = music_df.loc[hasScore & ~alreadyRated]

print(recommended_artist)

          id                    name  \
5          6               Moonspell   
6          7          Marilyn Manson   
8          9             Combichrist   
14        16   London After Midnight   
27        30                 And One   
...      ...                     ...   
15246  16044  ShababGamed(Wama4love)   
15247  16045                Steadman   
15248  16046   Cavaleiros do Zodíaco   
15249  16047            Paulo Miklos   
15250  16048                  Santos   

                                                     url  \
5                     http://www.last.fm/music/Moonspell   
6                http://www.last.fm/music/Marilyn+Manson   
8                   http://www.last.fm/music/Combichrist   
14        http://www.last.fm/music/London+After+Midnight   
27                      http://www.last.fm/music/And+One   
...                                                  ...   
15246  http://www.last.fm/music/ShababGamed%28Wama4lo...   
15247                  http://www.last.