In [2]:
import pandas as pd
from math import sqrt
import numpy as np

In [34]:
ratings_df = pd.read_csv('Data/BX-Book-Ratings.csv', sep =';', header=0, encoding='latin-1')
books_df = pd.read_csv('Data/BX-Books.csv', sep=';', escapechar='\\', header=0, encoding='latin-1')
user_df = pd.read_csv('Data/BX-Users.csv', sep =';', header=0, encoding='latin-1')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
None


In [88]:
userInput = [{'ISBN':'0971880107', 'Book-Rating':'9'},
             {'ISBN':'0316666343', 'Book-Rating':'6'},
             {'ISBN':'0385504209', 'Book-Rating':'4'},
             {'ISBN':'0060928336', 'Book-Rating':'7'},
             {'ISBN':'0312195516', 'Book-Rating':'5'}]
inputBooks = pd.DataFrame(userInput)
print(inputBooks)

         ISBN Book-Rating
0  0971880107           9
1  0316666343           6
2  0385504209           4
3  0060928336           7
4  0312195516           5


In [89]:
ratings_df['ISBN'].value_counts()

0971880107     2502
0316666343     1295
0385504209      883
0060928336      732
0312195516      723
               ... 
1568656386        1
1568656408        1
1569551553        1
1570081808        1
05162443314       1
Name: ISBN, Length: 340556, dtype: int64

In [90]:
userSubset = ratings_df[ratings_df['ISBN'].isin(inputBooks['ISBN'].tolist())]
print(userSubset.groupby('ISBN').count())

            User-ID  Book-Rating
ISBN                            
0060928336      732          732
0312195516      723          723
0316666343     1295         1295
0385504209      883          883
0971880107     2502         2502


In [91]:
userSubsetGroup = userSubset.groupby(['User-ID'])

def take_5_elem(x):
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(11676,        User-ID        ISBN  Book-Rating
45706    11676  0060928336            0
46899    11676  0312195516           10
47223    11676  0316666343            5
48823    11676  0385504209            9
55317    11676  0971880107            6), (16795,        User-ID        ISBN  Book-Rating
78071    16795  0060928336            0
78381    16795  0312195516            9
78485    16795  0316666343           10
78991    16795  0385504209           10
80759    16795  0971880107            0), (35859,         User-ID        ISBN  Book-Rating
157262    35859  0060928336            0
157711    35859  0312195516           10
157955    35859  0316666343            6
158759    35859  0385504209            0
162665    35859  0971880107            0), (46398,         User-ID        ISBN  Book-Rating
203528    46398  0060928336            0
203610    46398  0312195516            0
203626    46398  0316666343            6
203728    46398  0385504209            0
204119    46398  0971880107   

In [92]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='ISBN')
    inputBooks = inputBooks.sort_values(by='ISBN')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputBooks[inputBooks['ISBN'].isin(group['ISBN'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Book-Rating'].astype('int').tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['Book-Rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    


In [93]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0        -0.528194   11676
1        -0.851714   16795
2        -0.372602   35859
3         0.459509   46398
4        -0.854242   52002


In [94]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:100]
print(topUsers.head())

    similarityIndex  userId
15         0.894427    8253
20         0.878310   37950
42         0.800641  167800
22         0.774597   40889
52         0.715626  204864


In [95]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='User-ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  User-ID        ISBN  Book-Rating
0          0.894427    8253     8253  0030119502            0
1          0.894427    8253     8253  0060391626            9
2          0.894427    8253     8253  0060912529            0
3          0.894427    8253     8253  0060915544            9
4          0.894427    8253     8253  0060922532            0
..              ...     ...      ...         ...          ...
95         0.894427    8253     8253  0440127793            9
96         0.894427    8253     8253  0440156998            0
97         0.894427    8253     8253  0440167361           10
98         0.894427    8253     8253  0440211263            0
99         0.894427    8253     8253  044021145X            9

[100 rows x 5 columns]


In [96]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Book-Rating']
print(topUsersRating.head())

   similarityIndex  userId  User-ID        ISBN  Book-Rating  weightedRating
0         0.894427    8253     8253  0030119502            0        0.000000
1         0.894427    8253     8253  0060391626            9        8.049845
2         0.894427    8253     8253  0060912529            0        0.000000
3         0.894427    8253     8253  0060915544            9        8.049845
4         0.894427    8253     8253  0060922532            0        0.000000


In [97]:
#Applies a sum to the topUsers after grouping it up by ISBN
tempTopUsersRating = topUsersRating.groupby('ISBN').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

               sum_similarityIndex  sum_weightedRating
ISBN                                                  
 0375404120              -0.058124            0.000000
 9022906116              -0.528194           -3.697356
0 7336 1053 6            -0.528194            0.000000
0000000000               -0.528194           -4.753744
00000000000              -0.528194           -4.225550


In [98]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['ISBN'] = tempTopUsersRating.index
print(recommendation_df.head(10))

               weighted average recommendation score           ISBN
ISBN                                                               
 0375404120                                     -0.0     0375404120
 9022906116                                      7.0     9022906116
0 7336 1053 6                                   -0.0  0 7336 1053 6
0000000000                                       9.0     0000000000
00000000000                                      8.0    00000000000
0000001042283                                    NaN  0000001042283
0001047868                                       NaN     0001047868
0001055666                                       9.0     0001055666
0001056107                                       8.0     0001056107
0001714600                                       0.0     0001714600


In [102]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df.head(10))

            weighted average recommendation score        ISBN
ISBN                                                         
0590503235                                    inf  0590503235
0061020699                            7851.169150  0061020699
0061093343                            1599.193807  0061093343
1896860982                             873.886957  1896860982
0671729462                             586.549246  0671729462
0394800184                             375.145733  0394800184
0843104287                             332.972725  0843104287
0375506128                             324.327989  0375506128
0679456945                             288.291546  0679456945
0671729403                             288.274623  0671729403


In [104]:
recommended_book=books_df.loc[books_df['ISBN'].isin(recommendation_df['ISBN'])]

recommended_book=recommended_book.loc[~recommended_book.ISBN.isin(userSubset['ISBN'])]

print(recommended_book.head())

          ISBN                                         Book-Title  \
1   0002005018                                       Clara Callan   
3   0374157065  Flu: The Story of the Great Influenza Pandemic...   
5   0399135782                             The Kitchen God's Wife   
7   0671870432                                    PLEADING GUILTY   
10  0771074670                        Nights Below Station Street   

             Book-Author  Year-Of-Publication              Publisher  \
1   Richard Bruce Wright                 2001  HarperFlamingo Canada   
3       Gina Bari Kolata                 1999   Farrar Straus Giroux   
5                Amy Tan                 1991       Putnam Pub Group   
7            Scott Turow                 1993             Audioworks   
10  David Adams Richards                 1988        Emblem Editions   

                                          Image-URL-S  \
1   http://images.amazon.com/images/P/0002005018.0...   
3   http://images.amazon.com/images/P/