Steps in a user-based recommendation system:

1. Select a user together with the items (here, cellphones) the user has rated
2. Based on the user's ratings of those items, find the top x neighbours (the most similar users)
3. Get the rating records of each neighbour
4. Calculate a similarity score for each neighbour using some formula (Pearson correlation in this notebook)
5. Recommend the items with the highest score

In [20]:
import pandas as pd
from math import sqrt
import numpy as np


In [23]:
# Load the phone catalogue and the user ratings from their CSV files.
cellphones_df = pd.read_csv('cellphones data.csv')
ratings_df = pd.read_csv('cellphones ratings.csv')
# info() prints its summary itself and returns None, so call it directly:
# wrapping it in print() only appends a stray "None" line to the output.
cellphones_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cellphone_id      33 non-null     int64  
 1   brand             33 non-null     object 
 2   model             33 non-null     object 
 3   operating system  33 non-null     object 
 4   internal memory   33 non-null     int64  
 5   RAM               33 non-null     int64  
 6   performance       33 non-null     float64
 7   main camera       33 non-null     int64  
 8   selfie camera     33 non-null     int64  
 9   battery size      33 non-null     int64  
 10  screen size       33 non-null     float64
 11  weight            33 non-null     int64  
 12  price             33 non-null     int64  
 13  release date      33 non-null     object 
dtypes: float64(2), int64(8), object(4)
memory usage: 3.7+ KB
None


In [24]:
# Ratings given by the active user, one record per phone model.
userInput = [
    {'model': phone_model, 'rating': phone_rating}
    for phone_model, phone_rating in (
        ('iPhone 13 Mini', 9),
        ('Pixel 6', 4),
        ('Zenfone 8', 5),
        ('Galaxy A13', 7),
        ('Find X5 Pro', 8),
    )
]
# Turn the records into a two-column frame (model, rating).
inputCellphone = pd.DataFrame(userInput)
print(inputCellphone)

            model  rating
0  iPhone 13 Mini       9
1         Pixel 6       4
2       Zenfone 8       5
3      Galaxy A13       7
4     Find X5 Pro       8


In [25]:
# Catalogue rows whose model name appears in the user's input.
inputId = cellphones_df[cellphones_df['model'].isin(inputCellphone['model'].tolist())]
# Attach the catalogue columns (notably cellphone_id) to the user's ratings.
# pd.merge defaults to an inner join on the shared column(s), so any input
# model that is missing from the catalogue is silently dropped here.
inputCellphone = pd.merge(inputId, inputCellphone)
# Name the axis explicitly with `columns=`: passing the axis positionally
# (drop('brand', 1)) is deprecated and rejected by newer pandas releases.
inputCellphone = inputCellphone.drop(columns='brand') #we don't really need this at the moment
# Keep only the columns the rest of the notebook uses.
inputCellphone = inputCellphone[['cellphone_id','model','rating']]
print(inputCellphone)

   cellphone_id           model  rating
0             1  iPhone 13 Mini       9
1             6       Zenfone 8       5
2             7      Galaxy A13       7
3            22     Find X5 Pro       8


  inputCellphone = inputCellphone.drop('brand', 1) #we don't really need this at the moment


In [28]:
# Ratings from every user, restricted to the phones the input user has rated.
userSubset = ratings_df.loc[
    ratings_df['cellphone_id'].isin(inputCellphone['cellphone_id'].tolist())
]
# Per phone: how many ratings fall in that subset.
print(userSubset.groupby('cellphone_id').count())

              user_id  rating
cellphone_id                 
1                  24      24
6                  30      30
7                  29      29
22                 30      30


In [29]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#Group by the column name itself rather than a one-element list: with a list key, newer pandas
#yields 1-tuples such as (36,) as the group keys, and those keys are used as user ids further down.
userSubsetGroup = userSubset.groupby('user_id')

def take_5_elem(x):
    """Return the number of rating rows in a (user_id, ratings DataFrame) pair."""
    return len(x[1])


#Sorting it so users with the most phones in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

#Only the first 100 users of the sorted list are kept as candidate neighbours
userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


[(36,      user_id  cellphone_id  rating
172       36             6       1
173       36             1       6
176       36            22       4), (178,      user_id  cellphone_id  rating
650      178             1      10
657      178            22       9
658      178             7      10), (194,      user_id  cellphone_id  rating
672      194             1       9
677      194            22       8
679      194             7       8), (6,     user_id  cellphone_id  rating
23        6             1       2
27        6             7       9), (24,     user_id  cellphone_id  rating
74       24            22       7
75       24             6       7)]


In [30]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the input once, before the loop: it is the same for every neighbour
inputCellphone = inputCellphone.sort_values(by='cellphone_id')

#For every user group in our subset
for name, group in userSubsetGroup:

    #Pair each rating of the current user with the input user's rating for the same phone.
    #Joining on cellphone_id (instead of relying on list positions) keeps the two rating
    #lists aligned and equally long even if a user rated the same phone more than once.
    common = group.sort_values(by='cellphone_id').merge(
        inputCellphone[['cellphone_id', 'rating']],
        on='cellphone_id',
        suffixes=('_group', '_input'),
    )

    #Get the N for the formula: the number of rating pairs actually compared
    nRatings = len(common)

    #No phone in common means nothing to correlate (and avoids dividing by zero below)
    if nRatings == 0:
        pearsonCorrelationDict[name] = 0
        continue

    #x: the input user's ratings, y: the current user's ratings, in the same phone order
    tempRatingList = common['rating_input'].tolist()
    tempGroupList = common['rating_group'].tolist()

    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)

    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum(i**2 for i in tempRatingList) - pow(sumX, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sumY, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/float(nRatings)

    #If the denominator is positive, then divide, else, 0 correlation.
    #Sxx and Syy are sums of squared deviations, so they are never meaningfully negative;
    #testing "> 0" also keeps sqrt() away from a tiny negative rounding residue.
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    


In [31]:
# One row per neighbour: the dictionary keys (user ids) arrive as the index
# and the coefficients as the single data column.
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .set_axis(['similarityIndex'], axis=1)
)
# Copy the user ids into a regular column, then switch to a plain 0..n-1 index.
pearsonDF['user_id'] = pearsonDF.index
pearsonDF = pearsonDF.reset_index(drop=True)
print(pearsonDF.head())


   similarityIndex  user_id
0         0.986241       36
1         0.000000      178
2         0.866025      194
3        -1.000000        6
4         0.000000       24


In [32]:
# Keep the 50 users with the highest similarity to the input user.
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
print(topUsers.head())

    similarityIndex  user_id
23              1.0      234
13              1.0      124
29              1.0      258
24              1.0      240
18              1.0      156


In [33]:
# Attach every rating made by each of the top users (inner join on user_id).
topUsersRating = pd.merge(topUsers, ratings_df, on='user_id', how='inner')
print(topUsersRating.head(100))

    similarityIndex  user_id  cellphone_id  rating
0               1.0      234            20       4
1               1.0      234            10       8
2               1.0      234            13       7
3               1.0      234             6       1
4               1.0      234            30       1
..              ...      ...           ...     ...
95              1.0      148             0      10
96              1.0      148             1      10
97              1.0      148            24       5
98              1.0      148             5       4
99              1.0      148             2      10

[100 rows x 4 columns]


In [34]:
# Weight each rating by how similar its author is to the input user.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
print(topUsersRating.head())

   similarityIndex  user_id  cellphone_id  rating  weightedRating
0              1.0      234            20       4             4.0
1              1.0      234            10       8             8.0
2              1.0      234            13       7             7.0
3              1.0      234             6       1             1.0
4              1.0      234            30       1             1.0


In [35]:
# Per cellphone_id, total the similarity indexes and the weighted ratings of the top users.
tempTopUsersRating = (
    topUsersRating
    .groupby('cellphone_id')[['similarityIndex', 'weightedRating']]
    .sum()
)
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
print(tempTopUsersRating.head())

              sum_similarityIndex  sum_weightedRating
cellphone_id                                         
0                        1.000000           10.000000
1                        7.852267           67.711677
2                        4.986241           45.862414
3                        1.866025           15.196152
4                        0.000000            0.000000


In [36]:
# Weighted average per phone: summed weighted ratings divided by summed similarity.
# The result keeps cellphone_id as its index.
recommendation_df = (
    tempTopUsersRating['sum_weightedRating']
    .div(tempTopUsersRating['sum_similarityIndex'])
    .to_frame('weighted average recommendation score')
)
# Expose the phone id as a regular column as well.
recommendation_df['cellphone_id'] = recommendation_df.index
print(recommendation_df.head(10))

              weighted average recommendation score  cellphone_id
cellphone_id                                                     
0                                         10.000000             0
1                                          8.623201             1
2                                          9.197793             2
3                                          8.143594             3
4                                               NaN             4
5                                          7.001969             5
6                                          5.006891             6
7                                          8.000000             7
8                                          8.000000             8
9                                          6.415128             9


In [37]:
# Rank the phones from the highest weighted-average score to the lowest.
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
print(recommendation_df)


              weighted average recommendation score  cellphone_id
cellphone_id                                                     
0                                         10.000000             0
2                                          9.197793             2
12                                         8.666667            12
1                                          8.623201             1
3                                          8.143594             3
7                                          8.000000             7
8                                          8.000000             8
21                                         8.000000            21
22                                         7.752305            22
25                                         7.500000            25
19                                         7.493611            19
23                                         7.333333            23
28                                         7.250000            28
5         

In [38]:
# Select the rating rows of phones that received a recommendation score,
# leaving out the phones the input user has already rated (we don't want to
# recommend the same phone again).
# NOTE(review): this yields raw rows of ratings_df (one per user/phone rating),
# not a ranked list of phones, and the computed scores are not carried over -
# confirm that this is the intended final result.
recommended_cellphone = ratings_df.loc[
    ratings_df['cellphone_id'].isin(recommendation_df['cellphone_id'])
    & ~ratings_df['cellphone_id'].isin(userSubset['cellphone_id'])
]

print(recommended_cellphone)

     user_id  cellphone_id  rating
0          0            30       1
1          0             5       3
2          0            10       9
3          0             9       3
4          0            23       2
..       ...           ...     ...
985      258            31       5
986      258            17       8
987      258            23       9
988      258            27       8
989      258            24       6

[877 rows x 3 columns]
