In [1]:
import pandas as pd
from math import sqrt
import numpy as np


In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [3]:
userInput = [
    {'title': 'Die Hard (1988)', 'rating': 4},
    {'title': 'Bridget Jones\'s Diary (2001)', 'rating': 3},
    {'title': 'Spotlight (2015)', 'rating': 5},
    {'title': 'Frozen (2013)', 'rating': 4},
    {'title': 'The Matrix (1999)', 'rating': 5}
]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                          title  rating
0               Die Hard (1988)       4
1  Bridget Jones's Diary (2001)       3
2              Spotlight (2015)       5
3                 Frozen (2013)       4
4             The Matrix (1999)       5


In [4]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)


   movieId                         title  rating
0     1036               Die Hard (1988)       4
1     4246  Bridget Jones's Diary (2001)       3
2   106696                 Frozen (2013)       4
3   142488              Spotlight (2015)       5


In [5]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())


         userId  rating  timestamp
movieId                           
1036        145     145        145
4246         65      65         65
106696       29      29         29
142488       19      19         19


In [6]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    return len(x[1])

userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((249,),        userId  movieId  rating   timestamp
36440     249     1036     4.0  1346757908
36643     249     4246     3.5  1415415300
37282     249   106696     3.5  1396222978
37371     249   142488     4.5  1516415602), ((305,),        userId  movieId  rating   timestamp
46005     305     1036     4.0  1460135744
46162     305     4246     4.5  1460303017
46526     305   106696     4.0  1460303575
46595     305   142488     3.0  1460134123), ((525,),        userId  movieId  rating   timestamp
82674     525     1036     4.5  1476475914
82798     525     4246     3.5  1476479382
83064     525   106696     3.5  1476476140
83099     525   142488     3.5  1476476169), ((63,),       userId  movieId  rating   timestamp
9200      63     1036     4.0  1443199792
9290      63     4246     3.0  1443290259
9418      63   106696     5.0  1443461176), ((68,),        userId  movieId  rating   timestamp
10528      68     1036     3.0  1158531885
10978      68     4246     4.0  1158532209
11613 

In [7]:
# Initialize a dictionary to store Pearson Correlation coefficients
pearson_corr_dict = {}

for user_id, ratings in userSubsetGroup:

    sorted_user_ratings = ratings.sort_values(by='movieId')
    sorted_input_ratings = inputMovies.sort_values(by='movieId')

    common_count = len(sorted_user_ratings)

    common_ratings = inputMovies[inputMovies['movieId'].isin(sorted_user_ratings['movieId'].tolist())]
    user_ratings_list = common_ratings['rating'].tolist()
    input_ratings_list = sorted_user_ratings['rating'].tolist()

    sum_user_sq = sum([i**2 for i in user_ratings_list]) - pow(sum(user_ratings_list), 2) / common_count
    sum_input_sq = sum([i**2 for i in input_ratings_list]) - pow(sum(input_ratings_list), 2) / common_count
    sum_product = sum(i * j for i, j in zip(user_ratings_list, input_ratings_list)) - sum(user_ratings_list) * sum(input_ratings_list) / common_count

    # Calculate the Pearson correlation coefficient
    if sum_user_sq != 0 and sum_input_sq != 0:
        pearson_corr_dict[user_id] = sum_product / (sqrt(sum_user_sq) * sqrt(sum_input_sq))
    else:
        pearson_corr_dict[user_id] = 0


In [8]:
pearsonDF = pd.DataFrame.from_dict(pearson_corr_dict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.852803  (249,)
1        -0.973329  (305,)
2         0.000000  (525,)
3         0.866025   (63,)
4         0.327327   (68,)


In [9]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())
print(topUsers['userId'].dtype)
print(topUsers['userId'].apply(type).value_counts())
topUsers['userId'] = topUsers['userId'].apply(lambda x: x[0])
topUsers['userId'] = topUsers['userId'].astype('int64')
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')



    similarityIndex  userId
32              1.0  (462,)
40              1.0  (561,)
42              1.0  (590,)
16              1.0   (82,)
30              1.0  (418,)
object
userId
<class 'tuple'>    50
Name: count, dtype: int64


In [10]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     462        1     1.5  1154037653             1.5
1              1.0     462       10     3.0  1269929071             3.0
2              1.0     462       16     3.5  1123893685             3.5
3              1.0     462       21     4.0  1121923492             4.0
4              1.0     462       25     3.0  1154037817             3.0


In [11]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df_sorted = recommendation_df[recommendation_df['weighted average recommendation score'] > 4.9]


In [12]:
recommended_movie = movies_df.loc[movies_df['movieId'].isin(recommendation_df_sorted['movieId'])]
recommended_movie = recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]
print(recommended_movie)

      movieId                             title                  genres
251       290         Once Were Warriors (1994)             Crime|Drama
281       322       Swimming with Sharks (1995)            Comedy|Drama
361       417                  Barcelona (1994)          Comedy|Romance
515       599            Wild Bunch, The (1969)       Adventure|Western
681       899        Singin' in the Rain (1952)  Comedy|Musical|Romance
...       ...                               ...                     ...
9443   167064        I Am Not Your Negro (2017)             Documentary
9466   168326               The Big Sick (2017)          Comedy|Romance
9497   170705           Band of Brothers (2001)        Action|Drama|War
9709   187593                 Deadpool 2 (2018)    Action|Comedy|Sci-Fi
9711   187717  Won't You Be My Neighbor? (2018)             Documentary

[85 rows x 3 columns]
