User Based Collaborative Filtering

In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [2]:
rating = pd.read_csv('ratings.csv')
movie = pd.read_csv('movies.csv')

In [3]:
# Merging movie and rating matrix based on movieID
movies_ratings = movie.merge(rating, on='movieId', how='left')
print(movies_ratings.head(3))

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating     timestamp  
0     1.0     4.0  9.649827e+08  
1     5.0     4.0  8.474350e+08  
2     7.0     4.5  1.106636e+09  


In [4]:
# Making user item rating matrix
user_item_ratings = movies_ratings.pivot_table(index='userId', columns=['title'], values='rating')
user_item_ratings

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,4.0,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,,,,,,,,,,,...,,,,,,,,,,
607.0,,,,,,,,,,,...,,,,,,,,,,
608.0,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609.0,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Counting no of null ratings
total_ratings=0
ratings_given=0
for (row, col), value in np.ndenumerate(user_item_ratings):
  total_ratings=total_ratings+1
  if value>0.0:
    ratings_given=ratings_given+1

print("Total Ratings ", total_ratings)
print("Total Ratings given by users ",ratings_given)
print("Total Ratings not given by users ",total_ratings-ratings_given)

Total Ratings  5928590
Total Ratings given by users  100832
Total Ratings not given by users  5827758


In [6]:
# Filtering the user-item rating matrix
# Find the top 100 users who have rated the most movies
user_ratings_counts = user_item_ratings.sum(axis=1)
top_100_users = user_ratings_counts.sort_values(ascending=False).head(100)

# Calculate the number of ratings each movie has received from the top 100 users
movie_ratings_counts = user_item_ratings.loc[top_100_users.index].sum()

# Sort the movies based on the number of ratings received and keep the top 1000 movies
top_1000_movies = movie_ratings_counts.sort_values(ascending=False).head(1000).index

# Filter the user-item matrix to keep only the top 100 users' ratings and the selected 1000 movies
filtered_user_item_ratings = user_item_ratings.loc[top_100_users.index, top_1000_movies]

# Now, filtered_user_item_ratings contains the ratings of the top 100 users for the top 1000 movies.
filtered_user_item_ratings

title,Forrest Gump (1994),"Matrix, The (1999)",Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Fight Club (1999),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),Star Wars: Episode VI - Return of the Jedi (1983),...,"Good Night, and Good Luck. (2005)",Strangers on a Train (1951),Modern Times (1936),American Splendor (2003),Whiplash (2014),Patch Adams (1998),Bulworth (1998),"Prince of Egypt, The (1998)",Hulk (2003),Freaky Friday (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
414.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,...,4.0,,,4.5,4.0,,3.0,3.0,4.0,3.5
474.0,3.0,4.5,4.0,4.0,5.0,4.0,5.0,4.5,4.0,4.0,...,4.0,4.5,4.5,4.5,,1.0,,5.0,2.5,
599.0,3.5,5.0,5.0,5.0,5.0,5.0,4.0,3.0,3.5,5.0,...,,4.0,,2.5,3.0,,4.0,,3.0,2.5
448.0,3.0,2.0,5.0,5.0,5.0,4.0,,5.0,5.0,5.0,...,,,,,4.5,,,,1.0,
610.0,3.0,5.0,5.0,5.0,5.0,5.0,3.0,4.5,5.0,5.0,...,,,,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332.0,4.5,5.0,4.0,,4.5,4.5,4.5,4.5,,3.5,...,,,,,,,,,,
104.0,4.0,,,3.0,,,,,,,...,,,,,,,,,,4.0
325.0,2.0,,5.0,,4.0,,5.0,5.0,,,...,,5.0,,,,,,,,
282.0,4.5,5.0,4.0,4.0,,4.5,4.5,4.0,,,...,,,,,,,,,,


In [7]:
total_ratings=0
ratings_given=0
for (row, col), value in np.ndenumerate(filtered_user_item_ratings):
  total_ratings=total_ratings+1
  if value>0.0:
    ratings_given=ratings_given+1

print("Total Ratings ", total_ratings)
print("Total Ratings given by users ",ratings_given)
print("Total Ratings not given by users ",total_ratings-ratings_given)

Total Ratings  100000
Total Ratings given by users  31878
Total Ratings not given by users  68122


In [8]:
user_item_ratings = filtered_user_item_ratings
user_item_ratings.fillna(0, inplace=True)
user_item_ratings

title,Forrest Gump (1994),"Matrix, The (1999)",Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Fight Club (1999),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),Star Wars: Episode VI - Return of the Jedi (1983),...,"Good Night, and Good Luck. (2005)",Strangers on a Train (1951),Modern Times (1936),American Splendor (2003),Whiplash (2014),Patch Adams (1998),Bulworth (1998),"Prince of Egypt, The (1998)",Hulk (2003),Freaky Friday (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
414.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,...,4.0,0.0,0.0,4.5,4.0,0.0,3.0,3.0,4.0,3.5
474.0,3.0,4.5,4.0,4.0,5.0,4.0,5.0,4.5,4.0,4.0,...,4.0,4.5,4.5,4.5,0.0,1.0,0.0,5.0,2.5,0.0
599.0,3.5,5.0,5.0,5.0,5.0,5.0,4.0,3.0,3.5,5.0,...,0.0,4.0,0.0,2.5,3.0,0.0,4.0,0.0,3.0,2.5
448.0,3.0,2.0,5.0,5.0,5.0,4.0,0.0,5.0,5.0,5.0,...,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,1.0,0.0
610.0,3.0,5.0,5.0,5.0,5.0,5.0,3.0,4.5,5.0,5.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332.0,4.5,5.0,4.0,0.0,4.5,4.5,4.5,4.5,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
325.0,2.0,0.0,5.0,0.0,4.0,0.0,5.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282.0,4.5,5.0,4.0,4.0,0.0,4.5,4.5,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
user_item_ratings.reset_index(drop=True, inplace=True)
user_item_ratings

title,Forrest Gump (1994),"Matrix, The (1999)",Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Fight Club (1999),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),Star Wars: Episode VI - Return of the Jedi (1983),...,"Good Night, and Good Luck. (2005)",Strangers on a Train (1951),Modern Times (1936),American Splendor (2003),Whiplash (2014),Patch Adams (1998),Bulworth (1998),"Prince of Egypt, The (1998)",Hulk (2003),Freaky Friday (2003)
0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,...,4.0,0.0,0.0,4.5,4.0,0.0,3.0,3.0,4.0,3.5
1,3.0,4.5,4.0,4.0,5.0,4.0,5.0,4.5,4.0,4.0,...,4.0,4.5,4.5,4.5,0.0,1.0,0.0,5.0,2.5,0.0
2,3.5,5.0,5.0,5.0,5.0,5.0,4.0,3.0,3.5,5.0,...,0.0,4.0,0.0,2.5,3.0,0.0,4.0,0.0,3.0,2.5
3,3.0,2.0,5.0,5.0,5.0,4.0,0.0,5.0,5.0,5.0,...,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,1.0,0.0
4,3.0,5.0,5.0,5.0,5.0,5.0,3.0,4.5,5.0,5.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.5,5.0,4.0,0.0,4.5,4.5,4.5,4.5,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
97,2.0,0.0,5.0,0.0,4.0,0.0,5.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,4.5,5.0,4.0,4.0,0.0,4.5,4.5,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
user_item_ratings_test = user_item_ratings.copy()
user_item_ratings_test.iloc[:, :] = 0

# Finding non-zero indices (ratings) to move to the testing dataset
non_zero_indices = np.argwhere(user_item_ratings.values)
shuffled_indices = np.random.permutation(non_zero_indices)
num_ratings_to_move = int(0.2 * len(shuffled_indices))  # 20% of the non-zero ratings for testing

for i in range(num_ratings_to_move):
  row, col = shuffled_indices[i]
  user_item_ratings_test.iat[row, col] = user_item_ratings.iat[row, col]
  user_item_ratings.iat[row, col] = 0  # Move the rating to the test set

In [11]:
# Calculating user-user similarity matrix
user_similarity = cosine_similarity(user_item_ratings)
print(user_similarity)

[[1.         0.60240906 0.66213946 ... 0.28746798 0.40491381 0.33447433]
 [0.60240906 1.         0.54881801 ... 0.29834517 0.37682734 0.2702473 ]
 [0.66213946 0.54881801 1.         ... 0.27292706 0.39242332 0.27694636]
 ...
 [0.28746798 0.29834517 0.27292706 ... 1.         0.20868903 0.07340986]
 [0.40491381 0.37682734 0.39242332 ... 0.20868903 1.         0.22791522]
 [0.33447433 0.2702473  0.27694636 ... 0.07340986 0.22791522 1.        ]]


In [22]:
user_item_ratings

title,Forrest Gump (1994),"Matrix, The (1999)",Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Fight Club (1999),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),Star Wars: Episode VI - Return of the Jedi (1983),...,"Good Night, and Good Luck. (2005)",Strangers on a Train (1951),Modern Times (1936),American Splendor (2003),Whiplash (2014),Patch Adams (1998),Bulworth (1998),"Prince of Egypt, The (1998)",Hulk (2003),Freaky Friday (2003)
0,5.0,5.0,5.0,0.0,5.0,5.0,5.0,4.0,5.0,5.0,...,4.0,0.0,0.0,0.0,4.0,0.0,3.0,3.0,4.0,3.5
1,3.0,4.5,0.0,4.0,5.0,4.0,5.0,4.5,0.0,4.0,...,0.0,4.5,4.5,4.5,0.0,1.0,0.0,5.0,2.5,0.0
2,0.0,5.0,5.0,5.0,0.0,5.0,0.0,0.0,3.5,0.0,...,0.0,4.0,0.0,2.5,3.0,0.0,4.0,0.0,3.0,2.5
3,3.0,0.0,5.0,5.0,5.0,4.0,0.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,1.0,0.0
4,3.0,5.0,5.0,5.0,5.0,5.0,3.0,4.5,5.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.5,5.0,4.0,0.0,4.5,4.5,4.5,4.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
97,2.0,0.0,5.0,0.0,4.0,0.0,5.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,4.5,0.0,4.0,4.0,0.0,4.5,4.5,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Set the number of neighbors (K)
K = 10

# Create an empty array to store the predicted ratings
predicted_user_item_matrix = user_item_ratings.copy()

for target_user_index in range(user_item_ratings.shape[0]):
    similar_users = np.argsort(user_similarity[target_user_index])[::-1][1:]
    top_k_similar_users = similar_users[:K]

    for item in range(user_item_ratings.shape[1]):
        if user_item_ratings.iat[target_user_index, item] == 0.0:
            # Calculate the predicted rating based on the K-nearest neighbors
            weighted_sum = 0
            similarity_sum = 0
            for neighbor in top_k_similar_users:
                if user_item_ratings.iat[neighbor, item] > 0:
                    similarity = user_similarity[target_user_index, neighbor]
                    weighted_sum += similarity * user_item_ratings.iat[neighbor, item]
                    similarity_sum += similarity
            if similarity_sum > 0:
                predicted_user_item_matrix.iat[target_user_index, item] = weighted_sum / similarity_sum

In [29]:
predicted_user_item_matrix

title,Forrest Gump (1994),"Matrix, The (1999)",Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Fight Club (1999),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),Star Wars: Episode VI - Return of the Jedi (1983),...,"Good Night, and Good Luck. (2005)",Strangers on a Train (1951),Modern Times (1936),American Splendor (2003),Whiplash (2014),Patch Adams (1998),Bulworth (1998),"Prince of Egypt, The (1998)",Hulk (2003),Freaky Friday (2003)
0,5.000000,5.000000,5.000000,4.447006,5.000000,5.000000,5.000000,4.000000,5.000000,5.000000,...,4.000000,4.238191,4.500000,3.857658,4.000000,1.987947,3.000000,3.000000,4.000000,3.500000
1,3.000000,4.500000,4.774988,4.000000,5.000000,4.000000,5.000000,4.500000,4.298962,4.000000,...,3.907946,4.500000,4.500000,4.500000,3.523276,1.000000,3.133540,5.000000,2.500000,3.345360
2,4.018830,5.000000,5.000000,5.000000,4.684813,5.000000,4.298997,4.418321,3.500000,4.297510,...,3.772589,4.000000,4.500000,2.500000,3.000000,2.645994,4.000000,4.246592,3.000000,2.500000
3,3.000000,4.517666,5.000000,5.000000,5.000000,4.000000,4.300769,4.488312,5.000000,5.000000,...,3.842487,4.000000,0.000000,3.692264,4.500000,3.000000,3.349364,3.933559,1.000000,2.902279
4,3.000000,5.000000,5.000000,5.000000,5.000000,5.000000,3.000000,4.500000,5.000000,4.702793,...,4.000000,4.000000,4.238587,4.000000,4.074612,3.000000,3.495522,3.970928,2.761337,2.715948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.500000,5.000000,4.000000,4.332087,4.500000,4.500000,4.500000,4.500000,4.564157,4.515817,...,3.836710,2.252073,4.000000,3.760736,4.124351,3.000000,3.112789,3.000000,3.032230,3.113105
96,3.942517,3.704076,4.503739,3.000000,3.801085,4.493046,4.635587,3.471298,3.891178,3.547726,...,0.000000,4.500000,4.500000,4.500000,0.000000,2.763154,0.000000,3.570636,2.500000,4.000000
97,2.000000,4.645534,5.000000,3.997535,4.000000,4.495793,5.000000,5.000000,4.335229,3.657117,...,4.000000,5.000000,4.309638,3.984300,4.000000,1.344767,3.703606,4.018568,3.236074,3.500000
98,4.500000,4.631632,4.000000,4.000000,4.863138,4.500000,4.500000,4.000000,4.139558,4.586213,...,3.748178,4.162270,4.248076,3.660861,3.829588,2.019245,3.492167,3.964072,2.578270,2.675874


In [34]:
# Recommending top 10 movies to a target user
target_user_index = 50
predicted_ratings = predicted_user_item_matrix.loc[target_user_index]

# Find the indices of the top 10 recommended items
top_10_indices = np.argsort(predicted_ratings)[::-1][:10]

# Get the corresponding movie titles for the top 10 items
top_10_movie_titles = user_item_ratings.columns[top_10_indices]
print("Top 10 Recommended Movies:")
for i in top_10_movie_titles:
  print(i)

Top 10 Recommended Movies:
Scarface (1983)
Brazil (1985)
Godfather: Part II, The (1974)
Third Man, The (1949)
City Slickers (1991)
Cruel Intentions (1999)
Pretty Woman (1990)
Once Upon a Time in America (1984)
Breakfast at Tiffany's (1961)
Dead Poets Society (1989)


In [35]:
# Calculate the absolute errors
absolute_errors = np.abs(predicted_user_item_matrix - user_item_ratings_test)

# Calculate the mean of absolute errors to get MAE
mae = np.mean(absolute_errors.values)

print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.194277861218078


In [37]:
# Calculate the errors
errors = predicted_user_item_matrix - user_item_ratings_test

# Square the errors
squared_errors = errors ** 2

# Calculate the mean squared errors
mse = np.mean(squared_errors.values)

# Calculate RMSE by taking the square root of MSE
rmse = np.sqrt(mse)

print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 3.413149143710096
