<a href="https://colab.research.google.com/github/MusabUmama/Movie_Recommendation_system/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Movie Filtering

**The Dataset**

There are 4 datasets including movies, ratings, tags and links.


* Movies Dataset: This dataset contains information about movies, including movie
IDs, titles, and genres.

* Ratings Dataset: This dataset contains user ratings for movies, including user IDs, movie IDs, ratings, and timestamps.

* Tags Dataset: This dataset contains user-generated tags for movies, including user IDs, movie IDs, tags, and timestamps.

* Links Dataset: This dataset contains links between movie IDs in the dataset and external databases (IMDb and TMDB).

# **User-based Movie Filtering**

* The system makes recommendations based on similar users



In [63]:
import pandas as pd

In [64]:
# Importing the datasets
movies_data = pd.read_csv("/content/movies.csv")
ratings_data = pd.read_csv("/content/ratings.csv")
tags_data = pd.read_csv("/content/tags.csv")

In [65]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [66]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [67]:
# Deleting the null rows
movies_data.dropna(inplace=True)
ratings_data.dropna(inplace=True)
tags_data.dropna(inplace=True)

# Deleting duplicate rows
movies_data.drop_duplicates(subset='movieId', keep='first')

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [68]:
movies_data.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [69]:
ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [70]:
tags_data.dtypes

userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object

In [71]:
# Merging the Ratings and Tags datasets based on 'movieId' and 'userId'
user_interactions_data = pd.merge(ratings_data, tags_data, on=['userId', 'movieId'], how='outer')

# Merging the user_interactions dataframe with the Movies dataset based on 'movieId' to create merged data frame
merged_data = pd.merge(user_interactions_data, movies_data, on='movieId', how='left')

In [72]:
merged_data.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,1,1,4.0,964982703.0,,,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247.0,,,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224.0,,,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815.0,,,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931.0,,,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


The merged data frame has been created using relevant features from all the datasets.


In [73]:
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
# Creating a user-item interaction matrix
interaction_matrix = pd.pivot_table(merged_data, values='rating', index='userId', columns='movieId', fill_value=0)

In [75]:
# Calculating the user similarity scores using cosine similarity
user_similarity_scores = cosine_similarity(interaction_matrix)

In [76]:
# Creating a data frame to store user similarity scores
user_similarity_data = pd.DataFrame(user_similarity_scores, index=interaction_matrix.index, columns=interaction_matrix.index)

# Replacing the diagonal values with zeros (self-similarity scores)
user_similarity_data.values[np.arange(user_similarity_data.shape[0]), np.arange(user_similarity_data.shape[0])] = 0

In [77]:
interaction_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [78]:
user_similarity_data.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,0.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,0.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,0.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,0.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [79]:
import numpy as np

In [80]:
# Choosing a target user
target_user_id = 300

The user with ID no. 300 has been choosen as the target user

In [81]:
# Finding the most similar users to the target user in descending order
similar_users = user_similarity_data[target_user_id].sort_values(ascending=False)

In [82]:
similar_users

userId
497    0.496362
581    0.442553
233    0.380322
443    0.378925
296    0.365242
         ...   
456    0.000000
388    0.000000
213    0.000000
481    0.000000
92     0.000000
Name: 300, Length: 610, dtype: float64

In [83]:
# Geting the movies that the user has not rated
rated_movies = interaction_matrix.loc[target_user_id]
unrated_movies = rated_movies[rated_movies == 0].index

In [84]:
# Calculating the weighted sum of ratings by similar users for unrated movies
unrated_movie_scores = interaction_matrix.loc[similar_users.index, unrated_movies].T.dot(similar_users)

In [85]:
# Sorting the recommended movies by their scores in descending order
recommended_movies_id = unrated_movie_scores.sort_values(ascending=False)

In [86]:
recommended_movies_id

movieId
296      167.124787
260      118.579536
4993     117.338409
50       116.769371
7153     114.773592
            ...    
1116       0.000000
3899       0.000000
103        0.000000
75446      0.000000
3567       0.000000
Length: 9692, dtype: float64

In [87]:
recommended_movie_titles = merged_data[merged_data['movieId'].isin(recommended_movies_id.index)]['title']

In [88]:
recommended_movie_titles

0                                          Toy Story (1995)
1                                   Grumpier Old Men (1995)
2                                               Heat (1995)
3                               Seven (a.k.a. Se7en) (1995)
4                                Usual Suspects, The (1995)
                                ...                        
102878                                    Score, The (2001)
102880                                     Daredevil (2003)
102881                                     Daredevil (2003)
102882    Mary Shelley's Frankenstein (Frankenstein) (1994)
102883                               Shame (Skammen) (1968)
Name: title, Length: 98835, dtype: object

Some movies are recommended multiple times in the list due to multiple similar users rating those movies highly.

In [89]:
# List to store unique movies
unique_movies = []

In [90]:
# Adding the movies to unique list
for movie_id in recommended_movie_titles:
    if movie_id not in unique_movies:
        unique_movies.append(movie_id)

In [91]:
# Top 10 movies recommended
print("Top 10 recommendations:\n")
for movie in range(10):
  print(unique_movies[movie])

Top 10 recommendations:

Toy Story (1995)
Grumpier Old Men (1995)
Heat (1995)
Seven (a.k.a. Se7en) (1995)
Usual Suspects, The (1995)
From Dusk Till Dawn (1996)
Bottle Rocket (1996)
Braveheart (1995)
Rob Roy (1995)
Canadian Bacon (1995)


# **Item-based movie filtering**

* Item-Based Collaborative Filtering recommends items to users based on the similarity between items rather than users.


All the datasets and libraries imported in user based filtering is being used here.

In [92]:
# Merging the Movies and Ratings datasets based on movieId
movie_ratings_data = pd.merge(ratings_data, movies_data, on='movieId')

In [93]:
movie_ratings_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


The data frame movie_ratings_data has been created by merging movies and ratings datasets.

In [94]:
movie_ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

In [95]:
# creating the item-item similarity matrix
item_similarity_matrix = movie_ratings_data.pivot_table(index='movieId', columns='userId', values='rating', fill_value=0)

In [96]:
item_similarity_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,0.0,0.0,0,4,0,4.5,0,0,0.0,...,4.0,0,4,3,4.0,2.5,4,2.5,3,5.0
2,0,0.0,0.0,0,0,4,0.0,4,0,0.0,...,0.0,4,0,5,3.5,0.0,0,2.0,0,0.0
3,4,0.0,0.0,0,0,5,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,2.0,0,0.0
4,0,0.0,0.0,0,0,3,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,0.0,0,0.0
5,0,0.0,0.0,0,0,5,0.0,0,0,0.0,...,0.0,0,0,3,0.0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0.0,0.0,0,0,0,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,0.0,0,0.0
193583,0,0.0,0.0,0,0,0,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,0.0,0,0.0
193585,0,0.0,0.0,0,0,0,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,0.0,0,0.0
193587,0,0.0,0.0,0,0,0,0.0,0,0,0.0,...,0.0,0,0,0,0.0,0.0,0,0.0,0,0.0


In [97]:
# Calculating the item-item similarity using cosine similarity
item_similarity = cosine_similarity(item_similarity_matrix.T)

In [98]:
# Creating a data frame to store item-item similarity scores
item_similarity_data = pd.DataFrame(item_similarity, index=item_similarity_matrix.columns, columns=item_similarity_matrix.columns)

# Replacing the diagonal values with zeros (self-similarity scores)
item_similarity_data.values[np.arange(item_similarity_data.shape[0]), np.arange(item_similarity_data.shape[0])] = 0

In [99]:
item_similarity_data

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,0.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.059720,0.000000,0.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.194395,0.003726,0.002251,0.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.129080,0.016614,0.005020,0.128659,0.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,0.000000,0.153063,0.262558,0.069622,0.201104
607,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,0.000000,0.283081,0.149190,0.139114
608,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,0.000000,0.121993,0.322055
609,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,0.000000,0.053225


In [100]:
# The target user id
second_user_id = 22