<a href="https://colab.research.google.com/github/MusabUmama/Movie_Recommendation_system/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Movie Filtering

**The Dataset**

There are 4 datasets including movies, ratings, tags and links.


* Movies Dataset: This dataset contains information about movies, including movie
IDs, titles, and genres.

* Ratings Dataset: This dataset contains user ratings for movies, including user IDs, movie IDs, ratings, and timestamps.

* Tags Dataset: This dataset contains user-generated tags for movies, including user IDs, movie IDs, tags, and timestamps.

* Links Dataset: This dataset contains links between movie IDs in the dataset and external databases (IMDb and TMDB).

# **User-based Movie Filtering**

* The system makes recommendations based on the similarity between users.



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Importing the datasets
movies_data = pd.read_csv("/content/movies.csv")
ratings_data = pd.read_csv("/content/ratings.csv")
tags_data = pd.read_csv("/content/tags.csv")

In [4]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Deleting the null rows
movies_data.dropna(inplace=True)
ratings_data.dropna(inplace=True)
tags_data.dropna(inplace=True)

In [12]:
movies_data.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [13]:
ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [14]:
tags_data.dtypes

userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object

In [15]:
# Merging the Ratings and Tags datasets based on 'movieId' and 'userId'
user_interactions_data = pd.merge(ratings_data, tags_data, on=['userId', 'movieId'], how='outer')

# Merging the user_interactions dataframe with the Movies dataset based on 'movieId' to create merged data frame
merged_data = pd.merge(user_interactions_data, movies_data, on='movieId', how='left')

In [16]:
merged_data.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,1,1,4.0,964982703.0,,,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247.0,,,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224.0,,,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815.0,,,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931.0,,,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


The merged data frame has been created using relevant features from all the datasets.


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
# Creating a user-item interaction matrix
interaction_matrix = pd.pivot_table(merged_data, values='rating', index='userId', columns='movieId', fill_value=0)

In [40]:
# Calculating the user similarity scores using cosine similarity
user_similarity_scores = cosine_similarity(interaction_matrix)

In [41]:
# Creating a data frame to store user similarity scores
user_similarity_data = pd.DataFrame(user_similarity_scores, index=interaction_matrix.index, columns=interaction_matrix.index)

# Replacing the diagonal values with zeros (self-similarity scores)
user_similarity_data.values[[range(user_similarity_data.shape[0])]*2] = 0

In [42]:
print("User-Item Interaction Matrix:")
interaction_matrix.head()

User-Item Interaction Matrix:


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [43]:
print("User Similarity Scores:")
user_similarity_data.head()

User Similarity Scores:


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Choosing a target user
target_user_id = 13

# Geting the similarity scores for the target user
similar_users = user_similarity_data.loc[target_user_id].sort_values(ascending=False)

In [45]:
# Selecting the top-k most similar users
k = 10
top_similar_users = similar_users.iloc[1:k+1]

In [46]:
top_similar_users

userId
410    0.0
403    0.0
404    0.0
405    0.0
406    0.0
407    0.0
408    0.0
409    0.0
411    0.0
458    0.0
Name: 13, dtype: float64

In [47]:
# Geting the movie ratings for the top-k similar users
similar_users_ratings = interaction_matrix.loc[top_similar_users.index]

In [48]:
similar_users_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,4.5,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
411,5.0,4.0,0.0,2.0,0.0,0.0,3.0,0,0.0,3.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
458,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0,0.0,4.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
