### Content-Based Filtering on MovieLens Dataset

In [1]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import torch.nn as nn
import matplotlib.pyplot as plt

# To print all outputs to console
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.9.4-py3-none-any.whl (827 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m827.8/827.8 KB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning-utilities>=0.6.0.post0
  Downloading lightning_utilities-0.7.1-py3-none-any.whl (18 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.3-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.6/518.6 KB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x8

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load datasets
movies = pd.read_csv('/content/drive/MyDrive/dataset/movie.csv')
ratings = pd.read_csv('/content/drive/MyDrive/small_dataset/rating.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,31,1,3.0,2015-02-23 23:18:07
1,31,110,5.0,2015-02-23 23:17:53
2,31,260,5.0,2015-02-23 23:17:13
3,31,364,3.0,2015-02-25 06:13:27
4,31,527,0.5,2015-02-23 23:19:58


In [4]:
movies_with_id = movies.set_index('movieId')
movies_with_id.index.name = None
movies_with_id.head()
print(f'Shape: {movies_with_id.shape}')

Unnamed: 0,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


Shape: (27278, 2)


In [5]:
# Load ratings for each movie, average of all user ratings for that movie
ratings_per_movie = pd.DataFrame({'rating': ratings.groupby('movieId')['rating'].mean()})
ratings_per_movie.index.name = None

content = movies_with_id.join(ratings_per_movie)
content.head()
print(f'Shape: {content.shape}')

Unnamed: 0,title,genres,rating
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.934426
2,Jumanji (1995),Adventure|Children|Fantasy,3.267199
3,Grumpier Old Men (1995),Comedy|Romance,3.005319
4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.571429
5,Father of the Bride Part II (1995),Comedy,2.823607


Shape: (27278, 3)


In [6]:
# Remove year from title
content['title'] = content.title.apply(lambda x: x.split('(')[0].strip())
content.head()

Unnamed: 0,title,genres,rating
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,3.934426
2,Jumanji,Adventure|Children|Fantasy,3.267199
3,Grumpier Old Men,Comedy|Romance,3.005319
4,Waiting to Exhale,Comedy|Drama|Romance,2.571429
5,Father of the Bride Part II,Comedy,2.823607


In [7]:
# Expand the genres into columns
genres = set()
for index, row in content.iterrows():
  for value in row.genres.split('|'):
    genres.add(value)

# In general, the values of the columns are set to 0 or 1 based on the genre list available
# But in this case, I have used the rating as the value so that a stronger linkage can be found
content[[list(genres)]] = 0
for index, row in content.iterrows():
  for column in genres:
    content.loc[index, column] = row.rating if column in row.genres else 0

content.head()

Unnamed: 0,title,genres,rating,Animation,Thriller,Sci-Fi,Comedy,Mystery,Fantasy,War,...,Adventure,IMAX,Documentary,Children,Horror,(no genres listed),Musical,Film-Noir,Drama,Western
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,3.934426,3.934426,0.0,0.0,3.934426,0.0,3.934426,0.0,...,3.934426,0.0,0.0,3.934426,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji,Adventure|Children|Fantasy,3.267199,0.0,0.0,0.0,0.0,0.0,3.267199,0.0,...,3.267199,0.0,0.0,3.267199,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men,Comedy|Romance,3.005319,0.0,0.0,0.0,3.005319,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale,Comedy|Drama|Romance,2.571429,0.0,0.0,0.0,2.571429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.571429,0.0
5,Father of the Bride Part II,Comedy,2.823607,0.0,0.0,0.0,2.823607,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
movie_genres = content.drop(columns=['title', 'genres', 'rating'])

movie_genres.head()

Unnamed: 0,Animation,Thriller,Sci-Fi,Comedy,Mystery,Fantasy,War,Action,Crime,Romance,Adventure,IMAX,Documentary,Children,Horror,(no genres listed),Musical,Film-Noir,Drama,Western
1,3.934426,0.0,0.0,3.934426,0.0,3.934426,0.0,0.0,0.0,0.0,3.934426,0.0,0.0,3.934426,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,3.267199,0.0,0.0,0.0,0.0,3.267199,0.0,0.0,3.267199,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.005319,0.0,0.0,0.0,0.0,0.0,3.005319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,2.571429,0.0,0.0,0.0,0.0,0.0,2.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.571429,0.0
5,0.0,0.0,0.0,2.823607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# I have built Content-based filtering as a purely mathematical model using similarity of its genres
def get_recommendations(movie_title, n=20):
  movie_id = content[content['title'] == movie_title].index[0]
  movie_of_interest = movie_genres.loc[movie_id]

  result = movie_genres.dot(movie_of_interest)

  recommendations_index = result.sort_values(ascending=False)[:n].index
  recommendations = content.loc[recommendations_index]
  return recommendations

In [20]:
# Getting results/recommendations for `Toy Story`
result = get_recommendations('Toy Story')
print(result['title'])

80158                      Cartoon All-Stars to the Rescue
131248                                      Brother Bear 2
78499                                          Toy Story 3
1                                                Toy Story
26340                         Twelve Tasks of Asterix, The
4886                                        Monsters, Inc.
3114                                           Toy Story 2
108932                                      The Lego Movie
4306                                                 Shrek
4016                             Emperor's New Groove, The
2987                              Who Framed Roger Rabbit?
56152                                            Enchanted
114552                                      Boxtrolls, The
114240                                             Aladdin
33463     DuckTales: The Movie - Treasure of the Lost Lamp
115875                  Toy Story Toons: Hawaiian Vacation
91355                              Asterix and the Vikin