# Content-Based Filtering



## Imports and Libraries

In [96]:
import random
import json
import os
import csv
import ast

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

random.seed(42)
np.random.seed(42)

## Section 1: Load the Movielens data, do the needed preprocessing and merging, and calculate the movie-by-movie score matrix

### Load the movies and merge them with their IMDB links

In [97]:
# Load the movies
raw_movies = pd.read_csv("../../data/movies.csv")
movie_id_links = pd.read_csv("../../data/links.csv")

movies_merged = pd.merge(raw_movies, movie_id_links, on="movieId")
movies_merged["tmdbId"] = movies_merged["tmdbId"].fillna(0).astype(int)
movies_merged["genres"] = movies_merged["genres"].str.lower()
print(f"Total Movies: {len(movies_merged)}")

movies_merged.head()

Total Movies: 9742


Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy,114709,862
1,2,Jumanji (1995),adventure|children|fantasy,113497,8844
2,3,Grumpier Old Men (1995),comedy|romance,113228,15602
3,4,Waiting to Exhale (1995),comedy|drama|romance,114885,31357
4,5,Father of the Bride Part II (1995),comedy,113041,11862


In [98]:
# Load the tags provided 
tags_raw = pd.read_csv("../../data/tags.csv").drop(columns=["timestamp", "userId"])

grouped_tags = tags_raw.groupby("movieId")

movie_tags = []
for movieID, group in grouped_tags:
    keyword_list = list(set(group["tag"]))
    keywords = ("|".join(keyword_list)).lower()
    movie_tags.append((movieID, keywords))

tags = pd.DataFrame(movie_tags, columns=["movieId", "tags"])

movies_merged_tags = pd.merge(movies_merged, tags, on="movieId", how="left")
movies_merged_tags["tags"] = movies_merged_tags["tags"].fillna("")

print(f"Total Movies: {len(movies_merged_tags)}")
movies_merged_tags.head()

Total Movies: 9742


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tags
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy,114709,862,pixar|fun
1,2,Jumanji (1995),adventure|children|fantasy,113497,8844,fantasy|robin williams|magic board game|game
2,3,Grumpier Old Men (1995),comedy|romance,113228,15602,moldy|old
3,4,Waiting to Exhale (1995),comedy|drama|romance,114885,31357,
4,5,Father of the Bride Part II (1995),comedy,113041,11862,pregnancy|remake


### Load the movie metadata to have an overview for each movie

Movie metadata scrapped from https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/data 

In [99]:
#unzip the file available
cast = pd.read_csv("../../data/credits.csv")

cast.rename(columns={"cast": "cast-raw"}, inplace=True)
cast.rename(columns={"crew": "crew-raw"}, inplace=True)
cast.rename(columns={"id": "tmdbId"}, inplace=True)

cast["cast-raw"].fillna("[]", inplace=True)
cast["crew-raw"].fillna("[]", inplace=True)

cast["cast-raw"] = cast["cast-raw"].apply(ast.literal_eval)
cast["crew-raw"] = cast["crew-raw"].apply(ast.literal_eval)

cast.dropna(inplace=True)
cast.drop_duplicates(subset=['tmdbId'], inplace=True)

for i, row in tqdm(cast.iterrows()):
    cast_list = row["cast-raw"]
    actor_names = [item["name"].lower() for item in cast_list]
    cast.loc[i, "cast"] = "|".join(actor_names)
    
    crew_list = row["crew-raw"]
    crew_names = [item["name"].lower() for item in crew_list]
    cast.loc[i, "crew"] = "|".join(crew_names)
    
cast.drop(columns=["cast-raw", "crew-raw"], inplace=True)

print(cast["tmdbId"].nunique())

cast.head()


45432it [00:03, 12599.43it/s]

45432





Unnamed: 0,tmdbId,cast,crew
0,862,tom hanks|tim allen|don rickles|jim varney|wal...,john lasseter|joss whedon|andrew stanton|joel ...
1,8844,robin williams|jonathan hyde|kirsten dunst|bra...,larry j. franco|jonathan hensleigh|james horne...
2,15602,walter matthau|jack lemmon|ann-margret|sophia ...,howard deutch|mark steven johnson|mark steven ...
3,31357,whitney houston|angela bassett|loretta devine|...,forest whitaker|ronald bass|ronald bass|ezra s...
4,11862,steve martin|diane keaton|martin short|kimberl...,alan silvestri|elliot davis|nancy meyers|nancy...


In [100]:
metadata = pd.read_csv("../../data/movies_metadata.csv", low_memory=False)[
    ["imdb_id", "overview"]
]

metadata.rename(columns={"imdb_id": "imdbId"}, inplace=True)

metadata["overview"] = metadata["overview"].fillna("")

metadata.dropna(inplace=True)
metadata.drop_duplicates(subset=["imdbId"], inplace=True)

metadata["imdbId"] = (
    metadata["imdbId"].apply(lambda x: str(x).replace("tt", "")).astype(int)
)

metadata.head(10)

Unnamed: 0,imdbId,overview
0,114709,"Led by Woody, Andy's toys live happily in his ..."
1,113497,When siblings Judy and Peter discover an encha...
2,113228,A family wedding reignites the ancient feud be...
3,114885,"Cheated on, mistreated and stepped on, the wom..."
4,113041,Just when George Banks has recovered from his ...
5,113277,"Obsessive master thief, Neil McCauley leads a ..."
6,114319,An ugly duckling having undergone a remarkable...
7,112302,"A mischievous young boy, Tom Sawyer, witnesses..."
8,114576,International action superstar Jean Claude Van...
9,113189,James Bond must unmask the mysterious head of ...


### Merge the movies with their overview data

In [101]:
movies = pd.merge(movies_merged_tags, metadata, on="imdbId", how="left")
movies["overview"] = movies["overview"].fillna("")
print(f"Total Movies: {len(movies)}")
movies.head()

Total Movies: 9742


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tags,overview
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy,114709,862,pixar|fun,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji (1995),adventure|children|fantasy,113497,8844,fantasy|robin williams|magic board game|game,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men (1995),comedy|romance,113228,15602,moldy|old,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale (1995),comedy|drama|romance,114885,31357,,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II (1995),comedy,113041,11862,pregnancy|remake,Just when George Banks has recovered from his ...


### Merge the movies with the cast data



In [102]:
movies = pd.merge(movies, cast, on="tmdbId", how="left")
movies["cast"] = movies["cast"].fillna("")
movies["crew"] = movies["crew"].fillna("")

movieIdToIndex = pd.Series(movies.index, index=movies["movieId"]).drop_duplicates()

print(f"Final processed movies: {len(movies)}")
movies.head()

Final processed movies: 9742


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tags,overview,cast,crew
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy,114709,862,pixar|fun,"Led by Woody, Andy's toys live happily in his ...",tom hanks|tim allen|don rickles|jim varney|wal...,john lasseter|joss whedon|andrew stanton|joel ...
1,2,Jumanji (1995),adventure|children|fantasy,113497,8844,fantasy|robin williams|magic board game|game,When siblings Judy and Peter discover an encha...,robin williams|jonathan hyde|kirsten dunst|bra...,larry j. franco|jonathan hensleigh|james horne...
2,3,Grumpier Old Men (1995),comedy|romance,113228,15602,moldy|old,A family wedding reignites the ancient feud be...,walter matthau|jack lemmon|ann-margret|sophia ...,howard deutch|mark steven johnson|mark steven ...
3,4,Waiting to Exhale (1995),comedy|drama|romance,114885,31357,,"Cheated on, mistreated and stepped on, the wom...",whitney houston|angela bassett|loretta devine|...,forest whitaker|ronald bass|ronald bass|ezra s...
4,5,Father of the Bride Part II (1995),comedy,113041,11862,pregnancy|remake,Just when George Banks has recovered from his ...,steve martin|diane keaton|martin short|kimberl...,alan silvestri|elliot davis|nancy meyers|nancy...


### Extend overview with the genres and tags (in a naive way)

In [103]:
tags_str = movies["tags"].apply(lambda x: x.replace("|", " "))
genres_str = movies["genres"].apply(lambda x: x.replace("|", " "))
cast_str = movies["cast"].apply(lambda x: x.replace("|", " "))
crew_str = movies["crew"].apply(lambda x: x.replace("|", " "))

movies["overview_extented"] = movies["overview"] + " " + tags_str + " " + genres_str + " " + cast_str + " " + crew_str
print(movies.loc[0]["overview_extented"])

movies.head()

Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. pixar fun adventure animation children comedy fantasy tom hanks tim allen don rickles jim varney wallace shawn john ratzenberger annie potts john morris erik von detten laurie metcalf r. lee ermey sarah freeman penn jillette john lasseter joss whedon andrew stanton joel cohen alec sokolow bonnie arnold ed catmull ralph guggenheim steve jobs lee unkrich ralph eggleston robert gordon mary helen leasman kim blanchette marilyn mccoppen randy newman dale e. grahn robin cooper john lasseter pete docter joe ranft patsy bouge norm decarlo ash brannon randy newman roman figun don davis james flamberg mary beth smith rick mackay susan bradley william reeves randy newman andrew stanton pete docter 

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tags,overview,cast,crew,overview_extented
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy,114709,862,pixar|fun,"Led by Woody, Andy's toys live happily in his ...",tom hanks|tim allen|don rickles|jim varney|wal...,john lasseter|joss whedon|andrew stanton|joel ...,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji (1995),adventure|children|fantasy,113497,8844,fantasy|robin williams|magic board game|game,When siblings Judy and Peter discover an encha...,robin williams|jonathan hyde|kirsten dunst|bra...,larry j. franco|jonathan hensleigh|james horne...,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men (1995),comedy|romance,113228,15602,moldy|old,A family wedding reignites the ancient feud be...,walter matthau|jack lemmon|ann-margret|sophia ...,howard deutch|mark steven johnson|mark steven ...,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale (1995),comedy|drama|romance,114885,31357,,"Cheated on, mistreated and stepped on, the wom...",whitney houston|angela bassett|loretta devine|...,forest whitaker|ronald bass|ronald bass|ezra s...,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II (1995),comedy,113041,11862,pregnancy|remake,Just when George Banks has recovered from his ...,steve martin|diane keaton|martin short|kimberl...,alan silvestri|elliot davis|nancy meyers|nancy...,Just when George Banks has recovered from his ...


### Create the TF-iDF matrix for the movies based on their text data

In [104]:
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words="english")

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies["overview_extented"])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(9742, 100795)

### Based on TFiDF, calculate the similarities of the movies

In [105]:
# Compute the cosine similarity matrix
movie_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
movie_similarities.shape

(9742, 9742)

## Section 2: Load and process the user ratings

In [106]:
# Load user ratings
user_ratings = pd.read_csv("../../data/train_ratings.csv").drop(columns=["timestamp"])
grouped_user_ratings = user_ratings.groupby("userId")

ratings_list = []
for user_id, group in grouped_user_ratings:
    rats = []
    for movie_id, rating in zip(list(group["movieId"]), list(group["rating"])):
        rats.append((movie_id, rating))

    ratings_list.append((user_id, rats))

# Transfrom the ratings to a dataframe
ratings = pd.DataFrame(ratings_list, columns=["userId", "ratings"])

userIdToIndex = pd.Series(ratings.index, index=ratings["userId"]).drop_duplicates()
ratings.head()

Unnamed: 0,userId,ratings
0,1,"[(2090, 5.0), (2427, 5.0), (1030, 3.0), (1793,..."
1,2,"[(68157, 4.5), (8798, 3.5), (80906, 5.0), (774..."
2,3,"[(26409, 4.5), (1093, 0.5), (1302, 0.5), (1272..."
3,4,"[(1733, 5.0), (2150, 5.0), (2186, 5.0), (4239,..."
4,5,"[(410, 3.0), (58, 5.0), (261, 4.0), (527, 5.0)..."


In [107]:
def similarity(movie_id_1, movie_id_2):    
    index1 = movieIdToIndex[movie_id_1]
    index2 = movieIdToIndex[movie_id_2]
        
    return movie_similarities[index1][index2]


def predict_rating(user_id, movie_id):
    # Get the index of the movie
    user_data = ratings.loc[userIdToIndex[user_id]]

    # Get the movies that the user has rated
    user_ratings = user_data["ratings"]

    # Get the similarity of the user's movies to the movie we are predicting
    r = d = 0
    for i, rating in enumerate(user_ratings):
        ratedMovieID = rating[0]
        ratedMovieRating = rating[1]
        
        sim = similarity(movie_id, ratedMovieID)

        r += sim * ratedMovieRating
        d += sim

        if ratedMovieID == movie_id:
            return ratedMovieRating

    if d == 0:
        return 0
    
    return r / d

## Section 3: Generate the predictions

In [93]:
test_data = pd.read_csv("../../data/test_set_no_ratings.csv")
results = []
for index, row in tqdm(test_data.iterrows(), desc="Generating predictions"):
    id = row["Id"]
    user_id = row["userId"]
    movie_id = row["movieId"]

    predicted_rating = predict_rating(user_id, movie_id)
    results.append((id, predicted_rating))

# Save the results stored in data list to a CSV file
csv_file = "../../submissions/content-based-output-ext-c.csv"

with open(csv_file, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Id", "rating"])
    writer.writerows(results)

print(f"CSV file '{csv_file}' has been created.")

Generating predictions: 20168it [00:32, 630.15it/s]


CSV file '../../submissions/content-based-output-ext-c.csv' has been created.
