In [None]:
!wget https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

Exercise Prompt Hints

- Key step: how to combine movie data into a single string?
- Recall: TfidfVectorizer expects one string per "document"
- Transform the strings using TF-IDF

- Assume the query is always an existing movie in the database:

     E.g. query ="Scream 3", then recommend other movies based on this

- Get TF-IDF representation of Scream 3 
- Compute similarity between Scream 3 and all other vectors

- Sort by similarity

- Print out the top 5 closest movies

- Try movies from other genres

In [243]:
import ast
import matplotlib.pyplot as plt
from typing import Generator


from scipy.sparse._csr import csr_matrix
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [283]:
class MovieRecommender:
    def __init__(self, dataset_path: str):
        self.df = self._load_df(dataset_path)
        self.df = self._fix_loading(self.df)
        self.df = self._add_mix_column(self.df)
        
        self.tfid = TfidfVectorizer(max_features=5000)


    def _load_df(self, dataset_path: str) -> pd.DataFrame:
        return pd.read_csv(dataset_path)
    
    def _fix_loading(self, df: pd.DataFrame) -> pd.DataFrame:
        df["genres"] = df['genres'].apply(ast.literal_eval)
        df["keywords"] = df['keywords'].apply(ast.literal_eval)
        return df
    
    def _add_mix_column(self, df: pd.DataFrame) -> pd.DataFrame:
        df["mix"] = df.apply(self._get_row_string_representation, axis=1)
        return df

    def _get_row_string_representation(self, row) -> str:
        genres = self._cast_dict_value_to_string(row["genres"])
        keywords = self._cast_dict_value_to_string(row["keywords"])

        return genres + keywords

    @staticmethod
    def _cast_dict_value_to_string(row: list):
        return " ".join( [cell.get("name") for cell in row] )

    def get_movie_recommendation(self, movie: str, distance_type: str = "cosine", recommendations: int = 5) -> pd.Series:

        movie_idx = self._get_movie_idx(movie)

        X_train, X_test = self.vectorize(movie_idx)
        scores = self.compute_scores(X_train, X_test, distance_type)

        return self.df.iloc[scores[1:recommendations+1]]["title"]

    def vectorize(self, movie_idx: int) -> tuple[csr_matrix, csr_matrix]:
        X_train_trans = self.tfid.fit_transform( self.df["mix"] )
        X_test_trans = X_train_trans[movie_idx]
        return X_train_trans, X_test_trans

    def compute_scores(self, X_train: csr_matrix, X_test: csr_matrix, distance_type: str):

        if distance_type == "cosine":
            scores = -cosine_similarity( X_test, X_train )

        elif distance_type == "euclidean":
            scores = euclidean_distances( X_test, X_train )

        return scores.argsort()[0]


    def _get_movie_idx(self, movie: str) -> pd.Index:
        try:
            return self.df[ self.df["title"] == movie ].index[0]
        except IndexError as e:
            raise IndexError(f"Movie '{movie}' not in the DB. %s")

In [286]:
recommender = MovieRecommender("../datasets/tmdb_5000_movies.csv")

movie = "Runaway Bride"
recommender.get_movie_recommendation(movie, distance_type="cosine")

2290                  Just Married
3408         Two Lovers and a Bear
4719             The Married Woman
2325    My Big Fat Greek Wedding 2
4115                    House of D
Name: title, dtype: object