## Importing Libraries

In [1]:
import ast
import ipywidgets as widgets
import json
import pandas as pd

from IPython.display import display, clear_output
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Metadata

In [3]:
df_metadata_ori = pd.read_csv("data/movies_metadata.csv", low_memory=False)
df_metadata_ori.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,30/10/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,15/12/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,22/12/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,22/12/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,10/2/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
df_metadata_ori.shape

(45466, 24)

In [5]:
df_metadata_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### Checking if there are missing values.

In [6]:
df_metadata_ori.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

## Data Preprocessing

In [7]:
df_metadata = df_metadata_ori.copy()

### Columns that are needed to build our recommendation system

In [8]:
df_metadata = df_metadata[['id', 'title', 'genres', 'original_language', 'overview', 'tagline', 'production_countries', 'release_date', 'status', 'vote_average', 'vote_count', 'runtime']]

### Check whether duplicate titles have the same release date 

In [9]:
df_metadata[['title', 'release_date']].duplicated().sum()

32

### Number of movies with no overviews

In [10]:
df_metadata[df_metadata.overview.isnull()].shape[0]

954

### Number of movies that have not yet been released

In [11]:
df_metadata[df_metadata.status != "Released"].shape[0]

452

**TODO:**  

We will remove movies which:
- have same titles and release date.
- have no overviews.
- have not yet been released.

### Genres and Production Countries

In [12]:
df_metadata['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [13]:
df_metadata['production_countries'][0]

"[{'iso_3166_1': 'US', 'name': 'United States of America'}]"

**TODO:**  

We need to extract the names from the data.

### Extract the movie feature names from the data

In [14]:
def extract_names(data: str) -> str:
    """
    Extract the 'name' values from a stringified list of dictionaries.

    :param data: A string holding a Python-literal list of objects. Each object should have a 'name' key.
    :return: A string containing the names, separated by commas and spaces. An empty string is
             returned when the input is empty or cannot be read as a list of objects that each
             carry a string 'name'.
    
    **Example:**
    >>> extract_names("[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]")
    'Animation, Comedy, Family'
    """
    if not data:
        # If the input is empty, return an empty string
        return ""

    try:
        # literal_eval already produces plain Python lists/dicts, so no JSON round trip is needed
        parsed = ast.literal_eval(data)
        # Join the names into a single string separated by commas and spaces
        return ', '.join(item['name'] for item in parsed)
    except (TypeError, ValueError, SyntaxError, KeyError):
        # TypeError: the parsed value is not a list of dicts (e.g. a bare number) or a name is not a string
        # ValueError / SyntaxError: the text is not a valid Python literal (this also covers NaN inputs)
        # KeyError: an entry has no 'name' key
        return ""

In [15]:
def clean_movies_data_set(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the movies dataset by removing duplicates, null values and non-released movies, 
    and extracting the genre names and production countries from the data.

    The input frame is not modified; all cleaning happens on a copy, so re-running the
    cell that calls this function always starts from the same data.

    :param df: The movie dataset to be cleaned. It must contain the columns 'title',
               'release_date', 'overview', 'status', 'genres' and 'production_countries'.
    :return: A cleaned pandas DataFrame (a new object; the original row labels are kept).
    """
    print(f"The number of movies in the original data set is: {df.shape[0]}")

    # Removes duplicate titles that have the same release date (returns a new frame)
    cleaned = df.drop_duplicates(subset=['title', 'release_date'])

    # Removes movies that have no overview or have not yet been released
    index_drop = cleaned[(cleaned.overview.isnull()) | (cleaned.status != "Released")].index
    # .copy() detaches the result from `df` so the assignments below never touch the caller's data
    cleaned = cleaned.drop(index_drop).copy()

    # Marks empty production_countries lists ("[]") as missing so they are blanked out below
    cleaned.loc[cleaned.production_countries == "[]", 'production_countries'] = pd.NA

    # Replaces all the null values with empty string
    cleaned = cleaned.fillna("")

    # Extracts the genre names and production countries from the data
    cleaned['genres'] = cleaned['genres'].apply(extract_names)
    cleaned['production_countries'] = cleaned['production_countries'].apply(extract_names)

    print(f"The number of movies in the cleaned data set is: {cleaned.shape[0]}")

    return cleaned

In [16]:
df_metadata = clean_movies_data_set(df_metadata)

The number of movies in the original data set is: 45466
The number of movies in the cleaned data set is: 44065


In [17]:
df_metadata.head()

Unnamed: 0,id,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime
0,862,Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0
1,8844,Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0
2,15602,Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0
3,31357,Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0
4,11862,Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0


# Keywords

In [18]:
df_keywords = pd.read_csv('data/keywords.csv')

In [19]:
df_keywords = df_keywords.drop_duplicates()

In [20]:
df_keywords.shape

(45432, 2)

In [21]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [22]:
df_keywords.keywords[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

### Extract keywords

In [23]:
df_keywords['keywords'] = df_keywords['keywords'].apply(extract_names)

In [24]:
df_keywords.keywords[0]

'jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life'

In [25]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,"board game, disappearance, based on children's..."
2,15602,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,"based on novel, interracial relationship, sing..."
4,11862,"baby, midlife crisis, confidence, aging, daugh..."


## Merge two data frames

In [26]:
# Cast both 'id' columns to str so the join keys share one dtype before merging.
# pd.merge performs an inner join by default, so only ids present in both frames are kept.
df_metadata['id'] = df_metadata['id'].astype(str)
df_keywords['id'] = df_keywords['id'].astype(str)
df_merge = pd.merge(df_keywords, df_metadata, on='id')

In [27]:
df_merge.head()

Unnamed: 0,id,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime
0,862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0
1,8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0
2,15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0
3,31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0
4,11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0


In [28]:
df_merge.shape

(44064, 13)

In [29]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44064 entries, 0 to 44063
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    44064 non-null  object 
 1   keywords              44064 non-null  object 
 2   title                 44064 non-null  object 
 3   genres                44064 non-null  object 
 4   original_language     44064 non-null  object 
 5   overview              44064 non-null  object 
 6   tagline               44064 non-null  object 
 7   production_countries  44064 non-null  object 
 8   release_date          44064 non-null  object 
 9   status                44064 non-null  object 
 10  vote_average          44064 non-null  float64
 11  vote_count            44064 non-null  float64
 12  runtime               44064 non-null  float64
dtypes: float64(3), object(10)
memory usage: 4.4+ MB


# Content-Based Filtering

### Create a new column named 'soup': a string containing all the data that we want to feed to the model.  
- Soup: genres, original language, overview, tagline, keywords,  production countries

In [30]:
def create_soup(movie: pd.Series) -> str:
    """
    Build the lowercase text "soup" for one movie by joining its descriptive features.

    :param movie: A movie row exposing genres, original_language, overview, tagline,
                  keywords and production_countries as strings.
    :return: The six features joined by single spaces, in that order, lowercased.
    """
    soup_parts = [
        movie.genres,
        movie.original_language,
        movie.overview,
        movie.tagline,
        movie.keywords,
        movie.production_countries,
    ]
    # An empty feature (e.g. no tagline) still contributes its separator
    return " ".join(soup_parts).lower()

In [31]:
df_merge["soup"] = df_merge.apply(create_soup, axis = 1)

In [32]:
df_merge.soup[0]

"animation, comedy, family en led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences.  jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life united states of america"

In [33]:
df_merge.head()

Unnamed: 0,id,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime,soup
0,862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0,"animation, comedy, family en led by woody, and..."
1,8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0,"adventure, fantasy, family en when siblings ju..."
2,15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0,"romance, comedy en a family wedding reignites ..."
3,31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0,"comedy, drama, romance en cheated on, mistreat..."
4,11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0,comedy en just when george banks has recovered...


In [34]:
from sentence_transformers import SentenceTransformer

# Initialize the 'all-MiniLM-L6-v2' pre-trained model. No device is forced here:
# when none is given, SentenceTransformer uses a GPU if one is available and the CPU
# otherwise, so the notebook also runs on machines without CUDA.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [35]:
# Encode every string of the 'soup' column into an embedding using the pre-trained model
sentence_embeddings = model.encode(df_merge['soup'].tolist())

In [36]:
# Compute the pairwise cosine similarity between all movie embeddings.
# NOTE(review): the result has one row and one column per movie in df_merge, so it grows
# quadratically with the number of movies — confirm it fits in memory on the target machine.
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(sentence_embeddings)

In [37]:
def lowercase_title(title: str) -> str:
    """
    Return the given movie title converted to lowercase.

    :param title: The movie title to normalise.
    :return: The same title with every cased character lowered.
    """
    lowered = title.lower()
    return lowered

# Reverse lookup: lowercased movie title -> row label in df_merge.
# drop_duplicates() compares the values (the row labels), not the titles, so movies
# sharing a title each keep their own entry.
movie_indices = pd.Series(
    data=df_merge.index,
    index=df_merge['title'].map(lowercase_title),
).drop_duplicates()


In [38]:
movie_indices.head()

title
toy story                      0
jumanji                        1
grumpier old men               2
waiting to exhale              3
father of the bride part ii    4
dtype: int64

### IMDB's weighted rating

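A movie with an average rating of 9 based on only 2 votes cannot be considered better than a movie with a lower average rating of 8 based on 1000 votes. So we will use IMDB's weighted rating to determine the quality of a movie: $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$, where $v$ is the movie's vote count, $R$ its vote average, $m$ the minimum number of votes required, and $C$ the mean vote average.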

In [39]:
def weighted_rating(movie: pd.Series, 
                    min_vote_counts: int, 
                    mean_vote_average: float) -> float:
    """
    Calculate the weighted rating for a movie based on its vote count, vote average,
    and the minimum vote counts and mean vote average across the dataset.

    WR = v / (v + m) * R + m / (v + m) * C

    :param movie: A DataFrame row representing a movie; 'vote_count' (v) and 'vote_average' (R) are read.
    :param min_vote_counts: The minimum votes required to be listed in the chart (m).
    :param mean_vote_average: The mean vote average across the whole dataset (C).
    :return: The weighted rating for the movie. When the movie has no votes and the
             threshold is also zero, the mean vote average is returned.
    """
    vote_count = movie['vote_count']
    vote_average = movie['vote_average']
    total_votes = vote_count + min_vote_counts

    if total_votes == 0:
        # No votes and no threshold: the formula would be 0/0, so fall back to the prior (C)
        return mean_vote_average

    return (vote_count/total_votes * vote_average) + (min_vote_counts/total_votes * mean_vote_average)

In [80]:
def get_recommended_movie_indices(title: str) -> list[int]:
    """
    Get the indices of the top 50 most similar movies based on the cosine similarity of their metadata and keywords, regardless of the quality of the movie.

    The similarity scores are read from the module-level `cos_sim` matrix and the title is
    resolved through the module-level `movie_indices` lookup.

    :param title: The title of the movie to find similar movies for (matched case-insensitively).
    :return: A list of recommended movie indices, most similar first, never containing the
             queried movie itself. If the title is unknown, an error-message string is
             returned instead of a list (kept for backward compatibility), so callers
             should check the type of the result.
    """

    try:
        # Get the index of the movie that matches the title
        movie_index = movie_indices[title.lower()]
    except KeyError:
        return(f"Movie '{title}' not found. Please enter a valid movie title.")

    # If there are multiple movies with the same title, pick the first one.
    # .iloc is positional; plain [0] on a title-indexed Series relies on a deprecated fallback.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]

    # Similarity of every movie to the selected one (one row of the array built by cosine_similarity)
    sim_scores = cos_sim[movie_index]

    # Rank all movies by descending similarity; the stable sort keeps lower indices first on ties
    ranked_indices = (-sim_scores).argsort(kind='stable')

    # Drop the queried movie explicitly instead of assuming it is ranked first
    ranked_indices = ranked_indices[ranked_indices != movie_index]

    # Keep the first 50 movie recommendations as plain Python ints
    return ranked_indices[:50].tolist()

In [81]:
def filter_out_bad_movies(recommended_movie_indices: list[int],
                          min_vote_quantile: float = 0.6) -> pd.DataFrame:
    """
    Return the details of the recommended movies after filtering out bad movies based on IMDB's weighted rating.

    :param recommended_movie_indices: The indices of recommended movies (row labels of `df_merge`).
    :param min_vote_quantile: Quantile of the recommended movies' vote counts used as the
                              minimum number of votes a movie needs to qualify, defaults to 0.6.
    :return: A DataFrame of the qualified movies with an added 'weighted_rating' column,
             sorted by that column in descending order.
    """
    movie_details = ['id', 'title', 'genres','original_language', 'production_countries', 'release_date', 'overview', 'vote_count', 'vote_average', 'runtime']
    
    movies = df_merge.loc[recommended_movie_indices, movie_details]

    # Calculate the minimum votes required to be listed in the chart and the mean vote average,
    # ignoring missing values
    min_vote_counts = movies['vote_count'].dropna().quantile(min_vote_quantile)
    mean_vote_average = movies['vote_average'].dropna().mean()

    # Filter the movies based on the conditions for vote count and vote average.
    # .copy() makes the result independent of `movies` so adding a column below is safe.
    is_qualified = (movies['vote_count'] >= min_vote_counts) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())
    qualified_movies = movies[is_qualified].copy()

    if qualified_movies.empty:
        # Nothing qualified (e.g. no vote data at all): return the empty frame with the expected column
        return qualified_movies.assign(weighted_rating=pd.Series(dtype=float))

    # Apply the weighted_rating function to calculate the weighted ratings for each movie
    qualified_movies['weighted_rating'] = qualified_movies.apply(lambda movie: weighted_rating(movie, min_vote_counts, mean_vote_average), axis=1)

    return qualified_movies.sort_values('weighted_rating', ascending=False)

    

In [82]:
def get_movie_recommendations_cbf(title: str,
                                  num_recommendations: int = 5) -> None:
    """
    Print movie recommendations based on a given title using content-based filtering.

    :param title: The title of the movie to find similar movies for.
    :param num_recommendations: The number of recommended movies, defaults to 5.
    """
    
    recommended_movie_indices = get_recommended_movie_indices(title)

    # For an unknown title the lookup returns an error message instead of a list of indices;
    # show it to the user rather than passing it on as if it were row labels
    if isinstance(recommended_movie_indices, str):
        print(recommended_movie_indices)
        return

    recommended_movies = filter_out_bad_movies(recommended_movie_indices).head(num_recommendations)

    for _, movie in recommended_movies.iterrows():
        # Print the movie details
        print(f"Title: {movie['title']}")
        print(f"Overview: {movie['overview']}")
        print(f"Genres: {movie['genres']}")
        print(f"Original Language: {movie['original_language']}")
        print(f"Runtime: {int(movie['runtime'])} mins")
        print(f"Production Countries: {movie['production_countries']}")
        print(f"Release Date: {movie['release_date']}")
        print(f"Rating: {movie['vote_average']} out of 10 ({int(movie['vote_count'])} ratings)")
        print("")

In [83]:
# Widgets of the content-based recommender UI
dropdown = widgets.Dropdown(options=df_merge['title'].unique())
search_box = widgets.Text(placeholder='Search movie title...')
button = widgets.Button(description='Get Movie Recommendations')
button.layout.width = "200px"

def on_search_box_value_change(change):
    """
    Event listener for the search box widget. Updates the dropdown options based on the search query.

    :param change: The change notification of the search box; its `new` attribute holds the current text.
    """

    search_value = change.new.lower()
    # regex=False matches the typed text literally, so characters such as '(' or '*'
    # are not interpreted as a regular expression (which could raise or match unexpectedly)
    title_matches = df_merge['title'].str.lower().str.contains(search_value, regex=False)
    options = df_merge[title_matches]['title'].unique()
    dropdown.options = options if len(options) > 0 else ['Movie not found']
    dropdown.label = options[0] if len(options) > 0 else 'Movie not found'

def on_button_click(button):
    """
    Event listener for the button widget. Displays the recommended movies.

    :param button: The button instance that was clicked.
    """

    # Redraw the widgets so the previous recommendations are replaced, not appended to
    clear_output()
    display(search_box)
    display(dropdown)
    display(button)
    title = dropdown.value

    if title != "Movie not found":
        search_box.value = dropdown.value
        get_movie_recommendations_cbf(title)
        
    else:
        print("Movie Not Found")

# Attach event listeners to the widgets
search_box.observe(on_search_box_value_change, names='value')
button.on_click(on_button_click)

# Display the widgets
display(search_box)
display(dropdown)
display(button)


Text(value='Transfor', placeholder='Search movie title...')

Dropdown(index=4, options=('The Transformers: The Movie', 'Transformers', 'Transformers: Revenge of the Fallen…

Button(description='Get Movie Recommendations', layout=Layout(width='200px'), style=ButtonStyle())

Title: Star Wars
Overview: Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.
Genres: Adventure, Action, Science Fiction
Original Language: en
Runtime: 121 mins
Production Countries: United States of America
Release Date: 25/5/1977
Rating: 8.1 out of 10 (6778 ratings)

Title: Avengers: Age of Ultron
Overview: When Tony Stark tries to jumpstart a dormant peacekeeping program, things go awry and Earth’s Mightiest Heroes are put to the ultimate test as the fate of the planet hangs in the balance. As the villainous Ultron emerges, it is up to The Avengers to stop him from enacting his terrible plans, and soon uneasy alliances and unexpected action pave the way for an epic and unique global adventure.
Genres: Action, Adventure, Scien

# Collaborative Filtering

In [44]:
df_ratings = pd.read_csv("data/ratings_small.csv")

In [45]:
df_ratings.shape

(100004, 4)

In [46]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [47]:
reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

In [48]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8908  0.8918  0.8999  0.9007  0.8991  0.8965  0.0043  
MAE (testset)     0.6868  0.6875  0.6946  0.6932  0.6906  0.6906  0.0030  
Fit time          0.82    0.87    0.67    0.72    0.87    0.79    0.08    
Test time         0.09    0.08    0.08    0.08    0.10    0.09    0.01    


{'test_rmse': array([0.89078263, 0.89182513, 0.89992119, 0.9006902 , 0.8991206 ]),
 'test_mae': array([0.68678832, 0.68754638, 0.69455704, 0.69322746, 0.69063624]),
 'fit_time': (0.824134111404419,
  0.8717362880706787,
  0.665790319442749,
  0.7157456874847412,
  0.8705189228057861),
 'test_time': (0.08936953544616699,
  0.08322405815124512,
  0.08230996131896973,
  0.07801151275634766,
  0.09910368919372559)}

In [49]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x209ff4dc220>

In [50]:
# Predict the rating of user 1 for the movie with the movie ID 1029.
svd.predict(1, 1029)

Prediction(uid=1, iid=1029, r_ui=None, est=2.580544496354618, details={'was_impossible': False})

# Hybrid Recommender

In [51]:
df_ids = pd.read_csv("data/links.csv")[['movieId', 'tmdbId']]
df_ids.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [52]:
df_ids.dropna(inplace=True)

In [53]:
df_ids.shape

(45624, 2)

In [54]:
df_ids.columns = ['movieId', 'id']
df_ids['id'] = df_ids['id'].astype(int)
df_ids['id'] = df_ids['id'].astype(str)

In [55]:
df_merge_ids = df_merge.merge(df_ids, on='id')

In [56]:
df_merge_ids.head()

Unnamed: 0,id,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime,soup,movieId
0,862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0,"animation, comedy, family en led by woody, and...",1
1,8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0,"adventure, fantasy, family en when siblings ju...",2
2,15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0,"romance, comedy en a family wedding reignites ...",3
3,31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0,"comedy, drama, romance en cheated on, mistreat...",4
4,11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0,comedy en just when george banks has recovered...,5


In [57]:
indices_map = df_merge_ids.set_index('id')
indices_map.head()

Unnamed: 0_level_0,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime,soup,movieId
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0,"animation, comedy, family en led by woody, and...",1
8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0,"adventure, fantasy, family en when siblings ju...",2
15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0,"romance, comedy en a family wedding reignites ...",3
31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0,"comedy, drama, romance en cheated on, mistreat...",4
11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0,comedy en just when george banks has recovered...,5


In [85]:
def predict_user_rating(userId: int, 
                        qualified_movies: pd.DataFrame) -> pd.DataFrame:
    """
    Predict user ratings for qualified movies based on user ID using Singular Value Decomposition (SVD).

    Each movie's 'id' is translated to its 'movieId' through the module-level `indices_map`
    and rated with the module-level fitted `svd` model. The input frame is left untouched.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data; its 'id' column is read.
    :return: A new Pandas DataFrame containing the qualified movies with an added
             'estimated_user_rating' column, sorted by estimated user ratings (highest first).
    :raises KeyError: If a movie id has no entry in `indices_map`.
    """

    def _lookup_movie_id(tmdb_id):
        # Translate one 'id' value into the 'movieId' the ratings model was trained on
        movie_id = indices_map.loc[tmdb_id, 'movieId']
        if isinstance(movie_id, pd.Series):
            # The id occurs more than once in indices_map; use the first mapping
            movie_id = movie_id.iloc[0]
        return movie_id

    # Work on a copy: the caller usually passes a slice (e.g. the result of .head())
    rated_movies = qualified_movies.copy()

    # Calculate estimated user ratings for qualified movies using SVD
    rated_movies['estimated_user_rating'] = rated_movies['id'].apply(lambda tmdb_id: svd.predict(userId, _lookup_movie_id(tmdb_id)).est)
    final_qualified_movies = rated_movies.sort_values(by=['estimated_user_rating'], ascending=False)
    
    return final_qualified_movies


In [86]:
def get_movie_recommendations_hybrid(title: str, 
                              userId: int) -> None:
    """
    Print movie recommendations based on a given title and user ID.

    Similar movies are found by content, filtered by weighted rating, and finally ranked
    by the rating the SVD model estimates for the given user.
     
    :param title: The title of the movie to find similar movies for.
    :param userId: The ID of the user.
    """
    # Get recommended movie indices based on the given title
    recommended_movie_indices = get_recommended_movie_indices(title)

    # For an unknown title the lookup returns an error message instead of a list of indices;
    # show it to the user rather than passing it on as if it were row labels
    if isinstance(recommended_movie_indices, str):
        print(recommended_movie_indices)
        return

    # Filter out bad movies and select the top 10 qualified movies
    qualified_movies = filter_out_bad_movies(recommended_movie_indices).head(10)
    
    # Predict user ratings for qualified movies and select the top recommended movies
    final_qualified_movies = predict_user_rating(userId, qualified_movies).head(5)
    
    for _, movie in final_qualified_movies.iterrows():
        # Print the movie details
        print(f"Title: {movie['title']}")
        print(f"Overview: {movie['overview']}")
        print(f"Genres: {movie['genres']}")
        print(f"Original Language: {movie['original_language']}")
        print(f"Runtime: {int(movie['runtime'])} mins")
        print(f"Production Countries: {movie['production_countries']}")
        print(f"Release Date: {movie['release_date']}")
        print(f"Rating: {movie['vote_average']} out of 10 ({int(movie['vote_count'])} ratings)")
        print("")

In [None]:
# Widgets of the hybrid recommender UI
dropdown = widgets.Dropdown(options=df_merge['title'].unique())
search_box = widgets.Text(placeholder='Search movie title...')
userId_text = widgets.Text(placeholder = 'Enter a user ID')
button = widgets.Button(description='Get Movie Recommendations')
button.layout.width ='200px'

container = widgets.HBox([search_box, userId_text])
container2 = widgets.HBox([dropdown, button])

def on_search_box_value_change(change):
    """
    Event listener for the search box widget. Updates the dropdown options based on the search query.

    :param change: The change notification of the search box; its `new` attribute holds the current text.
    """
    search_value = change.new.lower()
    # regex=False matches the typed text literally, so characters such as '(' or '*'
    # are not interpreted as a regular expression (which could raise or match unexpectedly)
    title_matches = df_merge['title'].str.lower().str.contains(search_value, regex=False)
    options = df_merge[title_matches]['title'].unique()
    dropdown.options = options if len(options) > 0 else ['Movie not found']
    dropdown.label = options[0] if len(options) > 0 else 'Movie not found'

def on_button_click(button):
    """
    Event listener for the button widget. Displays the recommended movies.

    :param button: The button instance that was clicked.
    """
    # Redraw the widgets so the previous recommendations are replaced, not appended to
    clear_output()
    display(container)
    display(container2)

    # Only the conversion of the typed user ID is guarded: any other failure should surface
    # as its real error instead of being reported as an invalid user ID
    try:
        userId = int(userId_text.value)
    except ValueError:
        print("Please enter a valid user ID!")
        return

    title = dropdown.value
    if title != "Movie not found":
        search_box.value = dropdown.value
        get_movie_recommendations_hybrid(title, userId)
    else:
        print("Movie Not Found")
    
# Attach event listeners to the widgets
search_box.observe(on_search_box_value_change, names='value')
button.on_click(on_button_click)

print("Movie Recommendation System")
display(container)
display(container2)

HBox(children=(Text(value='Toy Story', placeholder='Search movie title...'), Text(value='67', placeholder='Ent…

HBox(children=(Dropdown(options=('Toy Story', 'Toy Story 2', 'Toy Story 3', 'Toy Story of Terror!', 'Toy Story…

Title: Big
Overview: A young boy, Josh Baskin makes a wish at a carnival machine to be big. He wakes up the following morning to find that it has been granted and his body has grown older overnight. But he is still the same 13-year-old boy inside. Now he must learn how to cope with the unfamiliar world of grown-ups including getting a job and having his first romantic encounter with a woman. What will he find out about this strange world?
Genres: Fantasy, Drama, Comedy, Romance, Family
Original Language: en
Runtime: 104 mins
Production Countries: United States of America
Release Date: 3/6/1988
Rating: 6.9 out of 10 (1022 ratings)

Title: Toy Story 3
Overview: Woody, Buzz, and the rest of Andy's toys haven't been played with in years. With Andy about to go to college, the gang find themselves accidentally left at a nefarious day care center. The toys must band together to escape and return home to Andy.
Genres: Animation, Family, Comedy
Original Language: en
Runtime: 103 mins
Production