In [99]:
# Import pandas library for data manipulation and analysis
import pandas as pd
# Read the movie catalogue and the user ratings, in that order, from the two
# CSV files referenced by relative path.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv"))

In [101]:
import re

def clean_title(title):
    """
    Strip punctuation and symbols from a title.

    Only ASCII letters, digits and spaces survive; every other character
    (including accented letters and underscores) is dropped without a
    replacement, so surrounding text closes up around it.

    Args:
        title (str): The raw title text

    Returns:
        str: The title reduced to ASCII letters, digits and spaces
    """
    # The negated character class matches exactly the characters to discard
    unwanted = re.compile("[^a-zA-Z0-9 ]")
    return unwanted.sub("", title)

In [86]:

# Store a punctuation-free copy of every title in a new "clean_title" column
movies["clean_title"] = movies["title"].map(clean_title)
    

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model over unigrams and bigrams (single terms and adjacent pairs)
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)

# Learn the vocabulary from the cleaned titles and encode them in one pass;
# the resulting matrix has one row per row of `movies`
tfidf_matrix = vectorizer.fit_transform(raw_documents=movies["clean_title"])


In [88]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """
    Find the movies whose titles are most similar to a free-text query.

    The query is cleaned with `clean_title`, encoded with the fitted
    `vectorizer`, and compared against every row of `tfidf_matrix` using
    cosine similarity.

    Args:
        title (str): The text typed by the user
        top_n (int): Maximum number of matches to return (default 5)

    Returns:
        pandas.DataFrame: Up to `top_n` rows of `movies`, ordered from the
        most similar title to the least similar one. Fewer rows are returned
        when there are fewer than `top_n` movies.
    """
    # Encode the cleaned query with the same vocabulary as the movie titles
    query_vec = vectorizer.transform([clean_title(title)])

    # One similarity score per movie, in the row order of `movies`
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Never ask for more hits than there are movies; argpartition would raise
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k best scores end up in the last
    # k slots, not that they are ordered, so rank those k explicitly to make
    # the first returned row the best match
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[top]


In [89]:
import ipywidgets as widgets
from IPython.display import display

# Free-text box in which the user types (part of) a movie title
movie_input = widgets.Text(description="Movie title:", disabled=False)

# Output area that holds the table of matching movies
movie_list = widgets.Output()


def on_type(data):
    """
    Refresh the list of matching movies after the text box changes.

    Args:
        data: Change notification handed over by `observe`; its "new" entry
            is read as the current text of the box
    """
    # NOTE(review): the recommendation cell further down defines a second
    # handler that is also called on_type; consider giving the two handlers
    # distinct names.
    with movie_list:
        # Drop the previous results before deciding whether to show new ones
        movie_list.clear_output()
        query = data["new"]
        # Queries of five characters or fewer are not searched
        if len(query) <= 5:
            return
        display(search(query))


# Run the handler on every change of the box's value
movie_input.observe(on_type, names="value")

# Render the text box with the results area underneath
display(movie_input, movie_list)


Text(value='', description='Movie title:')

Output()

In [90]:
import pandas as pd

def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """
    Recommend movies liked by the users who also liked `movie_id`.

    A rating counts as a "like" when it is strictly greater than
    `min_rating`. For each candidate movie, the share of similar users who
    liked it is divided by the share of all likers of the candidate set who
    liked it. A high score therefore means the movie is liked much more
    often by fans of `movie_id` than by that wider group.

    Args:
        movie_id: Value looked up in the "movieId" column of `ratings`
        min_rating (float): Ratings must exceed this value to count as a
            like (default 4)
        min_share (float): A candidate is kept only if more than this
            fraction of the similar users liked it (default 0.10)
        top_n (int): Maximum number of recommendations returned (default 10)

    Returns:
        pandas.DataFrame: Columns "score", "title" and "genres", highest
        score first. Empty when no user liked `movie_id`.
    """
    # Filter the ratings table once; every step below only looks at likes
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the given movie
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody to learn from; return an empty frame rather than dividing by zero
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Fraction of the similar users who liked each movie
    similar_share = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    similar_share = similar_share[similar_share > min_share]

    # Fraction of everyone who liked any candidate that liked each candidate
    candidate_likes = liked[liked["movieId"].isin(similar_share.index)]
    overall_share = (
        candidate_likes["movieId"].value_counts()
        / candidate_likes["userId"].nunique()
    )

    # Align both shares on movieId and rank by their ratio
    rec_percentages = pd.concat([similar_share, overall_share], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach title and genres to the best-scoring movies
    best = rec_percentages.head(top_n)
    return best.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [82]:
import ipywidgets as widgets
from IPython.display import display

# Free-text box in which the user types the movie to get recommendations for
movie_input_name = widgets.Text(description="Movie Title: ", disabled=False)

# Output area that holds the table of recommended movies
recommendation_list = widgets.Output()


def on_type(data):
    """
    Refresh the recommendations after the text box changes.

    Args:
        data: Change notification handed over by `observe`; its "new" entry
            is read as the current text of the box
    """
    # NOTE(review): an earlier cell defines a search handler that is also
    # called on_type; consider giving the two handlers distinct names.
    with recommendation_list:
        # Drop the previous recommendations before computing new ones
        recommendation_list.clear_output()
        query = data["new"]
        # Queries of five characters or fewer are not searched
        if len(query) <= 5:
            return
        # The first row returned by search is taken as the movie the user meant
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))


# Run the handler on every change of the box's value
movie_input_name.observe(on_type, names="value")

# Render the text box with the recommendations area underneath
display(movie_input_name, recommendation_list)


Text(value='', description='Movie Title: ')

Output()