In [14]:
# Importing essential libraries
import pandas as pd
import numpy as np
import re
# For text vectorization and similarity calculation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# For creating interactive widgets in Jupyter/Colab
import ipywidgets as widgets
from IPython.display import display

In [15]:
# Load the movies dataset (one row per movie: movieId, title, genres).
# The path is relative, so 'movies.csv' must be in the notebook's working directory.
movies = pd.read_csv('movies.csv')

In [16]:
# Preview: a bare DataFrame as the cell's last expression is rendered as a table;
# pandas truncates the display to the first and last rows.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [17]:
# Function to normalise movie titles by stripping special characters
def clean_title(title):
  """Return `title` with everything except ASCII letters, digits and spaces removed.

  Example: 'Toy Story (1995)' -> 'Toy Story 1995'.
  """
  special_chars = re.compile('[^a-zA-Z0-9 ]')
  return special_chars.sub('', title)

In [18]:
# Add a 'clean_title' column: every title passed element-wise through clean_title()
movies['clean_title'] = movies['title'].map(clean_title)

In [19]:
# Preview of the updated dataframe: the new 'clean_title' column appears as the
# last column, next to the original 'title'.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [20]:
# Create tfidf vectors from cleaned titles.
# ngram_range=(1,2) indexes single words as well as adjacent word pairs.
# tfidf has one row per movie, in the same order as `movies`; search() relies on
# that alignment when it maps similarity positions back to rows with iloc.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [21]:
# Search function to find the top 5 most similar movies by title using cosine similarity
def search(title):
  """Return the rows of `movies` whose titles best match `title`, best match first.

  The query is cleaned with clean_title(), vectorized with the fitted
  `vectorizer`, and compared against every row of `tfidf` by cosine similarity.

  Parameters
  ----------
  title : str
      Free-text movie title typed by the user.

  Returns
  -------
  pandas.DataFrame
      Up to 5 rows of `movies`, ordered from most to least similar.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
  top_n = min(5, len(similarity))
  if top_n == 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the top_n largest scores end up in the last
  # top_n slots; their order inside that slice is unspecified. Sort the slice
  # explicitly so that row 0 of the result really is the best match.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  indices = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[indices]

In [22]:
# Text box the user types a movie title into
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area that the search results are rendered into
movie_list = widgets.Output()

def on_type(data):
  """Refresh `movie_list` with search results whenever the text box value changes.

  `data` is the change dict passed by the widget; its 'new' entry is the text
  currently in the box. Inputs of 5 characters or fewer only clear the output.
  """
  with movie_list:
    movie_list.clear_output()
    title = data['new']
    if len(title) <= 5:
      # Too short to search on; leave the output cleared
      return
    display(search(title))

# Run on_type every time the text box's value changes
movie_input.observe(on_type, names='value')

# Render the text box with its results area underneath
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [23]:
# Load the user ratings dataset (one row per rating: userId, movieId, rating, timestamp).
# The path is relative, so 'ratings.csv' must be in the notebook's working directory.
ratings = pd.read_csv('ratings.csv')

In [24]:
# Preview: pandas truncates the rendered table to the first and last rows.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
15578590,100887,6539,5.0,1.152229e+09
15578591,100887,7153,4.0,1.152229e+09
15578592,100887,8917,4.5,1.152230e+09
15578593,100887,8957,4.0,1.152230e+09


In [25]:
# Function to find similar movies based on collaborative filtering
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=5):
    """Recommend movies liked by the users who also liked `movie_id`.

    A rating counts as a "like" when it is strictly greater than `min_rating`.
    Each candidate movie is scored as

        share of similar users who liked it / share of all likers who liked it

    so movies that are specifically popular among the similar users rank
    above movies that are simply popular with everyone.

    Parameters
    ----------
    movie_id : int
        movieId of the movie to find neighbours for.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        A candidate is kept only if more than this fraction of the similar
        users liked it.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "score", "title", "genres", sorted by descending score.
        Empty when nobody liked `movie_id` or no candidate passes `min_share`.
        The queried movie itself is not filtered out of the result.
    """
    empty_result = pd.DataFrame(columns=["score", "title", "genres"])

    # Every step below only looks at "liked" ratings, so filter the frame once.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the given movie
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return empty_result

    # Fraction of the similar users who liked each movie
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Filter out movies that are not popular enough among the similar users
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return empty_result

    # How popular the remaining movies are among everyone who liked any of them
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Combine both ratios to compute a relevance score
    rec_percentage = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentage.columns = ["similar", "all"]
    rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]

    # Sort by score and attach title/genres from the movie metadata
    rec_percentage = rec_percentage.sort_values("score", ascending=False)
    return rec_percentage.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [26]:
# New widget for collaborative filtering-based recommendation
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# Output widget for recommendations
recommendation_list = widgets.Output()

# Callback for generating recommendations. It has its own name so that it does not
# rebind `on_type`, the callback already defined for the title-search widget.
def on_recommendation_type(data):
    """Refresh `recommendation_list` whenever the text box value changes.

    `data` is the change dict passed by the widget; its 'new' entry is the text
    currently in the box. Inputs of 5 characters or fewer only clear the output.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)  # First find the matching movie
            if results.empty:
                # No candidate title at all; leave the output cleared
                return
            # Row 0 is used as the match; this assumes search() returns the best match first
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))  # Show similar movies

# Bind the event
movie_name_input.observe(on_recommendation_type, names='value')

# Display the final recommendation widget
display(movie_name_input, recommendation_list)


Text(value='Toy Story', description='Movie Title:')

Output()