In [1]:
import pandas as pd, numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

# We start by importing necessary libraries such as pandas, numpy, re, TfidfVectorizer, 
# cosine_similarity, ipywidgets, and display from IPython.


In [2]:
def clean_title(title):
    """Strip every character except ASCII letters, digits and spaces.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with all other characters removed, e.g. ``"Toy Story 1995"``.
    """
    # The class must read A-Z: the range A-z also spans the ASCII punctuation
    # that sits between "Z" and "a" ([ \ ] ^ _ `), which would survive cleaning.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)

# This is a helper function that takes a movie title as input and removes
# any special characters or symbols, leaving only alphanumeric characters and spaces.


In [3]:
# Load the movie catalogue (path is relative to the notebook's working directory).
# Later cells read its `movieId`, `title` and `genres` columns.
movies = pd.read_csv("movies.csv")
# Preview the first rows as the cell output.
movies.head()

# This loads movies.csv into a DataFrame called "movies" that stores the movie data.

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Run every raw title through clean_title and keep the result in its own column,
# so the original "title" column stays available for display.
movies["clean_title"] = movies["title"].map(clean_title)
movies.head()

# This adds a new column called "clean_title" by applying the clean_title function
# to the existing "title" column.


Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [5]:
# Load the per-user ratings (path is relative to the notebook's working directory).
# find_similar_movies filters on its `userId`, `movieId` and `rating` columns.
ratings = pd.read_csv("ratings.csv")
# Preview the first rows as the cell output.
ratings.head()

# This loads ratings.csv into a DataFrame called "ratings" that stores the users' rating data.

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:

vectorizer = TfidfVectorizer(ngram_range = (1,2)) 
# ngram_range=(1, 2) makes the vectorizer index both single words (unigrams) and
# pairs of consecutive words (bigrams), so word order within a title carries weight.
tfidf = vectorizer.fit_transform(movies['clean_title'])
# fit_transform learns the vocabulary from the cleaned titles and encodes them
# as a TF-IDF matrix (one row per title) that search() compares queries against.

In [7]:
def search(title, top_n=5):
    """Return the catalogue rows whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is
    # out of bounds (e.g. a catalogue with fewer than five titles).
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only isolates the top_n scores; it does not order them.
    # Sort that small slice explicitly so the first row really is the best match.
    indices = np.argpartition(similarity, -top_n)[-top_n:]
    order = np.argsort(-similarity[indices], kind="stable")
    return movies.iloc[indices[order]]

# The search function takes a movie title as input. It cleans the title using the clean_title function. 
# Then, it transforms the cleaned title into a query vector using the vectorizer's transform method. 
# The cosine similarity between the query vector and all the movie vectors in the tfidf matrix 
# is calculated. The indices of the top 5 most similar movies are extracted using argpartition, 
# and the corresponding movies are returned as the search results.

In [8]:
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who rated ``movie_id`` above 4.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows with columns ``score``, ``title`` and ``genres``,
        sorted by descending ``score``. The score is the share of similar
        users who rated a movie above 4 divided by the same share among all
        users who rated any candidate movie above 4.
    """
    liked = ratings["rating"] > 4

    # Users who rated the query movie above 4, and everything else they rated above 4.
    fans = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()
    fan_likes = ratings.loc[ratings["userId"].isin(fans) & liked, "movieId"]

    # Share of those users who liked each movie; keep only movies above 10%.
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]

    # Same share, measured over every user who liked at least one candidate movie.
    candidate_likes = ratings[ratings["movieId"].isin(fan_share.index) & liked]
    overall_share = candidate_likes["movieId"].value_counts() / len(candidate_likes["userId"].unique())

    comparison = pd.concat([fan_share, overall_share], axis=1)
    comparison.columns = ["similar", "all"]
    comparison["score"] = comparison["similar"] / comparison["all"]

    top_ten = comparison.sort_values("score", ascending=False).head(10)
    labelled = top_ten.merge(movies, left_index=True, right_on="movieId")
    return labelled[["score", "title", "genres"]]


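# This function takes a movie_id as input. It first identifies the users
# who have rated the given movie highly (rating > 4) and extracts their unique user IDs.
# Then, it selects all the movies that these similar users have rated highly.
# These movies are stored in the similar_user_recs variable, converted to the share of
# similar users who liked each one, and kept only if that share exceeds 10%.
# The same share is then computed among all users who rated any of those movies highly,
# and each movie's score is the ratio of the two shares. The ten highest-scoring movies
# are returned with their score, title and genres.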

In [9]:

# Text box the user types a movie title into; it starts pre-filled with "Toy Story".
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)
# Output area that on_type clears and redraws each time the title changes.
recomendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendation list after the title text changes.

    Parameters
    ----------
    data : mapping
        Change notification from the text widget; the newly typed text is
        read from ``data["new"]``.

    The output area is always cleared. Recommendations are only shown once
    the text is longer than five characters; the first row returned by
    ``search`` is treated as the movie the user means.
    """
    with recomendation_list:
        recomendation_list.clear_output()
        typed_title = data["new"]
        # Too little text to search on: leave the output area empty.
        if len(typed_title) <= 5:
            return
        best_match = search(typed_title).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the text box's `value` trait changes.
movie_name_input.observe(on_type, names = 'value')

# Display the user interface: the title input field followed by the recommendation list.
display(movie_name_input, recomendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()