In [1]:
# Importing the libraries

import numpy as np
import pandas as pd
import re

import ipywidgets as widgets
from IPython.display import display

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the movie catalogue from movies.csv in the working directory;
# later cells read its movieId, title and genres columns
movies = pd.read_csv("movies.csv")
# Bare last expression so the notebook renders the frame as this cell's output
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:
# This function strips every character that is not an ASCII letter, a digit or a space from the title
def clean_title(title):
    """Return ``title`` with only ASCII letters, digits and spaces kept.

    Punctuation, symbols and non-ASCII characters are deleted outright
    (not replaced), so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Add a "clean_title" column holding the cleaned form of every title
movies["clean_title"] = movies["title"].map(clean_title)

# Fit a TF-IDF model on single words and adjacent word pairs of the cleaned
# titles; the fitted vectorizer and the resulting matrix are both kept because
# search() needs them to score a query against every title
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

# This function finds the best matches for a given movie title
def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return. It is clamped to the number of
        rows available, so a small catalogue does not raise.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered from the most to the least
        similar title, so ``.iloc[0]`` is the best match.
    """
    # Clean the query the same way the indexed titles were cleaned, then vectorize it
    query_vec = vectorizer.transform([clean_title(title)])
    # Similarity of the query against every title in the catalogue
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises when asked for more elements than exist
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest scores; their relative order is unspecified
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # Rank the candidates explicitly, highest similarity first
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

In [8]:
# Load the per-user ratings from ratings.csv in the working directory;
# later cells rely on its userId, movieId and rating columns
ratings = pd.read_csv("ratings.csv")
# Bare last expression so the notebook renders the frame as this cell's output
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [9]:
# This function gives a list of up to 10 recommendations for a given movie ID
def find_similar_movies(movie_id):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than 4. Each
    candidate movie is scored as the share of similar users who liked it
    divided by the share of the comparison population who liked it, so
    movies that are specifically popular among similar users rank highest.

    Parameters
    ----------
    movie_id : int
        Value from the ``movieId`` column identifying the reference movie.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows with columns ``score``, ``title`` and ``genres``,
        ordered by descending score. Empty when nobody rated ``movie_id``
        above 4.
    """
    # Every step below only looks at ratings above 4, so filter once
    liked = ratings[ratings["rating"] > 4]

    # Users who rated the requested movie above 4
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No similar users means there is nothing to base a recommendation on
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Movies those users rated above 4, as a fraction of the similar users
    similar_users_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    # Keep only movies liked by more than 10% of the similar users
    similar_users_recs = similar_users_recs[similar_users_recs > .1]

    # Likes of those candidate movies across all users. The denominator is the
    # number of distinct users who liked at least one candidate, not the total
    # number of users in the ratings file
    all_users = liked[liked["movieId"].isin(similar_users_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Put both rates side by side, aligned on movieId
    rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the movie is liked more often by similar users
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    # Highest score first
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Keep the 10 best and attach title and genres from the movies frame
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [6]:
# Text box in which the user types a movie title
movie_input = widgets.Text(description = "Movie Title: ", disabled = False)
# Output area into which the recommendations are rendered
recommended_movies = widgets.Output()

# This function displays the recommendations for the text typed in the input widget
def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed by the widget observer; only
    its ``"new"`` entry (the current text) is read. The output area is always
    cleared first, and nothing is shown until the text is longer than
    5 characters.
    """
    with recommended_movies:
        # Drop whatever was rendered for the previous keystroke
        recommended_movies.clear_output()
        typed_title = data["new"]

        # Too short to search on yet: leave the output area empty
        if len(typed_title) <= 5:
            return

        # Take the first row returned by search() as the reference movie
        reference_movie = search(typed_title).iloc[0]
        # Show the recommendations for that movie's ID
        display(find_similar_movies(reference_movie["movieId"]))
            
# Call on_type whenever the text box's "value" trait changes
movie_input.observe(on_type, names ="value")
# Render the text box followed by the output area
display(movie_input,recommended_movies)

Text(value='', description='Movie Title: ')

Output()