In [2]:
# Dan's Movie Search Engine
import pandas as pa

# Load the movie catalogue from the working directory; the preview below shows
# that it carries movieId, title and genres columns.
movies = pa.read_csv("movies.csv")

In [3]:
# Preview the first rows to confirm the file loaded as expected.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Inspect the dtype of every column in the loaded table.
movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [10]:
import re

# Normalise a title for matching: every character that is not an ASCII letter, a digit or a
# space is dropped, so parentheses, punctuation and accented letters all disappear.
def cleanTitle(title):
    """Return ``title`` with everything except ``a-z``, ``A-Z``, ``0-9`` and spaces removed."""
    unwanted = re.compile("[^a-zA-Z0-9 ]")
    return unwanted.sub("", title)

In [11]:
# Store a normalised copy of every title in a new "cleanTitle" column (this adds the column to
# `movies` in place); the TF-IDF index is fitted on this column.
movies["cleanTitle"] = movies["title"].apply(cleanTitle) 

In [12]:
# Show the table with the new cleanTitle column (the notebook truncates the display of the full frame).
movies

Unnamed: 0,movieId,title,genres,cleanTitle
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [14]:
# Build a term frequency–inverse document frequency (TF-IDF) matrix over the cleaned titles.
# ngram_range=(1, 2) indexes single words as well as two-word sequences, which helps match similar titles.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
# The vectorizer splits each title into terms so it can be compared with the terms of a search query.

tfidf = vectorizer.fit_transform(movies["cleanTitle"]) # one row of term weights per title; search queries are compared against it

In [18]:
# We use cosine similarity to rank titles against the search text: the query is converted into a
# TF-IDF weighted vector and compared with the vector of every title, so the titles that share the
# most distinctive terms with the query score highest.
# (This works for release years too, because the year is kept in the cleaned title.)

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``cleanTitle``, turned into a TF-IDF vector
    with the fitted ``vectorizer`` and compared against every row of ``tfidf``
    using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text search string.
    top_n : int, default 5
        Maximum number of rows to return. Fewer rows are returned when the
        index holds fewer than ``top_n`` titles.

    Returns
    -------
    DataFrame
        Rows of ``movies`` ordered from most to least similar, so
        ``.iloc[0]`` is the best match.
    """
    title = cleanTitle(title)  # same normalisation the index was built with
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per title

    # np.argpartition raises when asked for more elements than exist, so clamp.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # largest scores, in no particular order; they must be sorted explicitly
    # for the first returned row to be the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]
    

In [16]:
# One-time setup for the interactive widgets below (run in a terminal if they do not render):
#   pip install ipywidgets
#   jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [20]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the search query, pre-filled with an example title.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results shown in ``movie_list`` for the typed text.

    ``data`` is the change notification passed by the widget observer; only
    its ``"new"`` entry (the current text) is read. Nothing is searched until
    the text is longer than three characters.
    """
    with movie_list:
        # Always wipe the previous results, even when the new text is too short.
        movie_list.clear_output()
        query = data["new"]
        if not len(query) > 3:
            return
        display(search(query))

# Call on_type every time the text box's value changes.
movie_input.observe(on_type, names='value')


# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [31]:
# Example input for the recommender: movieId 1 is "Toy Story (1995)" in the catalogue preview.
movie_id = 1
# Load the user ratings; the recommender reads its userId, movieId and rating columns.
ratings = pa.read_csv("ratings.csv")
# Catalogue row(s) for the example movie.
movie = movies[movies["movieId"] == movie_id]

In [32]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` are the "similar" users.
    Every movie liked by more than ``min_share`` of them is a candidate, and
    each candidate is scored by how much more popular it is among the similar
    users than among all users who liked any candidate.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings strictly above this value are treated as likes.
    min_share : float, default 0.10
        Minimum fraction of similar users that must like a movie for it to
        be considered.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
    """
    # Filter to liked ratings once; every step below works on this subset.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    # Share of similar users that liked each movie.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (who liked at least one candidate) that liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pa.concat(
        [similar_user_recs, all_user_recs], axis=1, keys=["similar", "all"]
    )
    # A score above 1 means the movie is liked disproportionately by the similar users.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [34]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to base recommendations on, pre-filled with an example.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the first search hit of the typed title.

    ``data`` is the change notification passed by the widget observer; only
    its ``"new"`` entry (the current text) is read. The first row returned by
    ``search`` is taken as the matched movie and its ``movieId`` is handed to
    ``find_similar_movies``. Nothing happens until the text is longer than
    five characters.
    """
    with recommendation_list:
        # Clear stale recommendations before deciding whether to show new ones.
        recommendation_list.clear_output()
        typed = data["new"]
        if not len(typed) > 5:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# Call on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()