In [104]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import ipywidgets as widgets
from IPython.display import display

# Load the movie catalogue. The path is relative to the notebook's working
# directory (presumably the MovieLens 25M dump, going by the folder name).

movies_path = "data/ml-25m/movies.csv"

# Later cells read the "title", "movieId" and "genres" columns of this frame.
movies = pd.read_csv(movies_path)

In [105]:
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    title : str
        Raw movie title or search text.

    Returns
    -------
    str
        ``title`` with all other characters (punctuation, accented letters,
        symbols, ...) removed; remaining characters keep their order and case.
    """
    # Negated character class: anything outside a-z, A-Z, 0-9 and space matches.
    unwanted = re.compile("[^a-zA-Z0-9 ]")
    return unwanted.sub("", title)

In [106]:
# Add a normalised copy of each title as a new "clean_title" column on `movies`
# (in place); the TF-IDF index below is built from this column.
movies['clean_title'] = movies["title"].apply(clean_title)

#build the search engine

In [107]:
# Build a TF-IDF (term frequency x inverse document frequency) representation
# of the cleaned titles, so that every movie title is described by one vector.
# ngram_range=(1,2) makes the vocabulary cover single words and two-word phrases.

vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vocabulary on the cleaned titles and keep the resulting matrix
# (one row per row of `movies`); `search` compares queries against it.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [108]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match a free-text query.

    The query is normalised with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Raw search text.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from the most similar title
        to the least similar one (so ``.iloc[0]`` is the best match).
    """
    title = clean_title(title)
    # Transform the search term into a vector in the same TF-IDF space.
    query_vec = vectorizer.transform([title])
    # One similarity score per movie, flattened to a 1-D array.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than there are movies.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition must pivot at -top_n for the last `top_n` slots to hold the
    # largest scores; pivoting at -1 only guarantees the single best score.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # The partitioned slice is unordered, so rank it explicitly (descending).
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

In [109]:
#Recommendation system

# Load the user ratings that drive the recommendations. The path is relative to
# the notebook's working directory; later cells read the "userId", "movieId"
# and "rating" columns of this frame.
ratings_path = 'data/ml-25m/ratings.csv'
ratings = pd.read_csv(ratings_path)

In [110]:
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who loved ``movie_id``.

    Steps:

    1. "Fans" are the users who rated ``movie_id`` at least 5.
    2. For every movie, compute the share of fans who rated it above 4 and keep
       the movies where that share exceeds 10%.
    3. For those movies, compute the share of *all* users (among users who
       rated any of them above 4) who rated each one above 4.
    4. ``score`` is the fan share divided by the general share, so movies that
       are disproportionately popular among fans rank first.

    Note that the fan filter (``>= 5``) is stricter than the "liked" filter
    (``> 4``) used everywhere else.

    Parameters
    ----------
    movie_id
        Value to look up in the ``movieId`` column of ``ratings``.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows with the columns ``score``, ``title`` and ``genres``,
        sorted by ``score`` in descending order.
    """
    liked = ratings["rating"] > 4

    # Users who gave the reference movie a top rating.
    is_fan_rating = (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)
    fan_ids = ratings.loc[is_fan_rating, "userId"].unique()

    # Share of fans that liked each movie; keep only those above 10%.
    fan_likes = ratings.loc[ratings["userId"].isin(fan_ids) & liked, "movieId"]
    fan_share = fan_likes.value_counts() / len(fan_ids)
    fan_share = fan_share[fan_share > .1]

    # Share of the general audience that liked those same movies.
    general_likes = ratings.loc[ratings["movieId"].isin(fan_share.index) & liked]
    audience_size = len(general_likes["userId"].unique())
    general_share = general_likes["movieId"].value_counts() / audience_size

    # Combine both shares side by side and derive the recommendation score.
    scored = pd.concat([fan_share, general_share], axis=1)
    scored.columns = ["similar", "all"]
    scored["score"] = scored["similar"] / scored["all"]

    best = scored.sort_values("score", ascending=False).head(10)
    with_titles = best.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]

In [111]:
#Build interactive Jupyter Notebook Widget

# Text box the user types a movie title into; it starts out showing "Toy Story".
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title: ",
    disabled = False
)

# Output area that on_type clears and then renders the recommendations into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendation output in response to a text-box change.

    Parameters
    ----------
    data : mapping
        Change notification; only its ``"new"`` entry (the current text) is read.

    The output area is always cleared first. Nothing more is shown unless the
    text is longer than 5 characters; otherwise the top ``search`` hit is looked
    up and its ``find_similar_movies`` result is displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Too short to search on: leave the output area empty.
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the text box's `value` changes.
movie_name_input.observe(on_type,names = 'value')

# Show the text box followed by the recommendation output area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

Next Steps
- Improve the quality of the recommendations
- Add an input box for showing genres, to be used as a filter or as a recommendation driver
- Use metadata from the other dataset files to improve the recommendations