In [2]:
import pandas as pd
import numpy as np
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display


In [3]:
import os

# Walk the dataset directory and print the path of every CSV file found.
for root, _subdirs, names in os.walk('./ml-25m/ml-25m/'):
    csv_names = (name for name in names if name.endswith(".csv"))
    for csv_name in csv_names:
        print(os.path.join(root, csv_name))


./ml-25m/ml-25m/genome-scores.csv
./ml-25m/ml-25m/genome-tags.csv
./ml-25m/ml-25m/links.csv
./ml-25m/ml-25m/movies.csv
./ml-25m/ml-25m/ratings.csv
./ml-25m/ml-25m/tags.csv


In [5]:
# Load the movie catalogue. Forward slashes are used because the previous
# literal relied on "\m", which is an invalid escape sequence in a normal
# string, and because backslash separators only resolve on Windows. This also
# matches the path style used by the directory walk earlier in the notebook.
movies = pd.read_csv("./ml-25m/ml-25m/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [6]:
def clean_title(title):
    """Return `title` with every character outside a-z, A-Z and 0-9 replaced by a space.

    Each offending character is replaced one-for-one, so the result has the
    same length as the input and runs of punctuation become runs of spaces.
    """
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", title)

In [7]:
# Add a normalised copy of each title (see clean_title) as a new column;
# this is the text the TF-IDF vectorizer is fitted on.
movies["title_clean"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,title_clean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [8]:
# Build TF-IDF features over the cleaned titles using both single words and
# two-word sequences (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)
# Learn the vocabulary from the titles and keep the resulting document-term
# matrix; search() compares query vectors against it.
tfidf = vectorizer.fit_transform(movies["title_clean"])

In [9]:
def search(title, top_n=5):
    """Return the rows of `movies` whose cleaned titles best match `title`.

    The query is normalised with `clean_title`, vectorised with the fitted
    module-level `vectorizer`, and compared against the module-level `tfidf`
    matrix using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of rows to return. Values larger than the number of
        rows in `tfidf` are clamped; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies`, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp so np.argpartition never receives an out-of-bounds k on a small corpus.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only selects the top_n largest scores; the order inside that
    # slice is undefined, so the candidates are explicitly sorted by score
    # (descending) instead of relying on a plain reversal.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    results = movies.iloc[candidates[ranking]]

    return results

In [11]:
# Text box the user types a movie title into (pre-filled with "Toy Story").
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that on_type() renders the search results into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the results area in response to a change event from the text box.

    `data` is the change payload passed by the widget observer; its "new"
    entry holds the current text. Previous results are always cleared, and a
    search is only run once the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))
            
            
# Call on_type whenever the text box's "value" trait changes.
movie_input.observe(on_type, names="value")

# Render the input box followed by the results area.
display(movie_input, movie_list)


    

Text(value='Toy Story', description='Movie Title:')

Output()