In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load the movie table from the working directory; later cells read its
# movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [3]:
# Load the link table from the working directory; only its movieId and tmdbId
# columns are used by the rest of the notebook.
movie_links = pd.read_csv("links.csv")

In [4]:
# Cast tmdbId to pandas' nullable integer dtype ('Int64') so the IDs remain
# integers even when some rows have no value.
movie_links['tmdbId'] = movie_links['tmdbId'].astype('Int64')

In [5]:
# Attach tmdbId to each movie with a left join on movieId: every row of
# `movies` is kept, and movies with no link row get a missing tmdbId.
# NOTE: this rebinds `movies`, so re-running the cell without reloading
# movies.csv merges against a frame that already carries a tmdbId column.
movies = movies.merge(
    movie_links[['movieId', 'tmdbId']],
    how="left",
    on="movieId",
)

In [6]:
def clean_title(title):
    """Return ``title`` with every character other than ASCII letters,
    digits and spaces removed.

    Punctuation and accented/non-ASCII characters are dropped outright (not
    replaced), so ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [7]:
# TF-IDF vectorizer over word unigrams and bigrams (ngram_range=(1, 2)), so
# pairs of adjacent words become features in addition to single words.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [8]:
# Build the text that gets vectorized: the punctuation-free title followed by
# the genre string, separated by a single space.
movies["clean_title"] = movies["title"].map(clean_title)
movies["combined_features"] = movies["clean_title"].str.cat(movies["genres"], sep=' ')

In [9]:
# Fit the vectorizer on every movie's combined text and keep the resulting
# TF-IDF matrix; search() compares query vectors against it.
tfidf = vectorizer.fit_transform(movies["combined_features"])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose combined title/genre text best matches ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer``, and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query, typically a (partial) movie title.
    top_n : int, default 5
        Maximum number of rows to return. Clamped to the number of movies
        available; a value of 0 or less yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is out
    # of bounds for the array.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; their relative order is unspecified, so rank them
    # explicitly before returning.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    results = movies.iloc[ranked]

    return results


In [11]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query; search results are rendered into
# the Output widget shown beneath it.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result list whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the current text. The previous results are always cleared.
    If the query contains nothing searchable once cleaned (empty, whitespace
    or punctuation only) no search is run, because such a query matches no
    terms and the rows returned would be arbitrary.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if not clean_title(title).strip():
            return
        display(search(title))

# Only changes to the `value` trait trigger the callback.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
import pickle

# Persist the fitted vectorizer and the TF-IDF matrix so they can be reloaded
# without refitting. The `with` blocks close each file (flushing it to disk)
# as soon as the dump finishes instead of leaving the handle open.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)