In [None]:
from pathlib import Path
import pandas as pd

file = Path("../MovieSearch/movies.csv")
movies = pd.read_csv(file)
movies.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


The data look good: no nulls and correct typing.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# init tf-idf, cleanse stop works (unlikely, but why not) and fit the vector to genre
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(movies["genres"])


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# calc similarity scores
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# map movie title: index & remove duplicates
movie_indices = pd.Series(movies.index, index=movies["title"]).drop_duplicates()


In [4]:
from thefuzz import process

# I made too many typo's, so I pulled in a fuzzy matching algo
def find_closest_movie(title, movie_list): # pass user input and set of movies to fn
    closest_match = process.extractOne(title, movie_list)
    if closest_match[1] > 70:  # arbitrary 70% confidence threshold
        return closest_match[0]
    else:
        return None

def recommend_movies(title, cosine_sim=cosine_sim): # input title and similarities
    corrected_title = find_closest_movie(title, movies["title"]) # deploy matching
    if not corrected_title: # Reject incorrect. This needs a loop, but this is already extra.
        return "Movie not found. Try another title."
    print(f"Recommending for: {corrected_title}?")

    idx = movie_indices[corrected_title] # pull index for title
    sim_scores = list(enumerate(cosine_sim[idx])) # get similarity scores for title
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sort for similiarity (desc)
    sim_scores = sim_scores[1:11] # store top-10 most similar (excluding itself)
    movie_indices_recommended = [i[0] for i in sim_scores] # store top 10 indices and return titles
    return movies["title"].iloc[movie_indices_recommended]
    

In [5]:
# Let's test it with some gibberish
user_input = input("Enter a movie title: ")
print(recommend_movies(user_input))


Enter a movie title:  flibbertigibbet


Movie not found. Try another title.


In [6]:
# OK, now with a better word
user_input = input("Enter a movie title: ")
print(recommend_movies(user_input))

Enter a movie title:  Blue


Recommending for: Blue in the Face (1995)?
67                               Big Bully (1996)
74                Antonia's Line (Antonia) (1995)
85                  In the Bleak Midwinter (1995)
94     Nobody Loves Me (Keiner liebt mich) (1994)
129                       Blue in the Face (1995)
143                                Jeffrey (1995)
150                   Love & Human Remains (1993)
164                                  Smoke (1995)
173                        Unstrung Heroes (1995)
186                       Boys on the Side (1995)
Name: title, dtype: object
