In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import ast

import nltk
# Fetch the NLTK resources used later in this notebook: the English stop-word
# list (stopwords.words('english')) and the WordNet data backing
# WordNetLemmatizer. Packages that are already installed are simply reported
# as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# import data with necessary columns
movies = pd.read_csv("data/tmdb_500_movies.csv")
credits = pd.read_csv("data/tmdb_500_credits.csv")[['movie_id', 'cast', 'crew']]
movies_credits = movies.merge(credits, left_on='id', right_on='movie_id', how='inner')[
    ['genres', 'keywords', 'overview', 'original_title', 'tagline', 'title', 'cast']
]

# make sure all the columns have consistent data type
cols_to_clean = ['genres', 'keywords', 'cast']


def convert(x):
    """Turn a stringified list of dicts into a comma-separated list of names.

    Args:
        x: Cell value holding the text of a Python literal list of dicts,
            e.g. ``'[{"id": 1, "name": "Drama"}]``.

    Returns:
        The ``"name"`` values joined with ``", "``. Dicts without a ``"name"``
        key are skipped. A non-string cell (e.g. NaN for a missing value)
        yields an empty string instead of failing inside ``ast.literal_eval``.
    """
    if not isinstance(x, str):
        return ""
    return ", ".join(dct["name"] for dct in ast.literal_eval(x) if 'name' in dct)


for col in cols_to_clean:
    movies_credits[col] = movies_credits[col].apply(convert)

#check for nan values
print(movies_credits.isna().sum())

#fill nan values
# Reassign rather than mutate in place so that re-running the cell is explicit
# about which frame is being replaced.
movies_credits = movies_credits.fillna('')

genres             0
keywords           0
overview           0
original_title     0
tagline           13
title              0
cast               0
dtype: int64


In [3]:
#combine columns for richer context
def enrichment(df):
    """Build one normalised text document per row of ``df``.

    Each row's values are joined with ", " and lower-cased. Punctuation (any
    character that is neither a word character nor whitespace) is then removed,
    English stop words are dropped and the remaining tokens are lemmatized.

    Args:
        df: DataFrame whose cells are all strings (``str.join`` is applied to
            every row).

    Returns:
        The result of applying the normalisation row by row: one
        space-separated string of lemmatized tokens per row of ``df``.
    """
    joined_rows = df.apply(lambda row: ", ".join(row).lower(), axis=1)
    wordnet = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))

    def normalise(text):
        # Strip punctuation first so tokens compare cleanly against the stop list.
        tokens = re.sub(r'[^\w\s]', '', text).split()
        kept = [wordnet.lemmatize(token) for token in tokens if token not in english_stops]
        return " ".join(kept)

    return joined_rows.apply(normalise)

#vectorize the summary
def vectorize(context):
    """Fit a TF-IDF model on ``context`` and encode it.

    Args:
        context: Iterable of text documents, one per movie.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted ``TfidfVectorizer``
        (configured with scikit-learn's English stop-word list) and the matrix
        obtained by transforming ``context`` with it.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    encoded_corpus = tfidf.fit_transform(context)
    return tfidf, encoded_corpus

#compute similarity
def compute_similarity(query, vectorizer, tfidf_matrix):
    """Score every document in ``tfidf_matrix`` against a free-text query.

    The query is lower-cased, stripped of punctuation, filtered of
    conversational filler and English stop words, lemmatized, and then encoded
    with the already-fitted ``vectorizer``.

    Args:
        query: Raw user request, e.g. ``"I want family movies"``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Matrix produced by that vectorizer for the corpus.

    Returns:
        1-D array of cosine similarities, one entry per row of
        ``tfidf_matrix``.
    """
    # Conversational filler that says nothing about the kind of film wanted.
    filler_words = {'i', 'want', 'like', 'movie', 'movies', 'love'}
    lemmatizer = WordNetLemmatizer()
    ignored = filler_words | set(stopwords.words('english'))

    # Strip punctuation *before* filtering so that tokens such as "movies." or
    # "movie," are recognised as filler instead of slipping into the query.
    tokens = re.sub(r'[^\w\s]', '', query.lower()).split()
    cleaned_query = " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in ignored
    )

    query_vec = vectorizer.transform([cleaned_query])
    return cosine_similarity(query_vec, tfidf_matrix).flatten()

#recommend the top 3 movies most similar to the user's query
def recommend_movies(df, query=None, top_n=3):
    """Recommend the movies whose combined text best matches a query.

    Args:
        df: DataFrame of string columns describing the movies; must contain a
            ``'title'`` column.
        query: Free-text request. When ``None`` (the default) the user is
            prompted with ``input()``, as before; passing it explicitly makes
            the call reproducible on re-run.
        top_n: Maximum number of recommendations to return (default 3).

    Returns:
        List of ``{'title': ..., 'similarity': ...}`` dicts ordered from most
        to least similar. Movies with zero similarity are omitted, so the list
        may be shorter than ``top_n`` (or empty) when nothing matches.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    summary = enrichment(df)
    if query is None:
        query = input("What kind of movie would you like to watch?: ")
    vectorizer, tfidf_matrix = vectorize(summary)
    similarity = compute_similarity(query, vectorizer, tfidf_matrix)

    # Best matches first. Reversing before slicing keeps top_n == 0 meaning
    # "no results" (a [-0:] slice would return everything).
    top_indices = similarity.argsort()[::-1][:top_n]

    # A score of 0 means the query shares no term with the movie; returning
    # such rows would present arbitrary titles as recommendations.
    return [
        {'title': df['title'].iloc[idx], 'similarity': similarity[idx]}
        for idx in top_indices
        if similarity[idx] > 0
    ]

In [5]:
# Give movie recommendations: recommend_movies prompts for a free-text query
# and the list it returns is displayed as this cell's output.
recommend_movies(movies_credits)

What kind of movie would you like to watch?:  I want family movies


[{'title': 'The Croods', 'similarity': 0.1782824968450518},
 {'title': 'Stuart Little', 'similarity': 0.16943315929344252},
 {'title': 'Stuart Little 2', 'similarity': 0.1333837325892032}]