In [23]:
import numpy as np
import os
import pandas as pd
import sys
sys.path.append("../../backend/database") 
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from db_connection import get_db_connection
from sqlalchemy import create_engine

from dotenv import load_dotenv
from pathlib import Path

# Needed only by this cell to escape credentials placed in the database URL.
from urllib.parse import quote

# Load environment variables from the .env file (path is relative to the
# notebook's working directory).
dotenv_path = Path(".env")
load_dotenv(dotenv_path=dotenv_path)

# Fail fast: check the project connection helper before doing anything else.
# NOTE(review): `conn` is not used again in this cell; it is left open in case
# later cells rely on it — confirm and close it if not.
conn = get_db_connection()
if conn is None:
    raise Exception("Failed to connect to database.")

# Read variables
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# An unset variable would otherwise be formatted into the URL as the literal
# text "None", producing a confusing failure much later.
_required_settings = {
    "DB_USER": DB_USER,
    "DB_HOST": DB_HOST,
    "DB_PORT": DB_PORT,
    "DB_NAME": DB_NAME,
}
_missing_settings = [name for name, value in _required_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' inside
# them cannot be mistaken for URL delimiters. The password part is omitted
# entirely when DB_PASSWORD is unset.
_credentials = quote(DB_USER, safe="")
if DB_PASSWORD is not None:
    _credentials += ":" + quote(DB_PASSWORD, safe="")

engine = create_engine(f'postgresql://{_credentials}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Query only movies that have a non-blank overview (some don't).
# The inner joins on "MovieGenres" and genres also mean that only movies with
# at least one genre row are returned.
# Genres are aggregated into one space-separated string per movie.
# Both ORDER BY clauses make the result deterministic: the TF-IDF matrix built
# from this frame is addressed by row position, so row order must be stable
# between runs.
query = """
SELECT m.movie_id AS movie_id, m.title, m.overview,
       STRING_AGG(g.name, ' ' ORDER BY g.name) AS genre
FROM movies m
JOIN "MovieGenres" gm ON m.movie_id = gm.movie_id
JOIN genres g ON gm.genre_id = g.genre_id
WHERE m.overview IS NOT NULL
AND TRIM(m.overview) != ''
GROUP BY m.movie_id, m.title, m.overview
ORDER BY m.movie_id
"""

movies_df = pd.read_sql(query, engine)

print(f"Loaded  {len(movies_df)} movies with overviews")
print(movies_df.head())

    




Loaded  3606 movies with overviews
   movie_id                        title  \
0         1                    Toy Story   
1         2                      Jumanji   
2         3             Grumpier Old Men   
3         4            Waiting to Exhale   
4         5  Father of the Bride Part II   

                                            overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  When siblings Judy and Peter discover an encha...   
2  A family wedding reignites the ancient feud be...   
3  Cheated on, mistreated and stepped on, the wom...   
4  Just when George Banks has recovered from his ...   

                          genre  
0   Animation Children's Comedy  
1  Adventure Children's Fantasy  
2                Comedy Romance  
3                  Comedy Drama  
4                        Comedy  


In [None]:
def compute_tfidf(corpus, stop_words='english'):
    """Fit a TF-IDF vectorizer on ``corpus`` and vectorize it in one step.

    Args:
        corpus: Iterable of text documents, one per movie.
        stop_words: Passed straight through to ``TfidfVectorizer``; the
            default ``'english'`` drops common English words such as
            "the", "and" and "of".

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple. ``tfidf_matrix`` is whatever
        ``fit_transform`` returns (one row per document, one column per
        vocabulary term); ``vectorizer`` is the fitted ``TfidfVectorizer``,
        which can be reused to inspect feature names or transform new text.
    """
    fitted_vectorizer = TfidfVectorizer(stop_words=stop_words)
    # Learn the vocabulary / IDF weights and produce the document vectors.
    document_vectors = fitted_vectorizer.fit_transform(corpus)
    return document_vectors, fitted_vectorizer

def get_top_n_recommendations(tfidf_matrix, movie_index, movies_df, n=10):
    """Return the ``n`` movies most similar to the movie at ``movie_index``.

    Similarity is the cosine similarity between the query movie's TF-IDF
    vector and every row of ``tfidf_matrix``. Only that single row of
    similarities is computed, not the full movie-by-movie matrix.

    The query movie is excluded by its position rather than by assuming it
    sorts first. That assumption fails when another movie has an identical
    vector (tie at the top) or when the query vector is all zeros (every
    similarity is 0).

    Args:
        tfidf_matrix: 2-D matrix of TF-IDF vectors, one row per movie, in the
            same row order as ``movies_df``.
        movie_index: Row position of the query movie. Negative values count
            from the end, as with normal Python indexing.
        movies_df: DataFrame whose rows line up positionally with
            ``tfidf_matrix``.
        n: Maximum number of recommendations to return. Values below zero are
            treated as zero.

    Returns:
        The rows of ``movies_df`` for the top matches, ordered from most to
        least similar. Ties keep their original row order.

    Raises:
        IndexError: If ``movie_index`` is outside the matrix.
    """
    # Accept plain nested lists as well as numpy / scipy matrices.
    if not hasattr(tfidf_matrix, "shape"):
        tfidf_matrix = np.asarray(tfidf_matrix)

    n_movies = tfidf_matrix.shape[0]
    if not -n_movies <= movie_index < n_movies:
        raise IndexError(
            f"movie_index {movie_index} is out of range for {n_movies} movies"
        )
    # Normalize negative positions so the self-exclusion below compares correctly.
    query_pos = movie_index % n_movies

    # List-indexing keeps the result 2-D (1 x n_terms) for dense and sparse input.
    query_vector = tfidf_matrix[[query_pos]]
    # Shape (1, n_movies): similarity of the query movie to every movie.
    sim_row = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Stable descending sort by similarity value.
    ranked = sorted(range(n_movies), key=lambda i: sim_row[i], reverse=True)
    top_indices = [i for i in ranked if i != query_pos][: max(n, 0)]

    # Positional lookup: row i of the matrix is row i of the DataFrame.
    return movies_df.iloc[top_indices]

# Keep a lowercased copy of every overview as its own column.
# TfidfVectorizer lowercases its input by default, so this step mainly makes
# the normalized text visible in the DataFrame.
movies_df['clean_overview'] = movies_df['overview'].str.lower()

# Fit TF-IDF on the overviews; row i of the matrix belongs to movies_df.iloc[i].
overview_corpus = movies_df['clean_overview'].tolist()
tfidf_matrix, vectorizer = compute_tfidf(overview_corpus)

# The first movie in the frame is the query.
query_movie = movies_df.iloc[0]
recommendations = get_top_n_recommendations(tfidf_matrix, movie_index=0, movies_df=movies_df)

print('Recommendations for: ', query_movie["title"])
print(recommendations[['title']])


Recommendations for:  Toy Story
                      title
2831            Toy Story 2
1017  Rebel Without a Cause
1837              Condorman
2887        Man on the Moon
455                  Malice
418       For Love or Money
3063        Bound for Glory
1792           Child's Play
1793         Child's Play 2
1794         Child's Play 3


In [26]:
import pickle

# Directory that receives the serialized TF-IDF artifacts.
# It can be overridden with the ARTIFACTS_DIR environment variable. The default
# is relative to the notebook's working directory, the same assumption the
# "../../backend/database" sys.path entry already makes.
# NOTE(review): confirm "../../backend/artifacts" is the directory the backend
# loads from.
ARTIFACTS_DIR = Path(os.getenv("ARTIFACTS_DIR", "../../backend/artifacts"))
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# The fitted vectorizer, so new text can be transformed into the same feature space.
with open(ARTIFACTS_DIR / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# The TF-IDF matrix. Its rows are positional: row i corresponds to the i-th
# row of the movies_df it was built from, so any consumer must load movies in
# the same order.
with open(ARTIFACTS_DIR / "tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)