In [1]:
from dotenv import load_dotenv
# Loaded before the remaining imports so values from .env are already in the
# environment if any of those libraries read configuration at import time.
load_dotenv()
import pymongo
import os
from urllib.parse import quote_plus
import requests
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm

In [2]:
# Credentials and tokens read from the process environment; each is None when unset
mongodbpassword, mongodbuser, hf_token = (
    os.environ.get(name)
    for name in ('MONGODB_PASSWORD', 'MONGODB_USER', 'HUGGINGFACE_TOKEN')
)

In [3]:
# DB connection
# Fail early with a clear message instead of building a URI containing "None".
if not mongodbuser or not mongodbpassword:
    raise RuntimeError(
        "MONGODB_USER and MONGODB_PASSWORD must be set in the environment (e.g. via .env)"
    )

# Username and password must be percent-escaped before being embedded in a
# MongoDB URI; otherwise reserved characters such as '@', ':' or '/' in the
# credentials corrupt the connection string.
client = pymongo.MongoClient(
    f"mongodb+srv://{quote_plus(mongodbuser)}:{quote_plus(mongodbpassword)}"
    "@cluster0.2wxhtzz.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)
db = client.sample_mflix
collection = db.movies

In [4]:
# Embedding model, loaded through the sentence-transformers Python package
# rather than called through the Hugging Face API.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def generate_embeddings(text):
    """Encode `text` with the module-level `model` and return the result as a plain list."""
    return model.encode(text).tolist()
    


In [5]:
# total documents in database (queried, no longer hard-coded)
total_documents = collection.count_documents({})

# Generate the embeddings for all plots.
# Only documents that have a plot but no `plot_embedding_hf` yet are selected, so an
# interrupted or repeated run resumes instead of re-encoding everything. Remove the
# field from the documents first to force regeneration (e.g. after changing the model).
pending_filter = {
    'plot': {"$exists": True},
    'plot_embedding_hf': {"$exists": False},
}
pending_count = collection.count_documents(pending_filter)

# Only the plot (plus _id) is needed to build the embedding
pending_docs = collection.find(pending_filter, {'plot': 1})

# using tqdm to give visual updates on the progress; `total` lets it draw a real bar
for doc in tqdm(pending_docs, total=pending_count):
    # $set adds just the embedding field; replacing the whole document would
    # overwrite any change made to its other fields since it was read
    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {'plot_embedding_hf': generate_embeddings(doc['plot'])}},
    )

20203it [14:54, 22.58it/s] 


In [6]:
# testing if all embeds are in the database
# count_documents does the counting on the server, so the documents (each one
# carrying a full embedding vector) are not loaded into memory just to take len().
embedded_filter = {'plot_embedding_hf': {"$exists": True}}
print(collection.count_documents(embedded_filter))
    

20203


In [7]:
# Compare how many documents carry an embedding with how many do not.
query_exists = {'plot_embedding_hf': {"$exists": True}}
# NOTE: an equality match on None (null) in MongoDB matches documents where the
# field is explicitly null AND documents where the field is absent altogether,
# so this count is "missing or null", not "set to null".
query_null = {'plot_embedding_hf': None}

count_exists = collection.count_documents(query_exists)
count_null = collection.count_documents(query_null)

print("Documents with 'plot_embedding_hf' field:", count_exists)
print("Documents with 'plot_embedding_hf' field missing or null:", count_null)


Documents with 'plot_embedding_hf' field: 20203
Documents with 'plot_embedding_hf' field set to null: 1146


## Vector search by query

In [19]:
# Free-text search phrase; it is embedded and used as the query vector in the vector search below
query = "Avengers that save the world"

In [20]:
# Query the movie database via vector search: the query text is embedded and
# compared against the stored `plot_embedding_hf` vectors.
# The aggregation cursor can only be iterated once, so it is materialised into a
# list; the results can then be displayed or inspected repeatedly.
results = list(collection.aggregate([
    {
        "$vectorSearch": {
            "queryVector": generate_embeddings(query),
            "path": "plot_embedding_hf",
            "numCandidates": 100,  # optimisation: how many candidate matches are considered before results are returned
            "limit": 10,  # number of top matches returned
            "index": "PlotSemanticSearch"  # search index on mongodb
        }
    }
]))

In [21]:
# Show the title and plot of every search hit, separated by a blank line
for movie in results:
    title, plot = movie["title"], movie["plot"]
    print(f'Movie Name: {title}, \n Movie Plot: {plot}\n')

Movie Name: Ultimate Avengers, 
 Movie Plot: To confront an alien menace, General Fury assembles a team of superheroes lead by a recently resuscitated Captain America.

Movie Name: Iron Man and Captain America: Heroes United, 
 Movie Plot: Iron Man and Captain America battle to keep the Red Skull and his triggerman, Taskmaster, from unleashing an army of Hydra Brutes on the world.

Movie Name: The Avengers, 
 Movie Plot: Earth's mightiest heroes must come together and learn to fight as a team if they are to stop the mischievous Loki and his alien army from enslaving humanity.

Movie Name: Ultimate Avengers II, 
 Movie Plot: When the Chitauri invaders are sighted in the African kingdom of Wakanda, the Avengers covertly enter the advanced nation to investigate.

Movie Name: Avengers: Age of Ultron, 
 Movie Plot: When Tony Stark and Bruce Banner try to jump-start a dormant peacekeeping program called Ultron, things go horribly wrong and it's up to Earth's Mightiest Heroes to stop the vill