In [None]:
from dotenv import load_dotenv
load_dotenv()  # kept ahead of the remaining imports, as in the original ordering
import os
from urllib.parse import quote_plus

import pandas as pd
import pymongo
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Settings pulled from environment variables; each one is None when the variable is unset.
mongodbuser = os.getenv('MONGODB_USER')
mongodbpassword = os.getenv('MONGODB_PASSWORD')
hf_token = os.getenv('HUGGINGFACE_TOKEN')

In [None]:
# DB connection
# The credentials are expected as raw (unescaped) values in the environment.
# Fail early with a clear message instead of building a URI containing "None".
if not mongodbuser or not mongodbpassword:
    raise RuntimeError("MONGODB_USER and MONGODB_PASSWORD must be set (e.g. in the .env file)")

# PyMongo requires the username and password embedded in a connection URI to be
# percent-escaped; otherwise characters such as '@', ':' or '/' make the URI invalid.
client = pymongo.MongoClient(
    f"mongodb+srv://{quote_plus(mongodbuser)}:{quote_plus(mongodbpassword)}"
    "@cluster0.2wxhtzz.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)
db = client.sample_mflix
collection = db.movies

In [None]:
# Embedding helper: turns a text into an embedding using the local
# sentence-transformers package rather than the Hugging Face API.

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def generate_embeddings(text):
    """Encode `text` with the module-level `model` and return the result as a list.

    The value produced by `model.encode` is converted with `.tolist()` before
    being returned.
    """
    return model.encode(text).tolist()
    


In [None]:
# Generate the embeddings for all plots.
plot_filter = {'plot': {"$exists": True}}

# Count the matching documents instead of hard-coding the collection size:
# tqdm gets a real total for its progress bar, and no document is silently
# cut off by a stale limit when the collection grows.
total_documents = collection.count_documents(plot_filter)

# Only `_id` (returned by default) and `plot` are needed to build the embedding.
cursor = collection.find(plot_filter, {'plot': 1})

# using tqdm to give visual updates on the progress
for doc in tqdm(cursor, total=total_documents):
    # $set writes just the new field; replacing the whole document would
    # overwrite any other change made to it since it was read.
    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {'plot_embedding_hf': generate_embeddings(doc['plot'])}},
    )

In [None]:
# Sanity check: fetch every document that carries an embedding and print how many there are.
# Note: this materializes all matching documents in memory.
query = {'plot_embedding_hf': {"$exists": True}}
data = collection.find(query)
document_list = [document for document in data]
print(len(document_list))
    

In [None]:
# Compare how many documents have the embedding field with how many have it set to null.
query_exists = {'plot_embedding_hf': {"$exists": True}}
# An equality match on None also matches documents where the field is missing
# entirely; $type "null" restricts the count to fields that are explicitly null,
# which is what the message below reports.
query_null = {'plot_embedding_hf': {"$type": "null"}}

count_exists = collection.count_documents(query_exists)
count_null = collection.count_documents(query_null)

print("Documents with 'plot_embedding_hf' field:", count_exists)
print("Documents with 'plot_embedding_hf' field set to null:", count_null)


## Vector search by query

In [None]:
# Free-text search phrase; it is embedded and used as the query vector below.
query: str = "imaginary characters from outer space at war"

In [None]:
# Query the movie database via vector search.
vector_search_stage = {
    "$vectorSearch": {
        "queryVector": generate_embeddings(query),
        "path": "plot_embedding_hf",  # field holding the stored plot embeddings
        "numCandidates": 100,  # nearest-neighbour candidates considered before the top matches are picked
        "limit": 4,  # top 4 matches
        "index": "PlotSemanticSearch",  # search index on mongodb
    }
}

# aggregate() returns a single-pass cursor. Materializing the (at most 4) matches
# into a list lets later cells iterate over `results` more than once, e.g. when
# the display cell is re-run.
results = list(collection.aggregate([vector_search_stage]))

In [None]:
# Print the title and plot of every match returned by the vector search.
for document in results:
    summary = f'Movie Name: {document["title"]}, \n Movie Plot {document["plot"]}\n'
    print(summary)