## Project #1: Movies RAG

In [1]:
import pymongo
import os
import requests
from dotenv import load_dotenv

### Setup MongoDB connection

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# MongoDB connection string, read from the `myclient` environment variable.
my_client = os.getenv('myclient')

# Fail fast: MongoClient(None) would silently fall back to the default
# localhost address and only fail later with a server-selection timeout.
if not my_client:
    raise RuntimeError(
        "Environment variable 'myclient' is not set; "
        "expected a MongoDB connection string."
    )

client = pymongo.MongoClient(my_client)
db = client.sample_mflix   # database used by this notebook
collect = db.movies        # collection that is embedded and searched below

In [3]:
# Hugging Face Inference API endpoint: feature-extraction pipeline for the
# sentence-transformers/all-MiniLM-L6-v2 model.
embedding_url = (
    "https://api-inference.huggingface.co/pipeline/feature-extraction/"
    "sentence-transformers/all-MiniLM-L6-v2"
)

# API token read from the `huggingface_tk` environment variable (None when
# the variable is not set); both names refer to the same value.
hf_token = huggingface_tk = os.getenv("huggingface_tk")

In [None]:
def generate_embedding(text: str, timeout: float = 30.0) -> list[float]:
    """Return the embedding for *text* from the Hugging Face Inference API.

    POSTs ``{"inputs": text}`` to ``embedding_url`` with ``hf_token`` as a
    bearer token and returns the decoded JSON body.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response. For a single string input this is
        expected to be a flat list of floats (TODO confirm against the
        endpoint's response format).

    Raises:
        ValueError: If the API answers with a non-200 status code, or the
            response body is not a JSON list.
        requests.exceptions.RequestException: On network failures,
            including a timeout.
    """
    response = requests.post(
        embedding_url,
        headers={"Authorization": f"Bearer {hf_token}"},
        json={"inputs": text},
        timeout=timeout,
    )

    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

    embedding = response.json()

    # Guard against a 200 response that carries something other than a
    # vector (e.g. an error object), so callers never pass it on as a
    # query vector or store it in a document.
    if not isinstance(embedding, list):
        raise ValueError(f"Unexpected embedding response: {embedding!r}")

    return embedding


## Replace data in-place: add an `hf_plot_embedding` field to the first 25 documents that have a `plot`

# for doc in collect.find({'plot': {"$exists": True}}).limit(25):
#     doc['hf_plot_embedding'] = generate_embedding(doc['plot'])
#     collect.replace_one({'_id': doc['_id']}, doc)

### Use the embeddings: run a vector search on the MongoDB collection with the `$vectorSearch` aggregation pipeline stage.

##### --> Atlas Vector Search queries consist of aggregation pipeline stages where the $vectorSearch stage is the first stage in the pipeline

In [None]:
# Free-text search phrase; replace the placeholder with your own keywords.
query = "___your_keywords_here___"


####################################
##  vector search code structure  ##
####################################

# {
#   "$vectorSearch": {
#     "exact": true | false,
#     "filter": {<filter-specification>},
#     "index": "<index-name>",
#     "limit": <number-of-results>,
#     "numCandidates": <number-of-candidates>,
#     "path": "<field-to-search>",
#     "queryVector": [<array-of-numbers>]
#   }
# }

# !!! queryVector is the vector generated from the specific query used.
# !!! index is the name of the search index created inside MongoDB by the admin.


####################################
####################################


# Embed the query text, then search the stored plot embeddings with it.
results = collect.aggregate([
    {
        "$vectorSearch": {
            "index": "PlotSemanticSearch",
            "limit": 4,
            "numCandidates": 100,
            "path": "hf_plot_embedding",
            "queryVector": generate_embedding(query),
        }
    }
])


for document in results:
    # NOTE(review): sample_mflix movie documents appear to nest the critic
    # data under "tomatoes" rather than at the top level, and not every
    # movie has it -- confirm against the dataset. Both locations are
    # checked so a missing key prints "N/A" instead of raising KeyError.
    critic = document.get("critic") or document.get("tomatoes", {}).get("critic") or {}
    rating = critic.get("rating", "N/A")

    # Single quotes inside the f-strings keep this valid on Python < 3.12,
    # where reusing the enclosing quote character is a SyntaxError.
    print(
        f"Movie Name: {document.get('title', 'N/A')} ,\n"
        f"Movie plot: {document.get('plot', 'N/A')}\n"
        f"Release Year: {document.get('year', 'N/A')}\n"
        f"Rating: {rating}"
    )
