## Project #1: Movies RAG

In [10]:
import pymongo
import os
import requests
from dotenv import load_dotenv
from termcolor import colored

### Setup MongoDB connection

In [3]:
# Load variables from a local .env file so the connection string is available
# through the environment.
load_dotenv()
my_client = os.getenv('myclient')

# pymongo.MongoClient(None) silently falls back to localhost:27017, which would
# hide a missing/misnamed environment variable behind confusing empty results.
# Fail fast with an explicit message instead.
if not my_client:
    raise ValueError(
        "Environment variable 'myclient' is not set; "
        "it must contain the MongoDB connection string."
    )

# Handles used by the rest of the notebook: the client, the sample_mflix
# database, and its movies collection.
client = pymongo.MongoClient(my_client)
db = client.sample_mflix
collect = db.movies

In [4]:
# Hugging Face Inference API endpoint used to turn text into embeddings.
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# API token read from the environment; both names refer to the same value.
hf_token = huggingface_tk = os.getenv('huggingface_tk')

In [8]:
def generate_embedding(text: str, timeout: float = 30.0) -> list[float]:
    """Request an embedding for *text* from the Hugging Face Inference API.

    Sends ``{"inputs": text}`` as JSON to ``embedding_url``, authenticating
    with ``hf_token`` as a bearer token, and returns the decoded JSON body.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON response body.
        NOTE(review): presumably a flat list of floats for a single input
        string -- confirm against the endpoint's actual response.

    Raises:
        ValueError: If the API responds with a status code other than 200.
        requests.exceptions.Timeout: If no response arrives within *timeout*.
    """
    response = requests.post(
        embedding_url,
        headers={"Authorization": f"Bearer {hf_token}"},
        json={"inputs": text},
        timeout=timeout,
    )

    # Surface the response body as well: it carries the API's own error text.
    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

    return response.json()


## Replace data in-place: add an `hf_plot_embedding` field to the first 25 documents that have a `plot`

# for doc in collect.find({'plot': {"$exists": True}}).limit(25):
#     doc['hf_plot_embedding'] = generate_embedding(doc['plot'])
#     collect.replace_one({'_id': doc['_id']}, doc)

### Use the embeddings: run a vector search on the MongoDB collection with the `$vectorSearch` aggregation pipeline stage.

##### --> Atlas Vector Search queries consist of aggregation pipeline stages where the $vectorSearch stage is the first stage in the pipeline

In [13]:
# Search phrase; it is embedded below and used as the query vector.
query = "___your_keywords_here___"


# ------------------------------------------------------------------
# Reference: general shape of a $vectorSearch stage
#
# {
#   "$vectorSearch": {
#     "exact": true | false,
#     "filter": {<filter-specification>},
#     "index": "<index-name>",
#     "limit": <number-of-results>,
#     "numCandidates": <number-of-candidates>,
#     "path": "<field-to-search>",
#     "queryVector": [<array-of-numbers>]
#   }
# }
#
# - queryVector: the vector generated from the specific query being run.
# - index: the search index created inside MongoDB by the admin.
# ------------------------------------------------------------------


# Single-stage pipeline: search the stored plot embeddings with the
# embedding of `query`.
vector_search_stage = {
    "$vectorSearch": {
        "index": "PlotSemanticSearch",
        "limit": 4,
        "numCandidates": 100,
        "path": "hf_plot_embedding",
        "queryVector": generate_embedding(query),
    }
}
results = collect.aggregate([vector_search_stage])


# Print each match in bold yellow, followed by a separator line.
for document in results:
    summary = f'Movie Name: {document["title"]} ,\nMovie plot: {document["plot"]}\nRelease Year: {document["year"]}\nRating: {document["imdb"]["rating"]}'
    print(colored(summary, color='yellow', attrs=['bold']))
    print('----------------------------------------------')


[1m[33mMovie Name: The Saphead ,
Movie plot: The simple-minded son of a rich financier must find his own way in the world.
Release Year: 1920
Rating: 6.2[0m
----------------------------------------------
[1m[33mMovie Name: Wild and Woolly ,
Movie plot: A rich young Easterner who has always wanted to live in "the Wild West" plans to move to a Western town. Unknown to him, the town's "wild" days are long gone, and it is an orderly, ...
Release Year: 1917
Rating: 6.9[0m
----------------------------------------------
[1m[33mMovie Name: The Blue Bird ,
Movie plot: Two peasant children, Mytyl and Tyltyl, are led by Berylune, a fairy, to search for the Blue Bird of Happiness. Berylune gives Tyltyl a cap with a diamond setting, and when Tyltyl turns the...
Release Year: 1918
Rating: 6.6[0m
----------------------------------------------
[1m[33mMovie Name: The Great Train Robbery ,
Movie plot: A group of bandits stage a brazen train hold-up, only to find a determined posse hot on thei