## RAG with MongoDB - 

### 1. Data ingestion

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

In [12]:
from openai import  OpenAI

## initialize the client
client = OpenAI()

## specify the embedding model
model = 'text-embedding-3-large' 

## Define the function to generate embedding

def get_embedding(text, input_type="document"):
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding

In [4]:
response = get_embedding("RAG technology")
len(response)

3072

In [6]:
## Data ingestion
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
## load the pdf
loader = PyPDFLoader("https://investors.mongodb.com/node/12236/pdf")
data = loader.load()

## split the data into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = text_splitter.split_documents(data)

In [9]:
## Prepare documents for insertion
docs_to_insert = [
    {
        "text": doc.page_content,
        "embedding": get_embedding(doc.page_content)
    } for doc in documents
]

In [None]:
from pymongo import MongoClient


mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST")
mongo_db = os.getenv("MONGO_DB", "admin")

## connect to your MongoDB  Deployment
client = MongoClient(f"mongodb+srv://{mongo_user}:{mongo_pass}@{mongo_host}/?appName=Cluster0")
collection = client["sample_mflix"]["rag_pdf"]

## insert documents into the collection
result = collection.insert_many(docs_to_insert)
result


### 2. Retrieval

In [8]:
### query with search index
from pymongo.operations import SearchIndexModel
import time

In [9]:
## create your index model, then create the ssearch index
index_name = "vector_index"
search_index_model = SearchIndexModel(
    definition = {
        "fields": [
            {
                "type": "vector",
                "numDimensions": 3072,
                "path": "embedding",
                "similarity": "cosine"
            }
        ]
    },
    name = index_name,
    type = "vectorSearch"
)

collection.create_search_index(model=search_index_model)

'vector_index'

In [10]:
# wait for initial sync to complete
print("Polling to check if the inndex is ready. This may take up to a minute")
predicate = None
if predicate is None:
    predicate = lambda index: index.get("queryable") is True

while True:
    indices = list(collection.list_search_indexes(index_name))
    if len(indices) and predicate(indices[0]):
        break
    time.sleep(5)

print(index_name + " is ready for querying")

Polling to check if the inndex is ready. This may take up to a minute
vector_index is ready for querying


In [13]:
query_embedding = get_embedding("AI Technology")

In [15]:
results = collection.rag_pdf.aggregate(
    [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": 3072,
                "limit": 5,
            }
        }
    ]
)

In [16]:
array_of_results = []
for doc in results:
    array_of_results.append(doc)

array_of_results

[]

In [22]:
# Define a function to run vector search queries
def get_query_results(query):
    """ Gets results from a vector search query """

    query_embedding = get_embedding(query, input_type="query")
    # print(query_embedding)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 3072,
                "limit": 5,
            }
        }, {
            "$project": {
                "_id": 0,
                "text": 1,
            }
        }
    ]
    
    results = collection.aggregate(pipeline)
    # print(results)

    array_of_results = []
    for doc in results:
        array_of_results.append(doc)    
        
    return array_of_results

In [26]:
## Test the funcion with a sample query // print len of results
len(get_query_results("mongodb vector search"))

5

### 3. Generation pipeline

In [None]:
from openai import OpenAI

# Specify search query, retrieve relevat documents, and convert to string
query = "What are MongoDB's latest AI announcements?"
context_docs = get_query_results(query)
context_string = " ".join([doc["text"] for doc in context_docs])

# Construct prompt for the LLM using the retrieved documents as the context
prompt = f"""
Use the following context to answer the question at the end.
Context: {context_string}
Question: {query}
"""

openai_client = OpenAI()

# OpenAI model to use
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": prompt}
    ]
)

print(completion.choices[0].message.content)
# "MongoDB's latest AI announcements include the launch of the MongoDB ...."

"MongoDB's latest AI announcements include the launch of the MongoDB AI Applications Program (MAAP), which aims to expand their AI ecosystem by providing customers with reference architectures, pre-built partner integrations, and professional services to help them quickly build AI-powered applications. Additionally, they have announced MongoDB 8.0, which features significant performance improvements like faster reads, updates, bulk inserts, and time series queries. They have also made Atlas Stream Processing generally available, enabling customers to build sophisticated, event-driven applications with real-time data. Furthermore, Accenture will establish a center of excellence focused on MongoDB projects and is the first global systems integrator to join MAAP."