## RAG with MongoDB - 

### 1. Data ingestion

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

In [12]:
from openai import  OpenAI

## initialize the client
client = OpenAI()

## specify the embedding model
model = 'text-embedding-3-large' 

## Define the function to generate embedding

def get_embedding(text, input_type="document"):
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding

In [4]:
response = get_embedding("RAG technology")
len(response)

3072

In [6]:
## Data ingestion
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
## load the pdf
loader = PyPDFLoader("https://investors.mongodb.com/node/12236/pdf")
data = loader.load()

## split the data into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = text_splitter.split_documents(data)

In [9]:
## Prepare documents for insertion
docs_to_insert = [
    {
        "text": doc.page_content,
        "embedding": get_embedding(doc.page_content)
    } for doc in documents
]

In [None]:
from pymongo import MongoClient


mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST")
mongo_db = os.getenv("MONGO_DB", "admin")

## connect to your MongoDB  Deployment
client = MongoClient(f"mongodb+srv://{mongo_user}:{mongo_pass}@{mongo_host}/?appName=Cluster0")
collection = client["sample_mflix"]["rag_pdf"]

## insert documents into the collection
result = collection.insert_many(docs_to_insert)
result


### 2. Retrieval

In [8]:
### query with search index
from pymongo.operations import SearchIndexModel
import time

In [9]:
## create your index model, then create the ssearch index
index_name = "vector_index"
search_index_model = SearchIndexModel(
    definition = {
        "fields": [
            {
                "type": "vector",
                "numDimensions": 3072,
                "path": "embedding",
                "similarity": "cosine"
            }
        ]
    },
    name = index_name,
    type = "vectorSearch"
)

collection.create_search_index(model=search_index_model)

'vector_index'

In [10]:
# wait for initial sync to complete
print("Polling to check if the inndex is ready. This may take up to a minute")
predicate = None
if predicate is None:
    predicate = lambda index: index.get("queryable") is True

while True:
    indices = list(collection.list_search_indexes(index_name))
    if len(indices) and predicate(indices[0]):
        break
    time.sleep(5)

print(index_name + " is ready for querying")

Polling to check if the inndex is ready. This may take up to a minute
vector_index is ready for querying


In [13]:
query_embedding = get_embedding("AI Technology")

In [15]:
results = collection.rag_pdf.aggregate(
    [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": 3072,
                "limit": 5,
            }
        }
    ]
)

In [16]:
array_of_results = []
for doc in results:
    array_of_results.append(doc)

array_of_results

[]

In [17]:
# Define a function to run vector search queries
def get_query_results(query):
    """ Gets results from a vector search query """

    query_embedding = get_embedding(query, input_type="query")
    print(query_embedding)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 3072,
                "limit": 5,
            }
        }, {
            "$project": {
                "_id": 0,
                "text": 1,
            }
        }
    ]
    
    results = collection.aggregate(pipeline)
    print(results)

    array_of_results = []
    for doc in results:
        array_of_results.append(doc)    
        
    return array_of_results

In [18]:
## Test the funcion with a sample query
get_query_results("mongodb vector search")

[-0.014096962288022041, 0.010744848288595676, -0.0008071675547398627, 0.04186665639281273, -0.0025592907331883907, 0.005852290894836187, -0.004454417619854212, 0.05046253651380539, -0.054690927267074585, 0.04047573730349541, 0.019445044919848442, 0.006599909625947475, -0.025286903604865074, -0.008693242445588112, -0.0015021923463791609, 0.03805553913116455, -0.013165046460926533, 0.0015717382775619626, -0.023645620793104172, -0.012372222729027271, -0.028235651552677155, 0.008922744542360306, 0.011669808998703957, 0.032686591148376465, -0.03560752049088478, 0.002340220846235752, -0.0044161672703921795, -0.0005837512435391545, -0.003136521903797984, -0.001556090428493917, -0.01776203326880932, 0.01934768073260784, -0.001285730628296733, 0.003209545277059078, 0.0009649499552324414, -0.002020309679210186, 0.0006511238752864301, -0.03505115583539009, -0.015981657430529594, -0.029431842267513275, 0.0008445485145784914, 0.00870019756257534, -0.03936300426721573, 0.0015187094686552882, 0.00383

[{'text': 'of MongoDB 8.0—with significant performance improvements such as faster reads and updates, along with significantly\nfaster bulk inserts and time series queries—and the general availability of Atlas Stream Processing to build sophisticated,\nevent-driven applications with real-time data.'},
 {'text': "that allow development teams to address the growing requirements for today's wide variety of modern applications, all in a unified and consistent user\nexperience. MongoDB has tens of thousands of customers in over 100 countries. The MongoDB database platform has been downloaded hundreds of"},
 {'text': "About MongoDB\nHeadquartered in New York, MongoDB's mission is to empower innovators to create, transform, and disrupt industries by unleashing the power of\nsoftware and data. Built by developers, for developers, MongoDB's developer data platform is a database with an integrated set of related services"},
 {'text': 'MongoDB, Inc. Announces First Quarter Fiscal 2025 Financial R

### 3. Generation pipeline

In [20]:
from openai import OpenAI

# Specify search query, retrieve relevat documents, and convert to string
query = "What are MongoDB's latest AI announcements?"
context_docs = get_query_results(query)
context_string = " ".join([doc["text"] for doc in context_docs])

# Construct prompt for the LLM using the retrieved documents as the context
prompt = f"""
Use the following context to answer the question at the end.
Context: {context_string}
Question: {query}
"""

openai_client = OpenAI()

# OpenAI model to use
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": prompt}
    ]
)

print(completion.choices[0].message.content)

[-0.013731470331549644, 0.005906258709728718, -0.0228282380849123, 0.047860946506261826, 0.0026227906346321106, -0.015391464345157146, -0.009269406087696552, 0.019162971526384354, 0.014594667591154575, 0.059653542935848236, 0.04788750782608986, -0.024899909272789955, 0.016254661604762077, 0.020398005843162537, -0.010630602017045021, -0.013419391587376595, -0.01851225271821022, -0.002805389929562807, 0.02118152379989624, -0.037316665053367615, -0.024448391050100327, -0.01025876309722662, 0.021208083257079124, 0.027250461280345917, -0.04467375949025154, 0.0014342347858473659, 0.010969240218400955, -0.01335299201309681, -0.018299773335456848, -0.014275948517024517, -0.007881651632487774, -0.019070010632276535, 0.036254268139600754, -0.026559904217720032, -0.02580294758081436, -0.009136606939136982, 0.024289032444357872, -0.0023820914793759584, 0.02950805425643921, -0.011022360064089298, 0.010803241282701492, 0.017556097358465195, -0.010617321357131004, 0.009501805528998375, -0.02236343920