In [41]:
### Retrieval and Generation Pipeline(RAG) Example with MongoDB.

## Data Ingestion

In [42]:
#importing my openai key from .env file
import os
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

In [43]:
from openai import OpenAI

openai_client = OpenAI(api_key=openai_api_key)

#create embedding model
model = "text-embedding-3-small"

#function to create embeddings
def get_embedding(text):
    response = openai_client.embeddings.create(
        input=text,
        model=model
    )
    embedding = response.data[0].embedding
    return embedding

In [44]:
# get_embedding("AI Technology")
# len(get_embedding("AI Technology"))

In [8]:
#Data ingestion
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#load pdf document
loader = PyPDFLoader("https://investors.mongodb.com/node/12236/pdf")
data = loader.load()

#split the document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=30
)

documents = text_splitter.split_documents(data)

In [9]:
documents

[Document(metadata={'producer': 'West Corporation using ABCpdf', 'creator': 'PyPDF', 'creationdate': '2024-05-30T20:06:12+00:00', 'title': 'MongoDB, Inc. Announces First Quarter Fiscal 2025 Financial Results', 'source': 'https://investors.mongodb.com/node/12236/pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='MongoDB, Inc. Announces First Quarter Fiscal 2025 Financial Results\nMay 30, 2024\nFirst Quarter Fiscal 2025 Total Revenue of $450.6 million, up 22% Year-over-Year\nContinued Strong Customer Growth with Over 49,200 Customers as of April 30, 2024\nMongoDB Atlas Revenue up 32% Year-over-Year; 70% of Total Q1 Revenue'),
 Document(metadata={'producer': 'West Corporation using ABCpdf', 'creator': 'PyPDF', 'creationdate': '2024-05-30T20:06:12+00:00', 'title': 'MongoDB, Inc. Announces First Quarter Fiscal 2025 Financial Results', 'source': 'https://investors.mongodb.com/node/12236/pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='NEW YORK, May 30, 2

In [12]:
# documents insert
documents_to_insert_Mongodb =[
    {
        "text": doc.page_content,
        "embedding": get_embedding(doc.page_content)
    }
    for doc in documents
]

In [13]:
documents_to_insert_Mongodb


[{'text': 'MongoDB, Inc. Announces First Quarter Fiscal 2025 Financial Results\nMay 30, 2024\nFirst Quarter Fiscal 2025 Total Revenue of $450.6 million, up 22% Year-over-Year\nContinued Strong Customer Growth with Over 49,200 Customers as of April 30, 2024\nMongoDB Atlas Revenue up 32% Year-over-Year; 70% of Total Q1 Revenue',
  'embedding': [0.0198432095348835,
   -0.03690257668495178,
   0.017716355621814728,
   0.06534228473901749,
   -0.003020466072484851,
   0.005180725362151861,
   -0.0432942733168602,
   0.03946371003985405,
   -0.03309428691864014,
   -0.048060208559036255,
   0.015890156850218773,
   2.8545230179588543e-06,
   0.03483140096068382,
   -0.04004275053739548,
   0.021613730117678642,
   0.058393821120262146,
   -0.02282748557627201,
   -0.033896032720804214,
   -0.06373879313468933,
   -0.03129035606980324,
   0.03013228066265583,
   0.025188181549310684,
   0.027615688741207123,
   0.0022549100685864687,
   -0.03175804391503334,
   0.06066543236374855,
   -0.0609

In [14]:
#importing pymongo to connect to mongodb
from pymongo import MongoClient
from pymongo.server_api import ServerApi

# In notebook
mongo_uri = os.getenv("MONGODB_URI")
mongo_client = MongoClient(mongo_uri, server_api=ServerApi('1'))

#connect to mongodb
collection = mongo_client["sample_mflix"]["rag_pdf_collection"]

#insert documents into mongodb
collection.insert_many(documents_to_insert_Mongodb)

# ingestion complete and inserted into mongodb collection

InsertManyResult([ObjectId('696d9fd253e4fe937e6a6359'), ObjectId('696d9fd253e4fe937e6a635a'), ObjectId('696d9fd253e4fe937e6a635b'), ObjectId('696d9fd253e4fe937e6a635c'), ObjectId('696d9fd253e4fe937e6a635d'), ObjectId('696d9fd253e4fe937e6a635e'), ObjectId('696d9fd253e4fe937e6a635f'), ObjectId('696d9fd253e4fe937e6a6360'), ObjectId('696d9fd253e4fe937e6a6361'), ObjectId('696d9fd253e4fe937e6a6362'), ObjectId('696d9fd253e4fe937e6a6363'), ObjectId('696d9fd253e4fe937e6a6364'), ObjectId('696d9fd253e4fe937e6a6365'), ObjectId('696d9fd253e4fe937e6a6366'), ObjectId('696d9fd253e4fe937e6a6367'), ObjectId('696d9fd253e4fe937e6a6368'), ObjectId('696d9fd253e4fe937e6a6369'), ObjectId('696d9fd253e4fe937e6a636a'), ObjectId('696d9fd253e4fe937e6a636b'), ObjectId('696d9fd253e4fe937e6a636c'), ObjectId('696d9fd253e4fe937e6a636d'), ObjectId('696d9fd253e4fe937e6a636e'), ObjectId('696d9fd253e4fe937e6a636f'), ObjectId('696d9fd253e4fe937e6a6370'), ObjectId('696d9fd253e4fe937e6a6371'), ObjectId('696d9fd253e4fe937e6a63

In [46]:
### Query with search index
from pymongo.operations import SearchIndexModel
import time

#creating search index on embedding field
index_name = "vector_index"

# collection.drop_search_index(index_name)
# time.sleep(5)  # wait for 2 seconds to ensure index is dropped
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "similarity": "cosine",
                "numDimensions": 1536
            }
        ]
    },
    name=index_name,
    type="vectorSearch"
)

collection.create_search_index(model=search_index_model)

'vector_index'

In [48]:
query_embeddings = get_embedding("AI Technology")

query_embeddings

[-0.027704259380698204,
 -0.027157561853528023,
 0.006833717226982117,
 0.010727088898420334,
 0.03244722634553909,
 -0.03959862142801285,
 -0.015159770846366882,
 0.035638757050037384,
 -0.022089529782533646,
 -0.01272918377071619,
 0.014280621893703938,
 -0.046720463782548904,
 -0.041164834052324295,
 -0.0322403684258461,
 -0.017139701172709465,
 -0.022636227309703827,
 0.001476636971347034,
 -0.027024580165743828,
 0.002072278643026948,
 -0.005526076070964336,
 0.0033854604698717594,
 0.03020133636891842,
 -0.008119194768369198,
 -0.0021258401684463024,
 0.0019466859521344304,
 -0.02083360217511654,
 0.009870103560388088,
 0.008377768099308014,
 -0.01696239411830902,
 -0.02699502930045128,
 0.008769321255385876,
 -0.03194485604763031,
 0.009434223175048828,
 -0.031826652586460114,
 0.01494552381336689,
 0.03962817043066025,
 0.020065270364284515,
 0.016593003645539284,
 0.013202003203332424,
 -0.013142900541424751,
 0.019725432619452477,
 -0.009279079735279083,
 0.01111864298582077,

In [68]:
#now vector search our embedded query (query_embeddings) with the collection embeddings 

def get_query_results(query):
    """ Gets results from a vector search query"""

    query_embeddings = get_embedding(query)
    # print(query_embeddings)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embeddings,
                "path": "embedding",
                "numCandidates": 1536,
                "limit": 5
            }
        }, {
            "$project": {
                "_id": 0,
                "text": 1
            }
        }
    ]
    results = collection.aggregate(pipeline)
    print(results)

    array_results = []
    for doc in results:
        array_results.append(doc)
    return array_results



In [69]:
get_query_results("what is mongo db vector search")

<pymongo.synchronous.command_cursor.CommandCursor object at 0x000002ABE9A41670>


[{'text': 'of MongoDB 8.0—with significant performance improvements such as faster reads and updates, along with significantly\nfaster bulk inserts and time series queries—and the general availability of Atlas Stream Processing to build sophisticated,\nevent-driven applications with real-time data.'},
 {'text': "About MongoDB\nHeadquartered in New York, MongoDB's mission is to empower innovators to create, transform, and disrupt industries by unleashing the power of\nsoftware and data. Built by developers, for developers, MongoDB's developer data platform is a database with an integrated set of related services"},
 {'text': "that allow development teams to address the growing requirements for today's wide variety of modern applications, all in a unified and consistent user\nexperience. MongoDB has tens of thousands of customers in over 100 countries. The MongoDB database platform has been downloaded hundreds of"},
 {'text': 'MongoDB continues to expand its AI ecosystem with the announc

In [None]:
### Generative Q&A with RAG pipline

In [71]:
#after retrieving the relevant documents from mongodb using vector search,
# we can now use these documents to generate an answer using openai gpt model.

from openai import OpenAI

# Specify search query, retrieve relevant documents, and convert to string
query = "What are MongoDB's investments on AI?"
context_docs = get_query_results(query)
context_string = " ".join([doc["text"] for doc in context_docs])

# Construct prompt for the LLM using the retrieved documents as the context
prompt = f"""Use the following pieces of context to answer the question at the end.
    {context_string}
    Question: {query}
"""

openai_client = OpenAI()

# OpenAI model to use
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
model=model_name,
messages=[{"role": "user",
    "content": prompt
  }]
)
print(completion.choices[0].message.content)

<pymongo.synchronous.command_cursor.CommandCursor object at 0x000002ABE9A35A30>
MongoDB is investing in AI through the launch of the MongoDB AI Applications Program (MAAP). This program aims to expand MongoDB's AI ecosystem by providing customers with reference architectures, pre-built partner integrations, and professional services to quickly build AI-powered applications. Additionally, Accenture is establishing a center of excellence focused on MongoDB projects and has become the first global systems integrator to join MAAP. MongoDB's document-based architecture supports the variety and scale of data needed for AI applications, further positioning it as a significant player in the AI space.
