In [None]:
!pip install langchain langchain_community langchain_core langchain_openai langchain_mongodb pymongo pypdf

Collecting langchain_community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain_mongodb
  Downloading langchain_mongodb-0.4.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pymongo
  Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting pypdf
  Downloading pypdf-5.2.0-py3-none-any.whl.metadata (7.2 kB)
Collecting langchain_core
  Downloading langchain_core-0.3.34-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0

In [None]:
from pymongo import MongoClient
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os
from google.colab import userdata

In [None]:
# Database and collection that hold the RAG documents.
dbName = "rag"
collectionName = "atlas"

# Fetch the connection string through google.colab's userdata helper, mirror it
# into the process environment, and read it back for local use.
os.environ["MONGODB_URI"] = userdata.get("MONGODB_URI")
mongo_uri = os.environ["MONGODB_URI"]

# Client handle plus the collection used by the vector store cells.
client = MongoClient(mongo_uri)
collection = client[dbName][collectionName]

In [None]:
# Start with an empty result list; the page-filtering step that follows populates it.
cleaned_pages = list()

# Read document.pdf through PyPDFLoader; `pages` is whatever loader.load() returns.
loader = PyPDFLoader("document.pdf")
pages = loader.load()

In [None]:
# Keep only pages whose text splits into more than 20 space-separated tokens.
# The list is rebuilt from `pages` on every run (instead of appending to a list
# created in another cell), so re-executing this cell never duplicates pages.
cleaned_pages = [
    page for page in pages if len(page.page_content.split(" ")) > 20
]

In [None]:
# Splitter configuration used later to chunk the tagged documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=500,
)

In [None]:
# Metadata fields handed to create_metadata_tagger as `metadata_schema`.
metadata_properties = {
    "title": {"type": "string"},
    "keywords": {"type": "array", "items": {"type": "string"}},
    "hasCode": {"type": "boolean"},
}

# Every declared property is required, in declaration order.
schema = {
    "properties": metadata_properties,
    "required": list(metadata_properties),
}

In [None]:
# Fetch the OpenAI key through google.colab's userdata helper, export it to the
# process environment, and keep a local copy for explicit client construction.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
openai_api_key = os.environ["OPENAI_API_KEY"]

# Chat model used by the metadata tagger (temperature 0, gpt-4o-mini).
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    openai_api_key=openai_api_key,
)

In [None]:
# Build the metadata tagger from the schema and the chat model defined above.
document_transformer = create_metadata_tagger(
    llm=llm,
    metadata_schema=schema,
)

In [None]:
# Run the metadata tagger over the filtered pages.
docs = document_transformer.transform_documents(
    cleaned_pages
)

In [None]:
# Chunk the tagged documents with the splitter configured earlier.
split_docs = text_splitter.split_documents(
    docs
)

In [None]:
# Embedding model used when ingesting the chunks.
# The key is passed as `openai_api_key`; it was previously passed as
# `openai_api_type`, which is a different setting and left the client relying
# on the OPENAI_API_KEY environment variable instead.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-3-small",
)

In [None]:
# Embed the chunks and store them in the Atlas collection.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm
# before executing it more than once against the same collection.
vectorStore = MongoDBAtlasVectorSearch.from_documents(
    documents=split_docs,
    embedding=embeddings,
    collection=collection,
)

In [None]:
# Vector search index name, passed as `index_name` when the vector store is reopened.
# NOTE(review): presumably this index must already exist on the collection — confirm.
index = "vector_index"

In [None]:
# Reopen the vector store straight from the connection string, addressing the
# collection by its "<database>.<collection>" namespace and the named index.
vectorStore = MongoDBAtlasVectorSearch.from_connection_string(
    mongo_uri,
    f"{dbName}.{collectionName}",
    OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=openai_api_key,
        disallowed_special=(),
    ),
    index_name=index,
)

In [None]:
def query_data(query):
    """Answer ``query`` with a retrieval-augmented chain over ``vectorStore``.

    A retriever built from the module-level ``vectorStore`` fetches documents
    for the query, their ``page_content`` values are joined into one context
    string, and a ``gpt-4o-mini`` chat model answers from the prompt below.

    Args:
        query: The question; it is passed to the retriever and inserted into
            the prompt unchanged.

    Returns:
        The chain's output after it has passed through ``StrOutputParser``.
    """

    def _merge_page_contents(documents):
        # Concatenate the retrieved texts, separated by a blank line.
        return "\n\n".join([doc.page_content for doc in documents])

    # NOTE(review): "score_threshold" is sent together with
    # search_type="similarity"; confirm the vector store honours it in this mode.
    search_options = {
        "k": 3,
        "pre_filter": {"hasCode": {"$eq": False}},
        "score_threshold": 0.01,
    }
    doc_retriever = vectorStore.as_retriever(
        search_type="similarity",
        search_kwargs=search_options,
    )

    prompt_text = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Do not answer the question if there is no given context.
    Do not answer the question if it is not related to the context.
    Do not give recommendations to anything other than MongoDB.
    Context:
    {context}
    Question: {question}
    """

    # The same input feeds both branches: one retrieves context, the other
    # forwards the question as-is.
    chain_inputs = {
        "context": doc_retriever | _merge_page_contents,
        "question": RunnablePassthrough(),
    }

    chat_model = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        openai_api_key=openai_api_key,
    )

    rag_chain = (
        chain_inputs
        | PromptTemplate.from_template(prompt_text)
        | chat_model
        | StrOutputParser()
    )

    return rag_chain.invoke(query)

In [48]:
# Show the answer with each ". "-separated piece on its own line.
print(*query_data("What is PrintHub?").split(". "), sep="\n")

PrintHub is an innovative online platform designed to connect users with printing presses, allowing them to find, compare, and order printing services conveniently
It streamlines the discovery process by providing a centralized location where users can compare product offerings, minimum quantities, and costs efficiently
The platform aims to save time, reduce frustration, and modernize the traditional printing process by empowering both sellers and buyers with a user-friendly solution for all printing needs.
