# Creating a RAG with LangChain, Voyage AI Embeddings, OpenAI, and the MongoDB Atlas Vector Store

In [6]:
# imports 
import os 
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# The cells below rely on these variables being present. Copying
# os.getenv(name) back into os.environ fails with an opaque
# "str expected, not NoneType" TypeError when a variable is unset, so check
# up front and report every missing name at once.
REQUIRED_ENV_VARS = (
    "VOYAGE_API_KEY",
    "OPENAI_API_KEY",
    "LANGSMITH_API_KEY",
    "MONGODB_URI",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Enable LangSmith tracing.
os.environ["LANGSMITH_TRACING"] = "true"

In [176]:
import os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_voyageai import VoyageAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

## Step 1 (Ingestion): Loading the documents and preparing them for embedding

## Loading the data 

In [None]:
# Placeholder: replace with the path of the PDF to ingest.
pdf_path = "your_pdf_path"

In [12]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF at `pdf_path` and collect every item the loader yields into
# `pages`; later cells read `.metadata` and `.page_content` from these items.
loader = PyPDFLoader(file_path=pdf_path)
pages = []
# NOTE: a top-level `async for` is only valid where top-level await is enabled
# (e.g. a Jupyter/IPython cell). In a plain script this loop has to live inside
# an `async def` that is driven by asyncio.run().
async for page in loader.alazy_load():
    pages.append(page)

In [None]:
# Preview the first loaded page: its metadata, a blank line, then its text.
print(f"{pages[0].metadata}\n")
print(pages[0].page_content)

In [None]:
# Number of items loaded from the PDF (displayed as the cell output).
len(pages)

## Chunking the data with LangChain's SemanticChunker for better context retrieval

In [181]:
from langchain_voyageai import VoyageAIEmbeddings
# Voyage AI embedding model; it is handed to the semantic chunker below.
embeddings = VoyageAIEmbeddings(model="voyage-3-large")

In [182]:
from langchain_experimental.text_splitter import SemanticChunker

# Semantic chunker driven by `embeddings`, using the "gradient" breakpoint
# threshold type and a minimum chunk size of 100.
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="gradient",
    min_chunk_size=100,
)

In [183]:
# Holds the chunked pages; it is filled by the chunking cell that follows.
documents = []

In [None]:
# Chunk every page on its own, so each entry of `documents` is the list of
# chunks produced from one page.
#
# The list is rebuilt here instead of appended to, which keeps the cell
# idempotent: re-running it no longer adds a second copy of every page.
# Each page's metadata is passed through so the chunks keep whatever the
# loader recorded for that page instead of ending up with empty metadata.
documents = [
    text_splitter.create_documents([page.page_content], metadatas=[page.metadata])
    for page in pages
]

In [None]:
# Display the chunked pages (cell output only).
documents

In [None]:
# Plain-text view of the chunks, keeping the same nesting as `documents`:
# one inner list of strings for every group of chunks.
texts = [
    [chunk.page_content for chunk in chunk_group]
    for chunk_group in documents
]

In [None]:
# Display the extracted chunk texts (cell output only).
texts

## Creating Vector Embeddings 

In [None]:
import voyageai
# Name of the contextualized Voyage embedding model.
# NOTE(review): `model` is rebound to a ChatOpenAI instance in the generation
# cell further down, so this string does not survive under that name.
model = 'voyage-context-3'
# Direct Voyage client, constructed without explicit arguments.
# NOTE(review): `vo` is not referenced anywhere else in the visible notebook.
vo = voyageai.Client()


### Custom VoyageAIEmbedding wrapper for 'voyage-context-3' model 

In [180]:
from typing import List, Optional, Callable, Any
import voyageai
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator


class VoyageAIEmbedding(BaseModel, Embeddings):
    """
    LangChain ``Embeddings`` adapter around VoyageAI's ``contextualized_embed``.

    Each text is submitted as its own single-chunk input, so the service is
    expected to return exactly one vector per text. ``embed_documents``
    returns those vectors in input order; ``embed_query`` returns the single
    vector for one query string.
    """

    # VoyageAI client; created by ``validate_environment`` and excluded from
    # the model's exported fields.
    client: Any = Field(default=None, exclude=True)
    # Model name forwarded to ``contextualized_embed``.
    model: str = Field(default="voyage-context-3")
    # Optional explicit API key; the constructor keyword is ``api_key``.
    voyage_api_key: Optional[str] = Field(default=None, alias="api_key")
    # ``input_type`` sent for documents. For queries, "document" is swapped
    # for "query"; any other value is sent unchanged.
    input_type: str = Field(default="document")
    # Forwarded unchanged to ``contextualized_embed``.
    output_dimension: Optional[int] = Field(default=None)
    # Forwarded unchanged to ``contextualized_embed``.
    output_dtype: str = Field(default="float")
    # Optional chunking callable; only ``embed_query`` forwards it.
    chunk_fn: Optional[Callable[[str], List[str]]] = Field(default=None)

    class Config:
        """Configuration for this pydantic object."""
        extra = "forbid"
        arbitrary_types_allowed = True

    @root_validator()
    def validate_environment(cls, values: dict) -> dict:
        """
        Build the VoyageAI client and store it under ``values["client"]``.

        An explicitly supplied key is passed to the client. Without one the
        client is constructed with no arguments (presumably it then resolves
        the key itself, e.g. from the environment -- confirm against the
        voyageai client documentation).
        """
        api_key = values.get("voyage_api_key")
        if api_key:
            values["client"] = voyageai.Client(api_key=api_key)
        else:
            values["client"] = voyageai.Client()
        return values

    def _as_text(self, item: Any) -> str:
        """Return the text of *item*: a Document's page_content, a str, or str(item)."""
        if hasattr(item, "page_content"):
            item = item.page_content
        elif not isinstance(item, str):
            item = str(item)
        # A Document whose page_content is None is treated like empty text.
        return item or ""

    def _flatten_embeddings(self, result: Any) -> List[List[float]]:
        """
        Collect every embedding vector contained in an API response, in order.

        The response is expected to expose ``results`` with one entry per
        input, each carrying an ``embeddings`` list. The looser shapes the
        previous implementation tolerated (``embedding`` on an entry, a bare
        list entry, ``embeddings`` directly on the response, or a plain list
        response) are still accepted.

        Raises:
            ValueError: If the response has none of the recognised shapes.
        """
        groups = getattr(result, "results", None)
        if groups:
            vectors: List[List[float]] = []
            for group in groups:
                if hasattr(group, "embeddings"):
                    vectors.extend(group.embeddings)
                elif hasattr(group, "embedding"):
                    vectors.extend(group.embedding)
                else:
                    vectors.extend(group)
            return vectors
        if hasattr(result, "embeddings"):
            return list(result.embeddings)
        if isinstance(result, (list, tuple)):
            return list(result)
        raise ValueError(
            "Unrecognised response from VoyageAI contextualized_embed: "
            f"{type(result).__name__}"
        )

    def embed_documents(self, texts) -> List[List[float]]:
        """
        Embed search docs using VoyageAI's contextualized_embed method.

        Args:
            texts: List of document strings or Document objects to embed.
                Other objects are converted with ``str()``. Surrounding
                whitespace is stripped before embedding.

        Returns:
            One embedding per entry of ``texts``, in the same order. An empty
            ``texts`` yields an empty list.

        Raises:
            ValueError: If any text is empty or whitespace-only, if the API
                call fails, or if the number of returned vectors differs from
                the number of texts.
        """
        if not texts:
            return []

        cleaned = [self._as_text(item).strip() for item in texts]

        # Callers pair the returned vectors with their texts by position, so a
        # blank entry cannot simply be dropped: every later vector would then
        # be attached to the wrong text. Reject blanks explicitly instead.
        blank_positions = [pos for pos, text in enumerate(cleaned) if not text]
        if blank_positions:
            raise ValueError(
                "Cannot embed empty or whitespace-only text at position(s) "
                f"{blank_positions}"
            )

        # One single-chunk input per text.
        inputs = [[text] for text in cleaned]

        try:
            result = self.client.contextualized_embed(
                inputs=inputs,
                model=self.model,
                input_type=self.input_type,
                output_dimension=self.output_dimension,
                output_dtype=self.output_dtype,
            )
        except Exception as e:
            raise ValueError(f"Error calling VoyageAI contextualized_embed: {e}") from e

        # Gather the vectors of *every* result entry. Reading only the first
        # entry would return a single vector no matter how many texts were sent.
        embeddings = self._flatten_embeddings(result)
        if len(embeddings) != len(inputs):
            raise ValueError(
                "VoyageAI contextualized_embed returned "
                f"{len(embeddings)} embeddings for {len(inputs)} texts"
            )
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """
        Embed query text using VoyageAI's contextualized_embed method.

        Args:
            text: Query text to embed.

        Returns:
            Embedding for the query: the first vector in the response, or an
            empty list when the response contains no vectors.

        Raises:
            ValueError: If the API call fails or the response shape is not
                recognised.
        """
        # Documents and queries use different input types; any custom
        # input_type is passed through untouched.
        query_input_type = "query" if self.input_type == "document" else self.input_type

        try:
            result = self.client.contextualized_embed(
                inputs=[[text]],
                model=self.model,
                input_type=query_input_type,
                output_dimension=self.output_dimension,
                output_dtype=self.output_dtype,
                chunk_fn=self.chunk_fn,
            )
        except Exception as e:
            raise ValueError(
                f"Error calling VoyageAI contextualized_embed for query: {e}"
            ) from e

        embeddings = self._flatten_embeddings(result)
        return embeddings[0] if embeddings else []

    

## Defining the vectorstore 

In [None]:
from pymongo import MongoClient
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient


# Atlas namespace ("<database>.<collection>") that stores the embedded chunks.
DB_NAME = "Voyage_ai_RAG"
# Spelling kept as-is: changing it would point at a different collection.
COLLECTION_NAME = "langhcain_Voyage"

# `from_connection_string` is given the URI directly, so no separate
# MongoClient is created here. A bare `MongoClient()` would only open an
# unused client against pymongo's default address.
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=os.getenv("MONGODB_URI"),
    namespace=f"{DB_NAME}.{COLLECTION_NAME}",
    embedding=VoyageAIEmbedding(model="voyage-context-3"),
    index_name="vector_index",
)

In [None]:
# Display the first group of chunks (cell output only).
documents[0]

In [168]:
# Insert the chunks into the vector store one group at a time, reporting
# progress per group. The loop stops at the first failure so the error can be
# inspected before anything else is attempted.
for idx, docs_batch in enumerate(documents):
    try:
        print(f"Processing batch {idx} with {len(docs_batch)} documents")
        vector_store.add_documents(documents=docs_batch)
    except Exception as e:
        print(f"✗ Error adding batch {idx}: {e}")
        break  # Stop on first error to see what's happening
    else:
        print(f"✓ Successfully added batch {idx}")

Processing batch 0 with 3 documents
✓ Successfully added batch 0
Processing batch 1 with 3 documents
✓ Successfully added batch 1
Processing batch 2 with 3 documents
✓ Successfully added batch 2
Processing batch 3 with 3 documents
✓ Successfully added batch 3
Processing batch 4 with 3 documents
✓ Successfully added batch 4
Processing batch 5 with 3 documents
✓ Successfully added batch 5
Processing batch 6 with 3 documents
✓ Successfully added batch 6
Processing batch 7 with 3 documents
✓ Successfully added batch 7
Processing batch 8 with 4 documents
✓ Successfully added batch 8
Processing batch 9 with 3 documents
✓ Successfully added batch 9
Processing batch 10 with 3 documents
✓ Successfully added batch 10
Processing batch 11 with 3 documents
✓ Successfully added batch 11
Processing batch 12 with 2 documents
✓ Successfully added batch 12
Processing batch 13 with 3 documents
✓ Successfully added batch 13
Processing batch 14 with 3 documents
✓ Successfully added batch 14
Processing batc

# Step 2: Retrieval

## Performing Semantic Search on vector store 

In [169]:
# Create the vector search index for the collection behind `vector_store`.
# `dimensions` has to equal the length of the vectors being stored
# (1024 is assumed to be the embedding model's output size -- TODO confirm).
vector_store.create_vector_search_index(dimensions=1024)

In [None]:
import pprint
# Placeholder query: replace with a real question about the ingested PDF.
query = "Your_query."


# Retrieve the stored chunks most similar to the query.
result = vector_store.similarity_search(query)

# Pretty-print the retrieved documents for inspection.
pprint.pprint(result)


# Step 3: Generation

## Making the final RAG 

In [177]:
from langchain_openai import ChatOpenAI

def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retriever returning the 5 most similar chunks for a question.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Defining a prompt template

template = """
   Use the following pieces of context to answer the question at the end.
   {context}
   Question: {question}
"""

prompt = PromptTemplate.from_template(template)
model = ChatOpenAI(model="gpt-4o")

# Construct a chain to answer questions on your data.
# The retriever output is piped through `_format_docs` so that {context} is
# filled with the chunks' text. Feeding the retriever in directly would insert
# the repr of a list of Document objects into the prompt instead.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Prompt the chain
question = "your_query"
answer = chain.invoke(question)
print("Question: " + question)
print("Answer: " + answer)

# Return source documents.
# The result gets its own name: binding it to `documents` would overwrite the
# per-page chunk lists built during ingestion and break a re-run of the
# ingestion cells.
source_documents = retriever.invoke(question)
print("\nSource documents:")
pprint.pprint(source_documents)