In [1]:
import os
import requests
import phoenix as px

from phoenix.otel import register

from bs4 import BeautifulSoup
from pathlib import Path

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from openinference.instrumentation.langchain import LangChainInstrumentor

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Fail fast, before any model is constructed, if the API key is missing from the
# environment (load_dotenv() in the first cell may have populated it from a .env file).
assert ("GOOGLE_API_KEY" in os.environ), "Please set your GOOGLE_API_KEY environment variable."

# Setup

In [3]:
# Launch Phoenix (`px` is already imported in the first cell, so it is not re-imported here).
# With use_temp_dir=False the cell output reports the data being persisted to a SQLite file.
px.launch_app(use_temp_dir=False)


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
💽 Your data is being persisted to sqlite:////home/hrushikesh/.phoenix/phoenix.db
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x7f70acc77e00>

In [4]:
# Connect notebook to Phoenix
# `register` returns the tracer provider for this project; per the cell output it is also
# installed as the global OpenTelemetry default and exports over gRPC to localhost:4317.
tracer_provider = register(project_name="LangChain RAG Tracing")

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: LangChain RAG Tracing
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: localhost:4317
|  Transport: gRPC
|  Transport Headers: {'user-agent': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [5]:
# Initialize the LangChain Instrumentor
## There are a lot of other tracer instruments that can be used for more detailed tracing - https://github.com/Arize-ai/openinference/tree/main/python/instrumentation/openinference-instrumentation-langchain/examples
LangChainInstrumentor().instrument(tracer_provider=tracer_provider)

In [6]:
# Setup Models
# Chat model shared by the RetrievalQA chain and the chat-with-RAG chain below.
google_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0)

# Embedding model used by create_faiss_database to build the FAISS index.
google_llm_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    task_type="retrieval_document",
)

# FAISS Database Creation

In [7]:
# Scraping Federal Documents
def fetch_federal_document(url, div_class, timeout=30):
    """
    Scrapes the text of one section of a web page.

    Args:
    url (str): URL of the webpage to scrape.
    div_class (str): CSS class of the <div> whose text should be extracted.
    timeout (float, optional): Seconds to wait for the server before the
        request is abandoned. Default is 30.

    Returns:
    str: The text of the first <div> carrying `div_class`, with its text
    fragments stripped and joined by newlines. If the page does not answer
    with HTTP 200, or no such <div> exists, a human-readable error message
    is returned instead of raising, so callers receive it as ordinary text.

    Raises:
    requests.exceptions.RequestException: If the request itself fails
    (e.g. connection error or timeout).
    """
    # Without a timeout requests would wait indefinitely on an unresponsive server
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    # Parsing the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Finding the section by its HTML structure
    transcript_section = soup.find("div", class_=div_class)
    if transcript_section is None:
        return "Transcript section not found."

    return transcript_section.get_text(separator="\n", strip=True)

In [8]:
# Document Fetching and Saving
def fetch_and_save_documents(url_list, doc_path):
    """
    Fetches documents from given URLs and appends them to a single file.

    Every URL is scraped with `fetch_federal_document` (div class "col-sm-9").
    All documents are fetched before anything is written, so a failing request
    does not leave a partially written file behind.

    Args:
        url_list (list): List of URLs to fetch documents from.
        doc_path (str): Path to the file where documents will be saved. The
            file is opened in append mode; its parent directory is created
            if it does not exist yet.
    """
    documents = [fetch_federal_document(url, "col-sm-9") for url in url_list]
    if not documents:
        # Nothing to write: do not create an empty file
        return

    parent_directory = os.path.dirname(doc_path)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    with open(doc_path, "a") as file:
        for document in documents:
            file.write(document)
            # Blank line between documents so the last line of one does not
            # fuse with the first line of the next
            file.write("\n\n")

In [9]:
# FAISS Database Creation
def create_faiss_database(document_path, database_save_directory, chunk_size=1500, chunk_overlap=200):
    """
    Builds a FAISS index from a text file and saves it to disk.

    The file is loaded, split into overlapping chunks, embedded with the
    module-level `google_llm_embeddings` model, and the resulting index is
    written to `database_save_directory` before being returned.

    Args:
        document_path (str): Path to the text file to index.
        database_save_directory (str): Directory where the FAISS index is saved.
        chunk_size (int, optional): Chunk size passed to the text splitter. Default is 1500.
        chunk_overlap (int, optional): Overlap between consecutive chunks passed to
            the text splitter. Default is 200.

    Returns:
        The FAISS database instance that was built and saved.
    """
    # Read the source file into LangChain documents
    loaded_docs = TextLoader(document_path).load()

    # Cut the documents into overlapping chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(loaded_docs)

    # Embed every chunk and index the vectors
    index = FAISS.from_documents(chunks, google_llm_embeddings)

    # Persist the index, then hand it back to the caller
    index.save_local(database_save_directory)
    return index

In [10]:
# Data directory, relative to the notebook; created up front so that writing
# docs.txt and saving the index cannot fail on a missing folder
directory = Path('..', 'data')
directory.mkdir(parents=True, exist_ok=True)

# Document Paths and FAISS Index Directory
doc_path = os.path.join(directory, "docs.txt")
persist_dir = os.path.join(directory, "faiss_index_langchain_simple_rag")

# URLs of the Federal documents to scrape
url_listings = [
    "https://www.archives.gov/milestone-documents/act-establishing-yellowstone-national-park#transcript",
    "https://www.archives.gov/milestone-documents/sherman-anti-trust-act#transcript",
]

# Fetch and save documents from the URLs (skipped when docs.txt already exists)
if not os.path.exists(doc_path):
    print("Fetching and saving documents...")
    fetch_and_save_documents(url_listings, doc_path)

# Create a FAISS database from the saved documents
# NOTE: the index is rebuilt (and the chunks re-embedded) on every run, even
# when persist_dir already holds a previously saved index.
faiss_index = create_faiss_database(doc_path, persist_dir)

# Run Demo Examples

In [12]:
def print_formatted_response(response_list, max_line_length=80):
    """
    Formats and prints responses with a maximum line length for better readability.

    Each response is split on whitespace and re-flowed greedily. Every word is
    followed by a single space, and that trailing space counts towards the
    limit. A word that is longer than the limit is printed on a line of its own.
    An empty response prints one empty line.

    Args:
    response_list (list): A list of strings representing responses.
    max_line_length (int): Maximum number of characters in a line. Defaults to 80.
    """
    for response in response_list:
        line = ""
        for word in response.split():
            if len(line) + len(word) + 1 <= max_line_length:
                line += word + " "
                continue
            # The word does not fit: flush what has been collected so far.
            # `line` is still empty only when the very first word exceeds the
            # limit; printing it then would emit a spurious blank line.
            if line:
                print(line)
            line = word + " "
        print(line)

## Query Retrieval

In [None]:
# Setup RetrievalQA
# Single-turn question answering: google_llm answers using whatever the FAISS
# retriever returns for the query. The cells below read the answer from the 'result' key.
retrievalQA = RetrievalQA.from_llm(llm=google_llm, retriever=faiss_index.as_retriever())

### Examples

In [13]:
# Question that the indexed text answers directly.
answer1 = retrievalQA.invoke({"query": "What does the document say about trespassers?"})

print_formatted_response([answer1['result']])

The document states that anyone settling, locating, or occupying the land set 
apart as a public park (except as otherwise provided) will be considered a 
trespasser and removed. Another section says that the Secretary of the Interior 
shall cause all persons trespassing on the park after the passage of the act to 
be removed. 


In [14]:
# Two-part question: only the first part is covered by the indexed text (see the output).
answer2 = retrievalQA.invoke({"query": "What is a bridle-path and can I use one at Yellowstone?"})

print_formatted_response([answer2['result']])

Based on the provided text, a bridle-path is a path suitable for riding horses. 
The text mentions that the Secretary of the Interior is to construct roads and 
bridle-paths in Yellowstone National Park. However, whether or not you can 
currently use one is not specified in these documents. 


In [15]:
# Out-of-scope question: shows the chain admitting that it does not know.
answer3 = retrievalQA.invoke({"query": "Can I buy Yellowstone from the Federal Government to set up a buffalo-themed day spa?"})

print_formatted_response([answer3['result']])

I don't know. The provided text describes the establishment of Yellowstone 
National Park and its management, but it doesn't contain information about the 
possibility of purchasing the park from the federal government. 


## Chat with RAG

### Setup

In [16]:
# Instruction for the question-rewriting step: turn a follow-up that leans on the
# chat history into a standalone question, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt variables: `chat_history` (list of messages) and `input` (latest user question).
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [17]:
# Retriever that is given the LLM and the contextualize prompt together with the FAISS retriever,
# so follow-up questions can be reformulated against the chat history before retrieval.
history_aware_retriever = create_history_aware_retriever(
    google_llm, faiss_index.as_retriever(), contextualize_q_prompt
)

In [18]:
# System instruction for the answering step. The `{context}` placeholder is
# presumably filled with the retrieved documents by create_stuff_documents_chain
# (confirm against the LangChain docs).
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Prompt variables: `context`, `chat_history` (list of messages) and `input` (user question).
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [19]:
# Answering step: google_llm driven by qa_prompt.
question_answer_chain = create_stuff_documents_chain(google_llm, qa_prompt)
# Full chat pipeline: history-aware retrieval followed by the answering step.
# It is invoked with "input" and "chat_history"; the cells below read the reply from the "answer" key.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

### Examples

In [20]:
# First turn of the conversation: no prior history yet.
chat_history = []
question = "What does the document say about Yellowstone?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
print_formatted_response([ai_msg_1['answer']])

The document describes an act passed by the 42nd Congress establishing 
Yellowstone National Park. It designates a specific tract of land near the 
Yellowstone River's headwaters as a public park, reserving it from settlement 
and sale. The act also outlines the Secretary of the Interior's 
responsibilities for managing the park, including preserving its natural 
resources and regulating visitor access. 


In [21]:
# Rebuild the history from the first exchange instead of extending the list in
# place, so re-running this cell does not append the same exchange twice.
chat_history = [
    HumanMessage(content=question),
    AIMessage(content=ai_msg_1["answer"]),
]

# Follow-up whose "it" can only be resolved through the chat history.
second_question = "Can I buy it from the Federal Government?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print_formatted_response([ai_msg_2["answer"]])

No, the document explicitly states that the land described is "reserved and 
withdrawn from settlement, occupancy, or sale under the laws of the United 
States". 
