In [60]:
# pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
# pip install -qU "langchain[aws]"
# pip install python-dotenv
# pip install -qU langchain-aws
# pip install -qU langchain-core
# ollama pull llama3
# pip install langchain-ollama



# Investigate:
# pip install unstructured
# pip install "unstructured[pdf]"

### Set Up Credentials and Envitoment Vars

In [61]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')
os.environ['AWS_REGION'] = os.getenv('AWS_REGION')

import os
os.environ['AWS_CA_BUNDLE'] = '/Users/ktejwani/.certs/Zscaler.pem'


##### Trying to fix certificate issue(Not working)

In [62]:
# import os
# import ssl
# import boto3
# from botocore.httpsession import URLLib3Session
# from langchain_aws import BedrockEmbeddings

# # --- START: Force Python to use the Zscaler certificate ---

# # 1. Define the path to the certificate that worked with curl
# ca_bundle_path = '/Users/ktejwani/.certs/Zscaler.pem'

# # 2. Create a custom SSL context that loads your certificate
# ssl_context = ssl.create_default_context(cafile=ca_bundle_path)

# # 3. Create a botocore session that uses our custom SSL context
# http_session = URLLib3Session(ssl_context=ssl_context)
# boto_session = boto3.Session(botocore_session=http_session)

# # 4. Create the Bedrock client from our custom, secure session
# bedrock_client = boto_session.client(
#     service_name='bedrock-runtime',
#     region_name=os.getenv('AWS_REGION')
# )

# # --- END: Certificate forcing logic ---


# # Initialize BedrockEmbeddings with our custom client.
# # This instance is now guaranteed to use the correct certificate.
# embeddings = BedrockEmbeddings(
#     client=bedrock_client,
#     model_id="amazon.titan-embed-text-v2:0"
# )

# print("✅ Bedrock client initialized successfully with custom SSL certificate.")

### Initilaize Gen LLM

In [63]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("anthropic.claude-3-5-sonnet-20240620-v1:0", model_provider="bedrock_converse")

In [64]:
from langchain_ollama.chat_models import ChatOllama

llm = ChatOllama(model="llama3")

### Initialize Embed LLM

In [65]:
# from langchain_aws import BedrockEmbeddings

# # import boto3
# # import os
# # session = boto3.Session()
# # bedrock_client = session.client(
# #     service_name='bedrock-runtime',
# #     region_name=os.getenv('AWS_REGION'),
# #     verify=False
# # )

# embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v2:0")


##### Local work around(NO API Calls)

In [66]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="nomic-embed-text"
)



### In Memory Vector Store

In [67]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)



### Loading PDF Documents from a Directory :)

In [68]:
import os
from langchain_community.document_loaders import PyPDFLoader

# Path to your data directory
data_directory = "data"
docs = []

print(f"Loading documents from: {data_directory}")

# Loop through each file in the directory
for filename in os.listdir(data_directory):
    if filename.endswith(".pdf"):
        file_path = os.path.join(data_directory, filename)
        print(f"--> Loading {filename}...")
        try:
            # Use PyPDFLoader for each file
            loader = PyPDFLoader(file_path)
            # Extend the main docs list with the pages from the current PDF
            docs.extend(loader.load())
            print(f"    ...success!")
        except Exception as e:
            # If a file fails to load, print an error and continue
            print(f"    ...ERROR loading {filename}: {e}")

print(f"\nSuccessfully loaded a total of {len(docs)} pages from all documents.")

Loading documents from: data
--> Loading tesla 10q.pdf...
    ...success!
--> Loading nvidia 10q.pdf...
    ...success!
--> Loading May Apple 10q.pdf...
    ...success!

Successfully loaded a total of 111 pages from all documents.


### Splitting documents via character

In [69]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

all_splits = text_splitter.split_documents(docs)
print(f"Split all documents into {len(all_splits)} sub-documents.")


Split all documents into 469 sub-documents.


### Embedding in a vector store

In [70]:
document_ids = vector_store.add_documents(documents=all_splits)

##### Again API issue

In [71]:
# from langchain import hub

# # N.B. for non-US LangSmith endpoints, you may need to specify
# # api_url="https://api.smith.langchain.com" in hub.pull.
# prompt = hub.pull("rlm/rag-prompt")

# example_messages = prompt.invoke(
#     {"context": "(context goes here)", "question": "(question goes here)"}
# ).to_messages()

# assert len(example_messages) == 1
# print(example_messages[0].content)



### Prompt Augmenting

In [72]:
from langchain_core.prompts import ChatPromptTemplate

# This is the content of the "rlm/rag-prompt" created locally
# to avoid the SSL error from hub.pull().
RAG_PROMPT_TEMPLATE = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question}
Context: {context}
Answer:"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

# Your example code will now work without a network call
example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: (question goes here)
Context: (context goes here)
Answer:


In [73]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [74]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [75]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [79]:
result = graph.invoke({"question": "What company is makes most revmue?"})

print(f"Context: {result['context']}\n\n")
print(f"Answer: {result['answer']}")



Context: [Document(id='85c0b79d-7943-4174-86a5-a63afc1704a0', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2025-05-02T06:05:09-04:00', 'title': '0000320193-25-000057', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2025-05-02 for the period ending 2025-03-29', 'keywords': '0000320193-25-000057; ; 10-Q', 'moddate': '2025-05-02T06:07:30-04:00', 'source': 'data/May Apple 10q.pdf', 'total_pages': 29, 'page': 23, 'page_label': '24', 'start_index': 1722}, page_content='The Company’s profit margins vary across its products, services, geographic segments and distribution channels. For example, the gross margins on the\nCompany’s products and services vary significantly and can change over time. The Company’s gross margins are subject to volatility and downward pressuredue to a variety of factors, including: continued industry-wide global product pricing pressures and pro