In [0]:
# Install the core LangChain and Azure OpenAI integrations
%pip install langchain langchain-openai langchain-community databricks-langchain langchain-classic langgraph
# Restart the Python process so the notebook recognizes the new installs
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%sql
CREATE VOLUME IF NOT EXISTS default.my_pdfs;

In [0]:
import os

# 1. Define your storage path
# Note: Unity Catalog volumes use the /Volumes/ prefix
download_path = "/Volumes/workspace/default/my_pdfs"
# 2. Use dbutils instead of os.makedirs for the base Volume path
# This creates the sub-directory safely
dbutils.fs.mkdirs(download_path)

# 3. Now continue with your download logic
# ... (rest of your code)

True

In [0]:
import requests
import os

# 1. Define the targets (Company Tickers)
tickers = ["AAPL", "MSFT"]
# SEC requires a User-Agent header with your email
headers = {"User-Agent": "Your Name (your-email@example.com)"} 

# 2. Define your storage path (Unity Catalog Volume from Phase 1)
# Make sure you created the volume: main.default.my_pdfs
download_path = "/Volumes/workspace/default/my_pdfs"
# 2. Use dbutils instead of os.makedirs for the base Volume path
# This creates the sub-directory safely
dbutils.fs.mkdirs(download_path)

# 3. Download URLs (Static examples for the latest 10-K PDFs)
# In a real project, you'd use the 'sec-edgar-downloader' library to find these URLs
urls = {
    "AAPL": "https://s2.q4cdn.com/470004039/files/doc_financials/2025/ar/_10-K-2025-As-Filed.pdf",
    "NVDA": "https://s201.q4cdn.com/141608511/files/doc_financials/2024/q4/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"
}

for ticker, url in urls.items():
    try:
        print(f"Attempting to download {ticker} 10-K...")
        response = requests.get(url, headers=headers, timeout=30)
        
        if response.status_code == 200:
            file_name = f"{ticker}_10K.pdf"
            full_path = os.path.join(download_path, file_name)
            
            with open(full_path, "wb") as f:
                f.write(response.content)
            print(f"✅ {ticker} saved to: {full_path}")
        else:
            print(f"❌ Failed {ticker}: Status {response.status_code}")
            
    except Exception as e:
        print(f"⚠️ Error with {ticker}: {str(e)}")

# Final confirmation
print("\n--- Files ready in Volume ---")
for f in dbutils.fs.ls(download_path):
    print(f.name)

Attempting to download AAPL 10-K...
✅ AAPL saved to: /Volumes/workspace/default/my_pdfs/AAPL_10K.pdf
Attempting to download NVDA 10-K...
✅ NVDA saved to: /Volumes/workspace/default/my_pdfs/NVDA_10K.pdf

--- Files ready in Volume ---
AAPL_10K.pdf
NVDA_10K.pdf


In [0]:
from langchain_community.document_loaders import PyPDFLoader

# 1. Define your path (matching exactly where your files are)
volume_path = "/Volumes/workspace/default/my_pdfs/"

# 2. List the files we want to process
files_to_load = ["AAPL_10K.pdf", "NVDA_10K.pdf"]

all_documents = []

for file_name in files_to_load:
    full_path = volume_path + file_name
    print(f"Loading: {full_path}")
    
    try:
        # We use the /Volumes path directly
        loader = PyPDFLoader(full_path)
        data = loader.load()
        
        # Add metadata to each page so the Agent knows which company it's looking at
        for page in data:
            page.metadata["company"] = file_name.split("_")[0]
            
        all_documents.extend(data)
        print(f"✅ Loaded {len(data)} pages from {file_name}")
        
    except Exception as e:
        print(f"❌ Error loading {file_name}: {e}")

print(f"\nTotal pages ready for chunking: {len(all_documents)}")

Loading: /Volumes/workspace/default/my_pdfs/AAPL_10K.pdf
✅ Loaded 80 pages from AAPL_10K.pdf
Loading: /Volumes/workspace/default/my_pdfs/NVDA_10K.pdf
✅ Loaded 96 pages from NVDA_10K.pdf

Total pages ready for chunking: 176


In [0]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch

# 2. Chunking (The secret to good RAG)
# We use 1000 characters with an overlap so sentences aren't cut in half.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = text_splitter.split_documents(data)

In [0]:
import os
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# 1. Setup Embeddings (This part works fine with env vars)
embeddings = AzureOpenAIEmbeddings(
    # CHANGE THIS: Use the nickname you gave the model, NOT the resource name
    # Common examples: "text-embedding-3-small" or "my-embedding-model"
    azure_deployment="text-embedding-3-small",
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2024-06-01" # Using a highly stable version for embeddings
)

# 2. Initialize the Brain (Chat Model)
# This MUST be your gpt-4o-mini deployment
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini", 
    api_version="2024-06-01",
    temperature=0
)

# 3. Setup Vector Store (EXPLICITLY pass the endpoint and key here)
vector_store = AzureSearch(
    azure_search_endpoint=os.environ["AZURE_SEARCH_ENDPOINT"], # Added this
    azure_search_key=os.environ["AZURE_SEARCH_KEY"],           # Added this
    index_name="finance-index",
    embedding_function=embeddings.embed_query
)

# 3. Trigger the Upload
if 'all_documents' in locals():
    print(f"Uploading {len(all_documents)} chunks to Azure AI Search...")
    vector_store.add_documents(documents=all_documents)
    print("✅ Success! Your 'Finance Library' is now live in Azure.")
else:
    print("❌ Error: 'all_documents' not found. Run your PDF loading cell first!")

Uploading 176 chunks to Azure AI Search...
✅ Success! Your 'Finance Library' is now live in Azure.


In [0]:
import os
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch

# MODIFIED IMPORTS for v1.x
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

# 1. Initialize your LLM and Vector Store (Ensure keys are in os.environ)
llm = AzureChatOpenAI(azure_deployment="gpt-4o-mini", api_version="2024-06-01")

# 2. Define the prompt
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
If the answer isn't in the context, say "I don't have that information in my documents."

Context: {context}
Question: {input}
""")

# 3. Build the Modern Chain
retriever = vector_store.as_retriever()
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

print("✅ Agent is ready to chat with modern v1.x imports!")

✅ Agent is ready to chat with modern v1.x imports!


In [0]:
response = rag_chain.invoke({"input": "Summarize the top financial risks for NVIDIA."})
print(response["answer"])

The top financial risks for NVIDIA include:

1. **Failure to Meet Industry Needs**: Not adapting to the evolving needs of the industry could adversely impact financial results.
2. **Competition**: Increased competition may reduce market share and negatively affect financial outcomes.
3. **Customer Demand Estimation**: Inaccurate estimates of customer demand may lead to mismatches between supply and demand, risking financial performance.
4. **Dependency on Suppliers**: Reliance on third-party suppliers for manufacturing, assembling, testing, or packaging can reduce control over product quantity, quality, and delivery schedules, potentially harming business and financial condition.
5. **Product Defects**: Defects in products could incur significant remediation expenses and damage the company's reputation, impacting financial stability.
6. **Adverse Economic Conditions**: Broad economic downturns could negatively affect the business and financial standing.


In [0]:
import os
from langchain_classic.chains.history_aware_retriever import create_history_aware_retriever
from langchain_core.prompts import  MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

# 1. Setup the "Re-writer" Prompt (Contextualizes follow-up questions)
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# 2. Create the History-Aware Retriever
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

# 3. Setup the Final Answer Prompt (With Citation Instructions)
qa_system_prompt = (
    "You are a financial assistant. Use the following context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "At the end of your answer, list the 'Source' and 'Page Number' from the metadata."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# 4. Build the Final Conversational Chain
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [0]:
# Initialize an empty history
chat_history = []

# Question 1
q1 = "What was NVIDIA's total revenue?"
result1 = rag_chain.invoke({"input": q1, "chat_history": chat_history})
print(f"AI: {result1['answer']}")

# Update history
chat_history.extend([HumanMessage(content=q1), AIMessage(content=result1["answer"])])

# Question 2 (The follow-up!)
q2 = "How much of that came from Data Centers?"
result2 = rag_chain.invoke({"input": q2, "chat_history": chat_history})
print(f"\nAI (Follow-up): {result2['answer']}")

AI: NVIDIA's total revenue for fiscal year 2024 was $60.9 billion, which is an increase of 126% from the previous year. 

Source: Fiscal Year 2024 Summary
Page Number: 1

AI (Follow-up): NVIDIA's Data Center revenue for fiscal year 2024 was $47.5 billion, which is up 217% from fiscal year 2023.

Source: Fiscal Year 2024 Summary
Page Number: 1
