In [22]:
import os
from dotenv import load_dotenv


env_path = os.path.join('..', '.env')

# 1. Loading the variables
if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)
    print(f"‚úÖ Loaded configuration from: {os.path.abspath(env_path)}")
else:
    print("‚ùå Error: .env file not found in project root.")

# 2. Verifying Credentials
required_vars = [
    "AZURE_OPENAI_API_KEY", 
    "AZURE_OPENAI_ENDPOINT", 
    "AZURE_OPENAI_API_VERSION", 
    "AZURE_OPENAI_MODEL",
]

missing = [var for var in required_vars if not os.getenv(var)]
if missing:
    print(f"‚ùå Missing environment variables: {missing}")
else:
    key = os.getenv("AZURE_OPENAI_API_KEY")
    print(f"‚úÖ Azure Configured. Key: {key[:5]}...******")
    print(f"‚úÖ Azure OpenAI Model: {os.getenv('AZURE_OPENAI_MODEL')}")

‚úÖ Loaded configuration from: /Users/harsh.pandey/Desktop/GenAI/rag-demo/.env
‚úÖ Azure Configured. Key: 97e83...******
‚úÖ Azure OpenAI Model: gpt-5


In [23]:
import requests
import os 

# 1. Configuration
pdf_url = "https://globalwellnessinstitute.org/wp-content/uploads/2023/12/NUTRITION_4_HEALTH_SPAN_GWI_final_202301210_hi-res.pdf"

output_folder = "../pdf"

file_path = os.path.join(output_folder, "nutrition_healthspan.pdf")

# 2. Download if not exists
os.makedirs(output_folder, exist_ok=True)

if not os.path.exists(file_path):
    print(f"Downloading PDF from {pdf_url}...")
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(pdf_url, headers=headers)
    
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print("‚úÖ Download complete.")
    else:
        print(f"‚ùå Failed to download. Status: {response.status_code}")
else:
    print("‚ÑπÔ∏è File already exists locally.")

‚ÑπÔ∏è File already exists locally.


In [24]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Loading PDF... (This reads the file)")
loader = PyPDFLoader(file_path)
docs = loader.load()
print(f"   Loaded {len(docs)} pages.")

# Spliting Configuration
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # Characters per chunk
    chunk_overlap=200   # Overlap to preserve context
)

print("Splitting document into chunks...")
splits = text_splitter.split_documents(docs)

print(f"‚úÖ Created {len(splits)} chunks.")

Loading PDF... (This reads the file)
   Loaded 88 pages.
Splitting document into chunks...
‚úÖ Created 127 chunks.


In [34]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import os

# 1. Configuring Embedding Model
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT"),    
    chunk_size=100,
    show_progress_bar=True,
    max_retries=20,
    retry_min_seconds=2
)

print(f"‚öóÔ∏è  Embedding chunks using: {os.getenv('AZURE_EMBEDDING_DEPLOYMENT')}...")

vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

print("‚úÖ Vector Store created successfully.")

‚öóÔ∏è  Embedding chunks using: None...


  0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Vector Store created successfully.


In [35]:
# Sample embedding
sample_word = "apple"
vector = embeddings.embed_query(sample_word)

print(f"Word: '{sample_word}'")
print(f"Vector Dimensions: {len(vector)}") # Should be 1536 for OpenAI models
print(f"Type: {type(vector)}")
print(f"First 10 numbers: {vector[:10]}")
print("...")

  0%|          | 0/1 [00:00<?, ?it/s]

Word: 'apple'
Vector Dimensions: 1536
Type: <class 'list'>
First 10 numbers: [0.01764063909649849, -0.016817327588796616, -0.04184354469180107, 0.019008787348866463, -0.0018100723391398787, -0.026902882382273674, 0.007264504674822092, 0.02287108078598976, -0.01952940970659256, -0.016732575371861458]
...


In [53]:
from openai import AzureOpenAI
import os 

# 1. Setup Azure Client
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

deployment = os.getenv("AZURE_DEPLOYMENT")

# 2. Define Question
user_question = input("Enter your question: ")

# 3. Retrieve Context (Manually)
print(f"üîç Searching PDF for: '{user_question}'...")
relevant_docs = vectorstore.similarity_search(user_question, k=3)

# Join the retrieved text into one big string
context_data = "\n\n".join([doc.page_content for doc in relevant_docs])

# 4. Preparing the Prompt
system_prompt = """You are a strict assistant.
Your ONLY task is to answer the user's question based on the provided context below.
- Do NOT use your internal knowledge.
- Do NOT make up facts.
- If the answer is not explicitly written in the context, you MUST say "I don't know".
- Do not try to be helpful by adding outside information.
"""

user_message = f"""
Context:
{context_data}

Question: 
{user_question}
"""

# 5. Call GPT-5 
print(f"ü§ñ Asking Azure {os.getenv('AZURE_OPENAI_MODEL')}...")

response = client.chat.completions.create(
    model=deployment,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]
)

# 6. Output
print("\n--- Answer ---")
print(response.choices[0].message.content)

üîç Searching PDF for: 'My stomach hurts after eating dairy. Do I have Celiac disease or IBS?'...


  0%|          | 0/1 [00:00<?, ?it/s]

ü§ñ Asking Azure gpt-5...

--- Answer ---
I don't know. The context does not specify whether stomach pain after eating dairy indicates celiac disease or IBS.
