In [17]:
"""
RAG SYSTEM - Document Analysis Integration
Analyze PDFs (annual reports, financial statements) with AI
"""

# Standard library
import json
import os
import re
from datetime import datetime
from importlib import metadata as importlib_metadata
from typing import List, Dict

# Third-party
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.utilities import SerpAPIWrapper
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load environment variables from a local .env file before any client is built
# (the LLM and search clients below presumably read their API keys from the
# environment - confirm against your .env).
load_dotenv()

# Single source of truth for values that are both used and reported below,
# so the printed summary can never drift from what was actually configured.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
PDF_DIR = "data/raw"
VECTOR_DB_DIR = "data/vector_db"
REPORTED_PACKAGES = ("torch", "sentence-transformers", "transformers")

# Chat model shared by the report-generation cells
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)

# Use HuggingFace embeddings (already installed!)
print("‚è≥ Loading embedding model...")
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Web search client used by research_with_rag
search = SerpAPIWrapper()

# Create data directories for PDFs and the persisted vector store
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(VECTOR_DB_DIR, exist_ok=True)

print("‚úÖ RAG System Initialized!")
print(f"üìÅ PDF Directory: {PDF_DIR}/")
print(f"üíæ Vector DB: {VECTOR_DB_DIR}/")
print(f"üîß Embedding Model: {EMBEDDING_MODEL_NAME}")
print("üì¶ Using Your Installed Libraries:")
# Report the versions that are really installed instead of hard-coded numbers,
# which go stale as soon as the environment is upgraded.
for package_name in REPORTED_PACKAGES:
    try:
        installed_version = importlib_metadata.version(package_name)
    except importlib_metadata.PackageNotFoundError:
        installed_version = "not installed"
    print(f"   - {package_name}: {installed_version}")


‚è≥ Loading embedding model...
‚úÖ RAG System Initialized!
üìÅ PDF Directory: data/raw/
üíæ Vector DB: data/vector_db/
üîß Embedding Model: sentence-transformers/all-MiniLM-L6-v2
üì¶ Using Your Installed Libraries:
   - torch: 2.9.1
   - sentence-transformers: 5.1.2
   - transformers: 4.57.1


In [18]:
class RAGDocumentProcessor:
    """Load PDFs, split them into chunks, and index/query them in a Chroma store.

    Instance fields:
        vector_db_path: directory handed to Chroma as ``persist_directory``.
        embeddings: embedding model handed to Chroma.
        text_splitter: RecursiveCharacterTextSplitter used by ``load_pdf``.
        vectorstore: the active Chroma store, or ``None`` until
            ``create_vectorstore`` / ``load_vectorstore`` has been called.
    """

    def __init__(self, vector_db_path="data/vector_db", embedding_function=None,
                 chunk_size=1000, chunk_overlap=200):
        """Configure the processor.

        Args:
            vector_db_path: Directory passed to Chroma as ``persist_directory``.
            embedding_function: Embedding model to use. When omitted, the
                notebook-level ``embeddings`` object from the setup cell is used.
            chunk_size: ``chunk_size`` passed to the text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        """
        self.vector_db_path = vector_db_path
        # The global is only looked up when nothing is injected, so the class
        # can be built without depending on the setup cell's state.
        self.embeddings = embeddings if embedding_function is None else embedding_function
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        self.vectorstore = None

    def load_pdf(self, pdf_path: str) -> List:
        """Load a PDF with PyPDFLoader and split it into chunks.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            The list produced by ``text_splitter.split_documents``.
        """
        print(f"üìÑ Loading PDF: {pdf_path}")

        pages = PyPDFLoader(pdf_path).load()
        print(f"‚úÖ Loaded {len(pages)} pages")

        chunks = self.text_splitter.split_documents(pages)
        print(f"‚úÖ Created {len(chunks)} text chunks")

        return chunks

    def create_vectorstore(self, documents: List, collection_name: str = "financial_docs"):
        """Index ``documents`` into a Chroma collection and make it the active store.

        Args:
            documents: Chunks to index (as returned by ``load_pdf``).
            collection_name: Name of the Chroma collection.

        Returns:
            The Chroma store, also kept on ``self.vectorstore``.
        """
        print(f"üíæ Creating vector database: {collection_name}")

        # NOTE(review): this looks like it adds to an existing collection of the
        # same name in persist_directory rather than replacing it, so re-running
        # may accumulate duplicate chunks - confirm against the Chroma docs.
        self.vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            collection_name=collection_name,
            persist_directory=self.vector_db_path
        )

        print("‚úÖ Vector database created!")
        return self.vectorstore

    def load_vectorstore(self, collection_name: str = "financial_docs"):
        """Open a Chroma collection from ``vector_db_path`` and make it the active store.

        Args:
            collection_name: Name of the Chroma collection to open.

        Returns:
            The Chroma store, also kept on ``self.vectorstore``.
        """
        print(f"üìÇ Loading vector database: {collection_name}")

        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )

        print("‚úÖ Vector database loaded!")
        return self.vectorstore

    def query_documents(self, query: str, k: int = 5) -> List[str]:
        """Return the text of the ``k`` chunks most similar to ``query``.

        Args:
            query: Search text passed to ``similarity_search``.
            k: Number of results requested.

        Returns:
            ``page_content`` of each hit, or ``[]`` when no store is active.
        """
        # Compare against None explicitly: a truthiness test would also reject
        # a loaded store whose __len__/__bool__ evaluates false (e.g. an empty
        # collection) and misreport it as "not loaded".
        if self.vectorstore is None:
            print("‚ö†Ô∏è No vector database loaded!")
            return []

        hits = self.vectorstore.similarity_search(query, k=k)
        return [hit.page_content for hit in hits]

# Notebook-wide processor instance shared by the research, batch and query cells
rag_processor = RAGDocumentProcessor()

# Both status lines are emitted by a single call, one per line
print(
    "‚úÖ RAG Document Processor Ready!",
    "üîß Using HuggingFace Embeddings (from Cell 1)",
    sep="\n",
)


‚úÖ RAG Document Processor Ready!
üîß Using HuggingFace Embeddings (from Cell 1)


In [19]:
def _report_filename_slug(query: str, max_length: int = 40) -> str:
    """Turn the start of ``query`` into a fragment that is safe inside a filename."""
    # Anything that is not a word character, dot or dash (spaces, '/', ':', '?',
    # ...) collapses to '_' so the fragment can never introduce a path separator.
    slug = re.sub(r"[^\w.-]+", "_", query[:max_length]).strip("_")
    return slug or "query"


def research_with_rag(query: str, pdf_path: str = None, use_existing_db: bool = False,
                      existing_collection: str = "financial_docs"):
    """
    Enhanced research combining PDF analysis with web search.

    Args:
        query: The research question; also used for retrieval and web search.
        pdf_path: Optional PDF to index into the "current_research" collection
            and query. If the path does not exist a warning is printed and the
            run continues without it.
        use_existing_db: When no usable ``pdf_path`` is given, query an already
            persisted collection instead.
        existing_collection: Collection loaded when ``use_existing_db`` is set.
            Defaults to "financial_docs", the collection written by
            ``process_multiple_pdfs``; pass "current_research" to reuse the
            index left by an earlier single-PDF run.

    Returns:
        ``(final_report, filename)``: the generated report text and the path of
        the Markdown file it was saved to under ``outputs/reports/``.
    """

    print("\n" + "="*70)
    print("üîç ENHANCED RAG-POWERED RESEARCH")
    print("="*70)
    print(f"Query: {query}\n")

    # Step 1: Gather document context (fresh PDF, or an existing collection)
    pdf_insights = ""
    if pdf_path and os.path.exists(pdf_path):
        print("üìÑ Processing PDF document...")

        # Load and index PDF
        chunks = rag_processor.load_pdf(pdf_path)
        rag_processor.create_vectorstore(chunks, collection_name="current_research")

        # Query PDF for relevant info
        print("üîç Extracting insights from PDF...")
        relevant_chunks = rag_processor.query_documents(query, k=5)

        pdf_insights = "\n\n".join(relevant_chunks)
        print(f"‚úÖ Extracted {len(relevant_chunks)} relevant sections from PDF\n")
    else:
        if pdf_path:
            # A mistyped path used to be ignored silently, yielding a report
            # labelled "PDF Analyzed: No" with no hint as to why.
            print(f"WARNING: PDF not found at {pdf_path}; continuing without it.")

        if use_existing_db:
            print("üìÇ Using existing document database...")
            rag_processor.load_vectorstore(existing_collection)
            relevant_chunks = rag_processor.query_documents(query, k=5)
            pdf_insights = "\n\n".join(relevant_chunks)
            print(f"‚úÖ Retrieved {len(relevant_chunks)} relevant sections\n")

    # Step 2: Web research, biased towards the previous and current year
    print("üåê Conducting web research...")
    current_year = datetime.now().year
    web_results = search.run(f"{query} {current_year - 1} {current_year}")
    print("‚úÖ Web research complete\n")

    # Step 3: Generate enhanced report
    print("üìù Generating comprehensive report...")

    report_prompt = f"""You are a financial analyst with access to both company documents and market research.

Query: {query}

INTERNAL DOCUMENT DATA:
{pdf_insights[:4000] if pdf_insights else "No internal documents provided."}

WEB RESEARCH DATA:
{web_results[:2000]}

Generate a comprehensive analysis (1000-1500 words) that:
1. EXECUTIVE SUMMARY
2. KEY FINDINGS FROM INTERNAL DOCUMENTS (if available)
   - Financial metrics
   - Strategic initiatives
   - Performance indicators
3. MARKET CONTEXT (from web research)
   - Industry trends
   - Competitive landscape
4. SYNTHESIS & INSIGHTS
   - Combine document data with market context
   - Identify opportunities and risks
5. RECOMMENDATIONS

Use specific numbers, dates, and facts. Clearly cite whether information comes from internal documents or market research.
"""

    response = llm.invoke(report_prompt)
    final_report = response.content

    # Save report; one timestamp feeds both the filename and the header
    os.makedirs("outputs/reports", exist_ok=True)
    generated_at = datetime.now()
    timestamp = generated_at.strftime("%Y%m%d_%H%M%S")

    # Sanitise the query first: a raw '/' would be read as a directory and make
    # open() fail after the LLM call has already been paid for.
    filename = f"outputs/reports/{timestamp}_RAG_Enhanced_{_report_filename_slug(query)}.md"

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("# RAG-Enhanced Financial Analysis\n\n")
        f.write(f"**Query:** {query}\n\n")
        f.write(f"**PDF Analyzed:** {'Yes' if pdf_insights else 'No'}\n\n")
        f.write(f"**Date:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("---\n\n")
        f.write(final_report)

    print(f"\nüíæ Report saved: {filename}")
    print("\nüéâ RAG-ENHANCED RESEARCH COMPLETE!")

    return final_report, filename

print("‚úÖ RAG-Enhanced Research Function Ready!")


‚úÖ RAG-Enhanced Research Function Ready!


In [5]:
def process_multiple_pdfs(pdf_directory: str = "data/raw"):
    """
    Process all PDFs in a directory and create a unified vector database.

    Every ``*.pdf`` file (extension matched case-insensitively) directly inside
    ``pdf_directory`` is loaded through ``rag_processor.load_pdf`` in sorted
    name order, and the combined chunks are indexed into the "financial_docs"
    collection.

    Args:
        pdf_directory: Directory to scan for PDF files.

    Returns:
        None. If the directory is missing, holds no PDFs, or the PDFs yield no
        text chunks, a warning is printed and nothing is indexed.
    """

    print("\n" + "="*70)
    print("üìö BATCH PDF PROCESSING")
    print("="*70)

    # os.listdir raises on a missing path; report it like the "no PDFs" case
    if not os.path.isdir(pdf_directory):
        print(f"WARNING: PDF directory not found: {pdf_directory}")
        return

    # Find all PDFs. Sorting gives a stable order across runs and platforms,
    # and lower() makes files such as "REPORT.PDF" count as well.
    pdf_files = sorted(
        entry for entry in os.listdir(pdf_directory)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_directory, entry))
    )

    if not pdf_files:
        print(f"‚ö†Ô∏è No PDF files found in {pdf_directory}")
        return

    print(f"Found {len(pdf_files)} PDF files:\n")
    for i, pdf in enumerate(pdf_files, 1):
        print(f"{i}. {pdf}")

    # Process all PDFs
    all_chunks = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"\nüìÑ Processing: {pdf_file}")

        all_chunks.extend(rag_processor.load_pdf(pdf_path))

    # PDFs with no extractable text (e.g. scanned images) produce no chunks;
    # there is nothing to index in that case.
    if not all_chunks:
        print("WARNING: No text chunks were extracted; vector database not created.")
        return

    # Create unified vector database
    print(f"\nüíæ Creating unified vector database with {len(all_chunks)} chunks...")
    rag_processor.create_vectorstore(all_chunks, collection_name="financial_docs")

    print("\n‚úÖ All PDFs processed and indexed!")
    print(f"üìä Total chunks in database: {len(all_chunks)}")

print("‚úÖ Batch Processing Function Ready!")


‚úÖ Batch Processing Function Ready!


In [6]:
def query_document_database(query: str, k: int = 5):
    """
    Look up ``query`` in the "financial_docs" collection and print a preview.

    The collection is (re)loaded through ``rag_processor`` on every call. Each
    hit is printed, shortened to 300 characters plus "..." when longer.

    Args:
        query: Text to search for.
        k: Number of results to request.

    Returns:
        The list of matching chunk texts, untruncated.
    """
    rule = "=" * 70
    preview_limit = 300

    print("\n" + rule)
    print("üîç DOCUMENT DATABASE QUERY")
    print(rule)
    print(f"Query: {query}\n")

    # Open the persisted collection, then run the similarity query against it
    rag_processor.load_vectorstore("financial_docs")
    matches = rag_processor.query_documents(query, k=k)

    print(f"‚úÖ Found {len(matches)} relevant sections:\n")

    for position, text in enumerate(matches, start=1):
        print(f"\n--- Result {position} ---")
        if len(text) > preview_limit:
            print(text[:preview_limit] + "...")
        else:
            print(text)
        print()

    return matches

print("‚úÖ Document Query Interface Ready!")


‚úÖ Document Query Interface Ready!


In [7]:
# TEST 1: Research without PDF (web-only)
# With pdf_path=None and use_existing_db left at its default, the report is
# built from web search results alone.
test1_rule = "=" * 70
print(f"\n{test1_rule}")
print("TEST 1: WEB-ONLY RESEARCH")
print(test1_rule)

web_only_query = "Infosys Q3 FY26 financial results and profit margins"
report1, file1 = research_with_rag(query=web_only_query, pdf_path=None)

# Show only the first 500 characters; the full report is in the file at file1
print("\nüìÑ Report Preview:")
print(f"{report1[:500]}...")



TEST 1: WEB-ONLY RESEARCH

üîç ENHANCED RAG-POWERED RESEARCH
Query: Infosys Q3 FY26 financial results and profit margins

üåê Conducting web research...
‚úÖ Web research complete

üìù Generating comprehensive report...

üíæ Report saved: outputs/reports/20260202_181314_RAG_Enhanced_Infosys_Q3_FY26_financial_results_and_pr.md

üéâ RAG-ENHANCED RESEARCH COMPLETE!

üìÑ Report Preview:
# Infosys Q3 FY26 Financial Results and Profit Margins Analysis

## EXECUTIVE SUMMARY

Infosys, a leading global technology services and consulting company, reported its financial results for the third quarter of fiscal year 2026 (Q3 FY26) on January 14, 2026. The company demonstrated resilience in a challenging market environment, with a consolidated revenue of ‚Çπ45,479 crore, reflecting a 2.2% increase from the previous quarter and a 9% increase year-on-year. However, the operating profit exper...


In [8]:
# Printed usage guide for the PDF workflows defined in the cells above
usage_rule = "=" * 70
usage_guide = """
To analyze PDF documents:

1. Place your PDF files in: data/raw/
   - Annual reports
   - Financial statements
   - Quarterly results
   - Company presentations

2. For SINGLE PDF analysis:
   report, file = research_with_rag(
       query="Your analysis question",
       pdf_path="data/raw/your_document.pdf"
   )

3. For MULTIPLE PDFs:
   # First, process all PDFs
   process_multiple_pdfs("data/raw")
   
   # Then query
   report, file = research_with_rag(
       query="Your analysis question",
       use_existing_db=True
   )

4. To query documents directly:
   results = query_document_database("Your question")

"""

print("\n" + usage_rule)
print("üìÅ HOW TO USE WITH PDFs")
print(usage_rule)
print(usage_guide)

# Count the *.pdf entries currently sitting in the input directory
pdf_count = sum(1 for entry in os.listdir('data/raw') if entry.endswith('.pdf'))

print("üìÇ Current PDF directory: data/raw/")
print(f"üìä PDFs found: {pdf_count}")



üìÅ HOW TO USE WITH PDFs

To analyze PDF documents:

1. Place your PDF files in: data/raw/
   - Annual reports
   - Financial statements
   - Quarterly results
   - Company presentations

2. For SINGLE PDF analysis:
   report, file = research_with_rag(
       query="Your analysis question",
       pdf_path="data/raw/your_document.pdf"
   )

3. For MULTIPLE PDFs:
   # First, process all PDFs
   process_multiple_pdfs("data/raw")

   # Then query
   report, file = research_with_rag(
       query="Your analysis question",
       use_existing_db=True
   )

4. To query documents directly:
   results = query_document_database("Your question")


üìÇ Current PDF directory: data/raw/
üìä PDFs found: 0


In [9]:
# If you have PDFs in data/raw/, uncomment and run one of the examples below.

# Example 1: Single PDF
# report, file = research_with_rag(
#     query="What are the revenue trends and profit margins?",
#     pdf_path="data/raw/company_annual_report.pdf"
# )

# Example 2: Multiple PDFs
# process_multiple_pdfs("data/raw")
# report, file = research_with_rag(
#     query="Compare financial performance across companies",
#     use_existing_db=True
# )

# Example 3: Direct document query
# results = query_document_database("Revenue growth Q3 FY26")

# Both status lines are emitted by a single call, one per line
print(
    "‚úÖ RAG System Ready!",
    "üìÅ Add PDFs to data/raw/ and uncomment examples above to test!",
    sep="\n",
)


‚úÖ RAG System Ready!
üìÅ Add PDFs to data/raw/ and uncomment examples above to test!


In [20]:
# TEST: 400-page Annual Report Analysis
# Runs the full pipeline (index the PDF, retrieve, web search, report) on one
# large document and prints the whole report.
large_pdf_rule = "=" * 70
print("\n" + large_pdf_rule)
print("üìö TESTING RAG WITH LARGE PDF (400 pages)")
print(large_pdf_rule)

# Replace with YOUR actual filename - the file must exist under data/raw/
pdf_filename = "annual-report-2024-2025.pdf"

# Smaller, focused query works best
annual_report_query = "What are the revenue, profit margins, and key growth strategies for FY 2024-2025?"
report, file = research_with_rag(
    annual_report_query,
    "data/raw/" + pdf_filename,
)

# Header and report body go out in one call, one item per line
print(
    "\n" + large_pdf_rule,
    "üìÑ ENHANCED REPORT (PDF + Web Research):",
    large_pdf_rule,
    report,
    sep="\n",
)



üìö TESTING RAG WITH LARGE PDF (400 pages)

üîç ENHANCED RAG-POWERED RESEARCH
Query: What are the revenue, profit margins, and key growth strategies for FY 2024-2025?

üìÑ Processing PDF document...
üìÑ Loading PDF: data/raw/annual-report-2024-2025.pdf
‚úÖ Loaded 336 pages
‚úÖ Created 1411 text chunks
üíæ Creating vector database: current_research
‚úÖ Vector database created!
üîç Extracting insights from PDF...
‚úÖ Extracted 5 relevant sections from PDF

üåê Conducting web research...
‚úÖ Web research complete

üìù Generating comprehensive report...

üíæ Report saved: outputs/reports/20260202_184142_RAG_Enhanced_What_are_the_revenue,_profit_margins,_an.md

üéâ RAG-ENHANCED RESEARCH COMPLETE!

üìÑ ENHANCED REPORT (PDF + Web Research):
# Comprehensive Analysis of TCS for FY 2024-2025

## EXECUTIVE SUMMARY

Tata Consultancy Services (TCS) has demonstrated resilience and adaptability in its financial performance for FY 2024-2025, achieving a revenue growth of 6.0% in FY 2025, d