# R&D Project Auditor Exploration

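This notebook walks through the R&D Project Auditor pipeline interactively: it ingests a project PDF into text chunks, builds and queries a vector store over those chunks, and runs the LLM-based audit on the retrieved context. Run the cells from top to bottom; each cell depends on variables created by the cells above it.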

In [59]:
# Import necessary libraries
import os
import sys
import importlib
from pathlib import Path
from dotenv import load_dotenv

# Locate the project root: when the kernel starts inside notebooks/, the
# root is one directory up; otherwise the working directory is used as-is.
current_path = Path.cwd()
project_root = current_path.parent if current_path.name == 'notebooks' else current_path

# Make the `src` package importable. The membership test keeps re-runs of
# this cell from appending the same entry repeatedly.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root set to: {project_root}")

# Re-read .env on every run; override=True lets values from the file replace
# variables that are already set in the process environment.
load_dotenv(project_root / ".env", override=True)

try:
    # Bind the project modules first so they can be reloaded below
    import src.ingest
    import src.rag_engine
    import src.auditor
    import src.config

    # Reload so edits to the .py files are picked up without restarting the
    # kernel. src.config goes first, ahead of the other three modules.
    for _module in (src.config, src.ingest, src.rag_engine, src.auditor):
        importlib.reload(_module)

    # Pull the names from the freshly reloaded modules
    from src.ingest import IngestionEngine
    from src.rag_engine import RAGEngine
    from src.auditor import AuditorAgent
    from src.config import settings
    from langchain_community.chat_models import ChatOpenAI

    print(f"✅ Libraries imported. Using LLM Model: {settings.LLM_MODEL_ID}")
except ImportError as e:
    # Report the failure instead of raising; later cells need these names
    print(f"❌ Import Error: {e}")
    print("Please ensure you are running this notebook from the correct environment and have installed dependencies.")

Project root set to: /Users/tarsobertolini/Documents/PUR-PRO/pur-auditor-rag
✅ Libraries imported. Using LLM Model: amazon.titan-text-express-v1


In [56]:
# 1. Ingestion Phase
#
# Loads the project PDF and splits it into chunks with IngestionEngine.
# On success this cell defines `documents`, which the RAG cell consumes.
# Relies on `os` and `project_root` from the setup cell at the top.

# Ensure imports are available if this cell is run out of order
if 'IngestionEngine' not in globals():
    print("⚠️ IngestionEngine not found. Attempting to import...")
    try:
        from src.ingest import IngestionEngine
    except ImportError:
        print("❌ Could not import IngestionEngine. Please run the 'Import necessary libraries' cell above.")

# Initialize the ingestion engine
if 'IngestionEngine' in globals():
    ingestor = IngestionEngine(chunk_size=1000, chunk_overlap=200)

    # Define the path to your uploaded PDF (relative to the notebooks/ folder)
    pdf_path = "../data/Draft-PUR-Tv3.0-Principal.pdf" 

    # Check if file exists before running
    if not os.path.exists(pdf_path):
        # The relative path only works when the kernel's working directory is
        # notebooks/. Fall back to the data folder under the project root so
        # the cell also works when the kernel was started from the root itself.
        pdf_path = str(project_root / "data" / "Draft-PUR-Tv3.0-Principal.pdf")
        
    if os.path.exists(pdf_path):
        print(f"Found file: {pdf_path}")
        documents = ingestor.process_file(pdf_path)
        print(f"Processed {len(documents)} chunks.")
        
        # Show the start of the first chunk as a quick sanity check
        if documents:
            print(f"Sample chunk content:\n{documents[0].page_content[:500]}...")
    else:
        print(f"❌ File not found at {pdf_path}. Please ensure 'Draft-PUR-Tv3.0-Principal.pdf' is in the 'data' folder.")
else:
    print("❌ Cannot proceed without IngestionEngine.")

Found file: ../data/Draft-PUR-Tv3.0-Principal.pdf


INFO:src.ingest:Successfully loaded 37 pages from ../data/Draft-PUR-Tv3.0-Principal.pdf
INFO:src.ingest:Split 37 pages into 75 chunks.
INFO:src.ingest:Split 37 pages into 75 chunks.


Processed 75 chunks.
Sample chunk content:
PLANO DE UTILIZAÇÃO DE RECURSOS – PUR
N° xxx/2025
PROGRAMA: TV 3.0
PERÍODO: JANEIRO/2025 A JUNHO/2027
MANAUS/AM
AGOSTO/2025...


In [57]:
# 2. RAG Engine Setup
#
# Builds the vector store from `documents` (produced by the ingestion cell),
# reloads it from disk, and runs one sample query. Defines `retriever`, which
# the audit cell uses.

# Initialize RAG Engine
rag = RAGEngine(
    embedding_model_name=settings.EMBEDDING_MODEL_NAME,
    vector_store_path=settings.VECTOR_DB_PATH
)

# Create Vector Store (Run this once or when docs change)
rag.create_vector_store(documents)

# Load it back (to test loading)
rag.load_vector_store()

# Test Retrieval
# Increased k to 10 to ensure enough context is retrieved for the audit
retriever = rag.get_retriever(k=10)
docs = retriever.invoke("Qual o TRL do projeto?")
print(f"Retrieved {len(docs)} docs.")

# Guard the sample print: indexing an empty result would raise IndexError
if docs:
    print(docs[0].page_content)
else:
    print("⚠️ Retriever returned no documents. Check that the vector store was built from a non-empty document list.")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:src.rag_engine:Creating vector store with 75 chunks...
INFO:src.rag_engine:Creating vector store with 75 chunks...
INFO:src.rag_engine:Vector store saved to vector_store_faiss
INFO:src.rag_engine:Loading vector store from vector_store_faiss...
INFO:src.rag_engine:Vector store saved to vector_store_faiss
INFO:src.rag_engine:Loading vector store from vector_store_faiss...


Retrieved 10 docs.
TRL (Technology Readiness Level), a situação presente pode ser enquadrada entre TRL 2
e TRL 3, correspondente à formulação do conceito tecnológico e à comprovação
experimental de princípios básicos em ambiente controlado, sem aplicação prática no
cenário local.
A implantação da infraestrutura e o desenvolvimento dos projetos têm como objetivo
avançar gradualmente até níveis mais altos de maturidade. Espera-se que, ao final do
programa, as soluções alcançarão conjuntamente TRL 7 ou superior para a implantação da
TV 3.0, estágio no qual um sistema ou protótipo é demonstrado em ambiente operacional
próximo ao real. Dessa forma, as soluções entregues pelos projetos derivados poderão
Página 10 de 37
Plano de Utilização de Recursos – PUR nº 495/2025 aplicado aos Projetos Prioritários
Ano-Base 2025


In [60]:
# 3. Audit Execution
#
# Picks an LLM backend (AWS Bedrock first, OpenAI as fallback) and runs the
# audit with the `retriever` built in the RAG cell. Relies on `settings` and
# `AuditorAgent` from the setup cell.

# Initialize LLM
# We will try to use AWS Bedrock first (since you configured AWS creds)
# If that fails, we fall back to OpenAI if key is present.

llm = None

try:
    # Provider packages are imported inside the branch that needs them, so a
    # missing AWS package falls through to the OpenAI fallback and a missing
    # OpenAI package does not block the Bedrock path.
    import boto3
    from langchain_aws import ChatBedrock

    print(f"Attempting to connect to AWS Bedrock (Region: {settings.AWS_REGION})...")
    
    # Create Bedrock client
    bedrock_client = boto3.client(
        service_name="bedrock-runtime", 
        region_name=settings.AWS_REGION
    )
    
    llm = ChatBedrock(
        model_id=settings.LLM_MODEL_ID, 
        client=bedrock_client,
        model_kwargs={"temperature": 0.0}
    )
    # NOTE(review): building the client and model object presumably does not
    # send a request, so credential or model-access errors may only surface
    # during the audit call below. Confirm before relying on this message.
    print("✅ AWS Bedrock connected successfully.")
    
except Exception as e:
    print(f"⚠️ Could not connect to AWS Bedrock: {e}")
    print("Checking for OpenAI API Key...")
    
    if settings.OPENAI_API_KEY:
        try:
            from langchain_openai import ChatOpenAI
        except ImportError as import_error:
            print(f"❌ OpenAI fallback unavailable: {import_error}")
        else:
            # Pass the key that was just checked, rather than relying on the
            # OPENAI_API_KEY environment variable happening to match it.
            llm = ChatOpenAI(
                model="gpt-3.5-turbo",
                temperature=0,
                api_key=settings.OPENAI_API_KEY,
            )
            print("✅ Connected to OpenAI.")
    else:
        print("❌ No valid LLM configuration found. Please check your .env file.")

if llm:
    auditor = AuditorAgent(llm=llm, retriever=retriever)
    
    # Run Audit
    print("Running audit... (This may take a few seconds)")
    result = auditor.audit_project("Analise este projeto e extraia as métricas de conformidade.")
    
    print("\n=== Audit Result ===")
    # Pydantic v2 uses model_dump_json() instead of json()
    print(result.model_dump_json(indent=2))
else:
    print("Skipping audit due to missing LLM.")

Attempting to connect to AWS Bedrock (Region: us-east-1)...
✅ AWS Bedrock connected successfully.
Running audit... (This may take a few seconds)


INFO:langchain_aws.llms.bedrock:Using Bedrock Invoke API to generate response
INFO:langchain_aws.llms.bedrock:Using Bedrock Invoke API to generate response



=== Audit Result ===
{
  "project_title": "TV 3.0 na Amazônia Ocidental",
  "trl_level": 0,
  "methodology_summary": "Desenvolvimento de metodologias ágeis e de colaboração para projetos de TV 3.0.",
  "innovation_highlights": [
    "Inovação na integração vertical dos processos, uso de inteligência aplicada e capacidade de controle avançado da transmissão e dos testes de TV 3.0."
  ],
  "team_analysis": "Equipe",
  "financial_analysis": "Não informado",
  "risk_assessment": "Médio",
  "compliance_score": 0,
  "justification": "Sem justificativa fornecida"
}
