Etapa 1: Gerar PDFs com conteúdo

In [None]:
from reportlab.pdfgen import canvas
import os

# Make sure the output folder exists before any PDF is written into it.
os.makedirs("pdfs", exist_ok=True)

# Sample corpus: maps each PDF file name to the single line of text it contains.
texts = dict([
    ("documento1.pdf", "Inteligência Artificial é o campo que estuda como criar máquinas que pensam."),
    ("documento2.pdf", "O Azure oferece serviços de IA como OpenAI, Search e mais."),
    ("documento3.pdf", "RAG combina recuperação de documentos com geração de linguagem natural."),
])

# Write one single-page PDF per entry, drawing the text at a fixed position.
for pdf_name in texts:
    pdf_canvas = canvas.Canvas(f"pdfs/{pdf_name}")
    pdf_canvas.drawString(100, 750, texts[pdf_name])
    pdf_canvas.save()


Etapa 2: Indexar os PDFs com RAG no Azure

2.1 Ler e converter os PDFs em texto

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Args:
        path: File system path of the PDF to read.

    Returns:
        A single string with the text of all pages, in page order.
    """
    # The context manager closes the document even if a page fails to parse,
    # so the underlying file handle is never leaked.
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)

def _make_doc_key(filename):
    """Derive an Azure AI Search-safe document key from a PDF file name.

    Document keys may only contain letters, digits, dashes, underscores and
    equal signs, so the extension is dropped (its "." is not allowed) and any
    other unsupported character is replaced with "_".

    Note: two different file names can map to the same key after replacement
    (e.g. "a b.pdf" and "a_b.pdf"); rename the files if that happens.
    """
    stem = os.path.splitext(filename)[0]
    return "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "-_=" else "_"
        for ch in stem
    )


pdf_folder = "pdfs"
docs = []

# os.listdir returns entries in arbitrary order; sorting keeps the ingestion
# order stable between runs.
for filename in sorted(os.listdir(pdf_folder)):
    if filename.endswith(".pdf"):
        path = os.path.join(pdf_folder, filename)
        content = extract_text_from_pdf(path)
        docs.append({
            "id": _make_doc_key(filename),
            "content": content
        })


2.2 Indexar com Azure AI Search

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
# Model names below are the ones exposed by the GA azure-search-documents SDK
# (11.4+). The earlier beta names (SemanticSettings, PrioritizedFields) do not
# coexist with HnswAlgorithmConfiguration / VectorSearchProfile in any release.
from azure.search.documents.indexes.models import (
    SearchIndex, SimpleField, SearchableField, SearchField, SearchFieldDataType,
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchProfile,
    SemanticSearch, SemanticConfiguration, SemanticPrioritizedFields, SemanticField
)

# Placeholders: replace with your own service values. Do not commit a real key
# to the notebook; prefer loading it from an environment variable or prompt.
endpoint = "https://<seu-endpoint>.search.windows.net"
key = "<sua-chave>"
index_name = "docs-index"

index_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Create the index
fields = [
    SimpleField(name="id", type="Edm.String", key=True),
    SearchableField(name="content", type="Edm.String"),
    # Vector fields are Collection(Edm.Single) fields bound to a vector search
    # profile; 1536 is the output size of text-embedding-ada-002.
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="default"
    )
]

index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="default")],
        profiles=[VectorSearchProfile(name="default", algorithm_configuration_name="default")]
    ),
    semantic_search=SemanticSearch(
        configurations=[
            SemanticConfiguration(
                name="default",
                prioritized_fields=SemanticPrioritizedFields(
                    content_fields=[SemanticField(field_name="content")]
                )
            )
        ]
    )
)

# create_or_update keeps the cell re-runnable: it does not fail if the index
# already exists.
index_client.create_or_update_index(index=index)


2.3 Adicionar documentos vetorizados com o Azure OpenAI

In [None]:
from azure.search.documents import SearchClient
from openai import AzureOpenAI

# Client used to push documents into (and query) the index created earlier.
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(key),
)

# Azure OpenAI client used to compute embeddings. The endpoint and key are
# placeholders; do not commit real secrets to the notebook.
aoai = AzureOpenAI(
    api_version="2024-02-15-preview",
    api_key="<sua-chave-openai>",
    azure_endpoint="https://<seu-openai-endpoint>.openai.azure.com/",
)

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from Azure OpenAI.

    Args:
        text: The string to embed.
        model: Name passed as ``model`` to the embeddings endpoint. Defaults
            to the value this notebook has always used, so existing callers
            are unaffected; override it when your deployment is named
            differently.

    Returns:
        The embedding of ``text`` as returned by the service (first and only
        item of the response, since a single input is sent).
    """
    response = aoai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Azure AI Search accepts at most 1000 documents per indexing request.
UPLOAD_BATCH_SIZE = 1000

# Embed every document first, then upload in batches instead of issuing one
# HTTP request per document.
pending = []
for doc in docs:
    if not doc["content"].strip():
        # The embeddings endpoint rejects empty input; skip such documents
        # (e.g. scanned PDFs with no text layer) instead of aborting the run.
        print(f"Skipping '{doc['id']}': no extractable text")
        continue
    pending.append({
        "id": doc["id"],
        "content": doc["content"],
        "contentVector": get_embedding(doc["content"])
    })

for start in range(0, len(pending), UPLOAD_BATCH_SIZE):
    results = search_client.upload_documents(
        documents=pending[start:start + UPLOAD_BATCH_SIZE]
    )
    # upload_documents reports success per document; surface failures rather
    # than continuing with a partially populated index.
    failed = [r.key for r in results if not r.succeeded]
    if failed:
        raise RuntimeError(f"Failed to index documents: {failed}")


Etapa 3: Consultar com RAG (LangChain + Azure)

In [None]:
import os

# NOTE(review): LangChain's AzureSearch resolves index field names from
# environment variables when its module is first imported, defaulting the
# vector field to "content_vector". The index in this notebook names it
# "contentVector", so the override must be set BEFORE the import below.
# Confirm against the installed LangChain version.
os.environ.setdefault("AZURESEARCH_FIELDS_CONTENT_VECTOR", "contentVector")

from langchain.chat_models import AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.embeddings import OpenAIEmbeddings

# AzureSearch is a vector store; RetrievalQA needs a retriever built from it.
vector_store = AzureSearch(
    azure_search_endpoint=endpoint,
    azure_search_key=key,
    index_name=index_name,
    embedding_function=get_embedding
)
retriever = vector_store.as_retriever()

llm = AzureChatOpenAI(
    openai_api_base="https://<seu-openai-endpoint>.openai.azure.com/",
    openai_api_version="2024-02-15-preview",
    deployment_name="gpt-35-turbo",  # or gpt-4
    openai_api_key="<sua-chave-openai>"
)

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

query = "O que é RAG?"
# With return_source_documents=True the chain has two outputs ("result" and
# "source_documents"), so .run() raises; call the chain with a dict instead.
response = qa_chain({"query": query})
result = response["result"]
source_documents = response["source_documents"]

print("Resposta:", result)