<a href="https://colab.research.google.com/github/Mathvivas/Artificial-Intelligence/blob/main/RAG_Chroma_Docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pipeline de Ingestão de Dados (ETL)

- **Extração**: Carregar os documentos PDF das aulas.
- **Transformação**: Aplicar estratégias de chunking adaptativo e gerar embeddings.
- **Carregamento**: Indexar os chunks e seus embeddings em um banco de dados vetorial, o Chroma.

### Instalação e Importações

In [1]:
!pip install -q langchain langchain-core langchain-community
!pip install -q langchain-google-genai
!pip install -q chromadb pypdf rank_bm25
!pip install -q ragas datasets
!pip install -q langchain-text-splitters
!pip install -q langchain_core

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m2.3/2.5 MB[0m [31m65.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dep

In [2]:
from google.colab import userdata
# Read the Gemini API key from the Colab secrets panel (secret name
# 'GEMINI_API_KEY') so the key never appears in the notebook source.
GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')

In [3]:
from langchain_community.document_loaders import PyPDFDirectoryLoader



### Extract

In [4]:
# Extract: load every PDF found under the course folder into LangChain
# Document objects.
loader = PyPDFDirectoryLoader('documentos_curso/')
docs = loader.load()

# A folder without readable PDFs produces an empty list rather than an error.
# Stop here with a clear message instead of failing later, in the embedding /
# indexing step, with a much less obvious one.
if not docs:
    raise FileNotFoundError("Nenhum PDF encontrado em 'documentos_curso/'.")

In [5]:
# Sanity check: how many Document objects the loader returned.
len(docs)

98

### Transform

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
# Transform: break the loaded documents into overlapping chunks.
# The separators are tried from coarsest to finest (blank line, line break,
# sentence end, space, single character) until a piece fits `chunk_size`.
splitter_options = dict(
    chunk_size=1500,
    chunk_overlap=200,
    separators=['\n\n', '\n', '. ', ' ', ''],
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(docs)

print(f'Total de chunks criados: {len(chunks)}')

Total de chunks criados: 98


In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [9]:
# Embedding model handed to the vector store to turn chunks into vectors.
# Authenticated with the key read from the Colab secrets.
embedding_model = GoogleGenerativeAIEmbeddings(
    model='gemini-embedding-001',
    api_key=GOOGLE_API_KEY
)

### Load

In [10]:
from langchain_community.vectorstores import Chroma

In [None]:
# Load: embed every chunk and index it in a Chroma collection persisted under
# ./chroma_db.
# NOTE(review): re-running this cell appears to add the same chunks to the
# persisted collection again (and re-pay the embedding calls) — confirm, and
# consider reopening the existing directory instead when it already exists.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory='./chroma_db'
)

# Keep the store and the retriever under separate names: `vectorstore` must
# stay a vector store because `similarity_search` is called on it further
# down, and a retriever does not provide that method.
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})
print('Banco de dados criado com sucesso!')

In [17]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser

In [14]:
# Prompt for the answering step: the system message restricts the model to the
# supplied content; the human message carries the question ({query}) followed
# by the retrieved passages ({contexto}).
system_message = ('system', 'Responda usando exclusivamente o conteúdo fornecido.')
human_message = ('human', '{query}\n\nContexto: \n{contexto}\n\nResposta: ')
prompt_consulta = ChatPromptTemplate.from_messages([system_message, human_message])

In [16]:
# Chat model that writes the final answer.
# temperature=0 asks for the least random sampling; max_output_tokens caps the
# length of each answer at 400 tokens.
llm = ChatGoogleGenerativeAI(
    model='gemini-2.0-flash-lite',
    temperature=0,
    api_key=GOOGLE_API_KEY,
    max_output_tokens=400
)

In [18]:
# Answering pipeline: fill the prompt, call the model, then pass the model
# output through StrOutputParser.
chain = prompt_consulta | llm | StrOutputParser()

In [21]:
def response(question: str) -> str:
    """Answer `question` using passages retrieved from the indexed documents.

    The passages most similar to the question are fetched through the
    module-level `vectorstore`, joined into a single context string and sent,
    together with the question, through the module-level `chain`.

    `vectorstore` may hold either a vector store (which exposes
    `similarity_search`) or the retriever produced by `.as_retriever()`
    (which is queried through `invoke`); both are supported.

    Args:
        question: Natural-language question to answer.

    Returns:
        The value returned by `chain.invoke` (a string when the chain ends
        with `StrOutputParser`).
    """
    if hasattr(vectorstore, 'similarity_search'):
        documents = vectorstore.similarity_search(question)
    else:
        # A retriever has no `similarity_search`; it is called via `invoke`
        # and applies the `search_kwargs` it was created with.
        documents = vectorstore.invoke(question)

    context = '\n\n'.join(document.page_content for document in documents)
    return chain.invoke({'query': question, 'contexto': context})

In [None]:
# Smoke test: ask one question end to end and print the generated answer.
print(response('O que é chunking adaptativo?'))