Percorre todos os diretórios a partir de um diretório raiz (URL) e extrai as informações, devolvendo uma lista de 'Document'. As informações extraídas são:
- **page_content** -> _Conteúdo da página_
- **metadata:**
    - **source** -> _URL lida_
    - **title** -> _Título da página_
    - **description** -> _Descrição do conteúdo da página_
    - **language** -> _Idioma da página_

In [None]:
import weaviate
import os
import re
import logging

from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.embeddings import Embeddings
from langchain.vectorstores.weaviate import Weaviate
from langchain.indexes import index, SQLRecordManager
from langchain.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX

# Emit INFO-level (and above) records so the progress messages below are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Site origin; it is also stripped from crawled URLs when listing what was found.
PREFIX_URL = 'https://ai.stackspot.com'

# Root page passed to the recursive loader as the crawl starting point.
url = PREFIX_URL + '/docs/'

def simple_extractor(html: str) -> str:
    """Return the text of an HTML page with runs of blank lines collapsed.

    The markup is parsed with BeautifulSoup's ``lxml`` parser and its text is
    extracted. The text is then re-encoded as Latin-1 and decoded as UTF-8,
    presumably to repair UTF-8 content that was mis-decoded as Latin-1 before
    reaching this function. When that round trip is impossible, the extracted
    text is used unchanged instead of raising.

    Args:
        html: Raw HTML markup of a page.

    Returns:
        The page text with leading/trailing whitespace stripped and every run
        of two or more newlines reduced to exactly two.
    """
    text = Soup(html, "lxml").text
    try:
        text = text.encode("latin1").decode("utf-8")
    except UnicodeError:
        # UnicodeEncodeError: the text holds characters outside Latin-1.
        # UnicodeDecodeError: its Latin-1 bytes are not valid UTF-8.
        # Either way the text was not Latin-1-decoded UTF-8, so keep it as is.
        pass
    return re.sub(r"\n\n+", "\n\n", text).strip()

def load_api_docs():
    """Crawl the documentation site and return the loaded documents.

    The crawl starts at the module-level ``url``, is limited to a depth of
    10, and converts each fetched page to text with ``simple_extractor``.

    Returns:
        The result of ``RecursiveUrlLoader.load()`` for that crawl.
    """
    # Link pattern assembled from langchain's ignore-prefix/suffix regex
    # fragments; the trailing group ends the match at `#`, a quote, or a `/`
    # immediately followed by one of those.
    href_pattern = (
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    )
    crawler = RecursiveUrlLoader(
        url=url,
        max_depth=10,
        extractor=simple_extractor,
        # Can be used outside the Jupyter notebook:
        # use_async=True,
        link_regex=href_pattern,
        check_response_status=True,
    )
    return crawler.load()

# Crawl the documentation site once; `docs` is reused by the ingestion step.
docs = load_api_docs()

# Stop with an explicit message when the crawl produced nothing, instead of
# failing with a bare IndexError on docs[0].
if not docs:
    raise RuntimeError(f"No documents were loaded from {url}")

logger.info("Document example: %s", docs[0])

# Source URL of every crawled page with PREFIX_URL removed, for a quick check
# of what was found.
sources = [doc.metadata['source'].replace(PREFIX_URL, "") for doc in docs]

logger.info("URLs found: %s", sources)

Alimenta o vectorstore com as informações dos 'Documents' e cria uma tabela no PostgreSQL com o UUID, a chave, o namespace, o group_id e o updated_at dos objetos do vectorstore, para o gerenciamento dos dados.

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable of the same name; the literals are the local defaults.
WEAVIATE_DOCS_INDEX_NAME = os.environ.get(
    'WEAVIATE_DOCS_INDEX_NAME', 'My_Docs_Index_Name'
)
WEAVIATE_URL = os.environ.get('WEAVIATE_URL', 'http://localhost:8080')
# NOTE(review): the default embeds a database password in source; set
# RECORD_MANAGER_DB_URL in the environment for anything beyond local use.
RECORD_MANAGER_DB_URL = os.environ.get(
    'RECORD_MANAGER_DB_URL', 'postgresql://postgres:123admin@localhost'
)

def get_embeddings_model() -> Embeddings:
    """Build the embeddings model passed to the vectorstore.

    Returns:
        An ``OpenAIEmbeddings`` instance created with
        ``model="text-embedding-3-small"`` and ``chunk_size=200``.
    """
    embeddings_config = {"model": "text-embedding-3-small", "chunk_size": 200}
    return OpenAIEmbeddings(**embeddings_config)

def ingest_docs():
    """Split the crawled documents and index them into the Weaviate store.

    Reads the module-level ``docs`` list, splits it into chunks of up to 4000
    characters with a 200-character overlap, fills in empty ``source`` and
    ``title`` metadata where missing, and passes the chunks to langchain's
    ``index`` together with a ``SQLRecordManager`` and the Weaviate
    vectorstore. ``index`` is called with ``cleanup="full"`` and
    ``source_id_key="source"``. ``force_update`` is enabled only when the
    ``FORCE_UPDATE`` environment variable equals "true" (case-insensitive).
    The returned indexing statistics are logged at INFO level.
    """
    weaviate_client = weaviate.Client(url=WEAVIATE_URL)
    store = Weaviate(
        client=weaviate_client,
        index_name=WEAVIATE_DOCS_INDEX_NAME,
        text_key="text",
        embedding=get_embeddings_model(),
        by_text=False,
        attributes=["source", "title"],
    )

    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # "source" and "title" are the attributes configured on the vectorstore
    # above, so give every chunk both keys (empty string when absent).
    for chunk in chunks:
        for required_key in ("source", "title"):
            chunk.metadata.setdefault(required_key, "")

    manager = SQLRecordManager(
        f"weaviate/{WEAVIATE_DOCS_INDEX_NAME}", db_url=RECORD_MANAGER_DB_URL
    )
    manager.create_schema()

    # An unset or empty FORCE_UPDATE is treated as "false".
    force_flag = os.environ.get("FORCE_UPDATE") or "false"
    stats = index(
        chunks,
        manager,
        store,
        cleanup="full",
        source_id_key="source",
        force_update=force_flag.lower() == "true",
    )

    logger.info(f"Index Stats:  {stats}")

# Run the ingestion pipeline when this cell is executed.
ingest_docs()