In [1]:
!pip install python-docx
!pip install pypdf



In [2]:
import os
from pathlib import Path

In [3]:
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.components.converters import TextFileToDocument, PyPDFToDocument
from haystack.components.converters.docx import DOCXToDocument
from haystack.components.routers import FileTypeRouter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

In [4]:
# Инициализация хранилища (Chroma) с сохранением на диск
document_store = ChromaDocumentStore(persist_path="../data/chroma_index")

In [7]:
# Инициализация компонентов
file_router = FileTypeRouter(mime_types=["text/plain", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"])
txt_converter = TextFileToDocument() 
pdf_converter = PyPDFToDocument() 
docx_converter = DOCXToDocument()
splitter = DocumentSplitter(split_length=1000, split_overlap=100)
embedder = SentenceTransformersDocumentEmbedder("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
writer = DocumentWriter(document_store=document_store)


In [8]:
# Построение пайплайна
pipeline = Pipeline()
pipeline.add_component("router", file_router)
pipeline.add_component("txt_converter", txt_converter)
pipeline.add_component("pdf_converter", pdf_converter)
pipeline.add_component("docx_converter", docx_converter)
pipeline.add_component("splitter", splitter)
pipeline.add_component("embedder", embedder)
pipeline.add_component("writer", writer)

In [9]:
# Соединение узлов (маршрутизация файлов по типам)
pipeline.connect("router.text/plain", "txt_converter.sources")
pipeline.connect("router.application/pdf", "pdf_converter.sources")
pipeline.connect("router.application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx_converter.sources")
# Подключение конвертеров к последующим шагам
pipeline.connect("txt_converter.documents", "splitter.documents")
pipeline.connect("pdf_converter.documents", "splitter.documents")
pipeline.connect("docx_converter.documents", "splitter.documents")
pipeline.connect("splitter.documents", "embedder.documents")
pipeline.connect("embedder.documents", "writer.documents")

PipelineConnectError: Cannot connect 'pdf_converter.documents' with 'splitter.documents': splitter.documents is already connected to ['txt_converter'].


In [None]:
file_paths = ["../data/parsed_files" / Path(name) for name in os.listdir("../data/parsed_files")]

In [None]:
# Запуск индексирования на папке файлов
pipeline.run({"router": {"sources": file_paths}})