In [16]:
# Directory containing the source PDF files.
DATA_PATH = "data/rls"
# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "rls_chroma"

# Text-splitter settings, measured in characters (the splitter uses len()).
# NOTE(review): 10 000 characters probably exceeds the input limit of the
# embedding model chosen in get_embeddings (E5 models are documented with a
# 512-token limit), so the tail of each chunk may be ignored when embedding
# -- confirm before relying on retrieval quality.
CHUNK_SIZE = 10_000
CHUNK_OVERLAP = 1_000

In [17]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [18]:
def get_embeddings(model_name='intfloat/multilingual-e5-large', device='cuda'):
    """Create the HuggingFace embedding model used to vectorise chunks.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the multilingual E5 model this notebook was built with.
        device: Device string forwarded through ``model_kwargs``. Defaults to
            ``'cuda'``; pass ``'cpu'`` on machines without a GPU.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
    )

In [19]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import DedocPDFLoader

from langchain_community.document_loaders import PyPDFLoader

import glob
import os

# Build the pattern with os.path.join instead of a hard-coded backslash: glob
# does not treat "\" as a separator on POSIX, so the old pattern matched
# nothing there. `recursive=True` was dropped because it only affects patterns
# containing "**". Sorting makes the processing order deterministic.
files = sorted(glob.glob(os.path.join(DATA_PATH, '*.pdf')))


In [20]:
def load_documents():
    """Load every PDF listed in the module-level ``files`` and split it into chunks.

    Progress is printed for each file. Chunks are produced by a
    ``RecursiveCharacterTextSplitter`` configured from ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``.

    Returns:
        A flat list containing the chunks of all files, in file order.
    """
    # The splitter configuration is the same for every file, so build it once
    # instead of re-creating it on each loop iteration.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=True,
    )

    # Loaders are still created up front, as before, so any error raised by
    # the loader constructor surfaces before the first file is loaded.
    loaders = [PyPDFLoader(fp) for fp in files]
    total = len(loaders)

    all_documents = []
    for count, loader in enumerate(loaders, start=1):
        print(f'{count}/{total}', "Loading raw document..." + loader.file_path)
        raw_documents = loader.load()

        print("Splitting text...")
        all_documents.extend(text_splitter.split_documents(raw_documents))

    return all_documents

In [21]:
from langchain.docstore.document import Document

def get_fixed_documents(documents):
    """Return new documents without the 'rotated_page_angles' metadata key.

    Args:
        documents: Sequence of objects exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        A list of fresh ``Document`` objects with the same page content and a
        copy of the metadata minus ``'rotated_page_angles'``. The input
        documents are left unmodified.
    """
    fixed_documents = []
    total = len(documents)
    for count, document in enumerate(documents, start=1):
        print(f'{count}/{total}', "Обработка чанка")
        # Work on a copy: popping from document.metadata directly would also
        # strip the key from the caller's original documents.
        metadata = dict(document.metadata)
        # NOTE(review): presumably this key is dropped because the vector
        # store cannot hold its value -- confirm.
        metadata.pop('rotated_page_angles', None)
        fixed_documents.append(
            Document(page_content=document.page_content, metadata=metadata)
        )
    return fixed_documents

In [22]:
import os
import shutil

def save_to_chroma(chunks: list):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing store directory is removed and replaced by a new one built
    from the given documents.

    Args:
        chunks: Documents to embed and store.

    Raises:
        ValueError: If ``chunks`` is empty. This is checked before anything
            is deleted so an existing store is not wiped for nothing.
    """
    if not chunks:
        raise ValueError("save_to_chroma() needs at least one chunk; existing store left untouched.")

    # Load the embedding model before deleting anything, so a failure here
    # (e.g. the requested device is unavailable) does not destroy the old store.
    embeddings = get_embeddings()

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    # NOTE(review): the captured cell output shows a warning pointing at this
    # call; newer Chroma releases reportedly persist automatically and
    # deprecate persist() -- confirm the installed version before removing it.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [23]:
# Load all PDFs collected in `files` and split them into chunks (prints progress per file).
chunks = load_documents()

1/339 Loading raw document...data/rls\9-ка СТОПразит.pdf
Splitting text...
2/339 Loading raw document...data/rls\Адаптол.pdf
Splitting text...
3/339 Loading raw document...data/rls\АДАСЕЛЬ®.pdf
Splitting text...
4/339 Loading raw document...data/rls\Аклиф.pdf
Splitting text...
5/339 Loading raw document...data/rls\Аллегра.pdf
Splitting text...
6/339 Loading raw document...data/rls\Аллергоферон® бета.pdf
Splitting text...
7/339 Loading raw document...data/rls\Аллергоферон®.pdf
Splitting text...
8/339 Loading raw document...data/rls\Альфаксим®.pdf
Splitting text...
9/339 Loading raw document...data/rls\Амлодипин+Периндоприл-СЗ.pdf
Splitting text...
10/339 Loading raw document...data/rls\Анаферон® детский.pdf
Splitting text...
11/339 Loading raw document...data/rls\Анаферон®.pdf
Splitting text...
12/339 Loading raw document...data/rls\Арипризол®.pdf
Splitting text...
13/339 Loading raw document...data/rls\Армадин® лонг.pdf
Splitting text...
14/339 Loading raw document...data/rls\Артрофоон

In [24]:
# Rebuild each chunk as a new Document without the 'rotated_page_angles' metadata key.
fixed_chunks=get_fixed_documents(chunks)

1/2384 Обработка чанка
2/2384 Обработка чанка
3/2384 Обработка чанка
4/2384 Обработка чанка
5/2384 Обработка чанка
6/2384 Обработка чанка
7/2384 Обработка чанка
8/2384 Обработка чанка
9/2384 Обработка чанка
10/2384 Обработка чанка
11/2384 Обработка чанка
12/2384 Обработка чанка
13/2384 Обработка чанка
14/2384 Обработка чанка
15/2384 Обработка чанка
16/2384 Обработка чанка
17/2384 Обработка чанка
18/2384 Обработка чанка
19/2384 Обработка чанка
20/2384 Обработка чанка
21/2384 Обработка чанка
22/2384 Обработка чанка
23/2384 Обработка чанка
24/2384 Обработка чанка
25/2384 Обработка чанка
26/2384 Обработка чанка
27/2384 Обработка чанка
28/2384 Обработка чанка
29/2384 Обработка чанка
30/2384 Обработка чанка
31/2384 Обработка чанка
32/2384 Обработка чанка
33/2384 Обработка чанка
34/2384 Обработка чанка
35/2384 Обработка чанка
36/2384 Обработка чанка
37/2384 Обработка чанка
38/2384 Обработка чанка
39/2384 Обработка чанка
40/2384 Обработка чанка
41/2384 Обработка чанка
42/2384 Обработка чанка
4

In [25]:
# Embed the cleaned chunks and write them to CHROMA_PATH (any existing store there is deleted first).
save_to_chroma(fixed_chunks)

  embeddings_hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Saved 2384 chunks to rls_chroma.


  db.persist()
