In [1]:
# Directory where the Chroma vector store is written (and wiped on rebuild).
CHROMA_PATH = "clinrec_chroma"
# Folder holding the source PDFs. Raw string: "\c" is not a valid escape
# sequence, so the non-raw spelling triggers an invalid-escape warning.
DATA_PATH = r"data\clinrec"
# Splitter settings; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 10000
CHUNK_OVERLAP = 1000

In [2]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [3]:
def get_embeddings():
    """Build the HuggingFace embedding model used to index and query chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``intfloat/multilingual-e5-large``, placed on the GPU when CUDA is
        available and on the CPU otherwise.
    """
    # Local import keeps this cell self-contained; torch is not imported
    # elsewhere in the notebook.
    import torch

    # Previously hard-coded to 'cuda', which fails on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large',
        model_kwargs={'device': device},
    )

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import DedocPDFLoader

from langchain_community.document_loaders import PyPDFLoader

import glob
import os

# Collect the PDFs that sit directly inside DATA_PATH.
# - os.path.join supplies the platform's path separator instead of a literal "\".
# - recursive=True was dropped: it only matters when the pattern contains "**".
# - sorted() makes the processing order deterministic across runs and file systems.
files = sorted(glob.glob(os.path.join(DATA_PATH, '*.pdf')))


In [5]:
def load_documents(paths=None):
    """Load PDFs and split them into overlapping text chunks.

    Args:
        paths: Optional iterable of PDF file paths. When omitted, the
            module-level ``files`` list is used, as before.

    Returns:
        A flat list with the chunks of every file, in the order the files
        were processed.
    """
    if paths is None:
        paths = files
    loaders = [PyPDFLoader(fp) for fp in paths]

    # The splitter configuration is identical for every file, so build it
    # once instead of on each loop iteration.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=True,
    )

    all_documents = []
    total = len(loaders)
    for count, loader in enumerate(loaders, start=1):
        print(f'{count}/{total}', "Loading raw document..." + loader.file_path)
        raw_documents = loader.load()

        print("Splitting text...")
        all_documents.extend(text_splitter.split_documents(raw_documents))

    return all_documents

In [6]:
from langchain.docstore.document import Document

def get_fixed_documents(documents):
    """Return copies of ``documents`` without the 'rotated_page_angles' metadata key.

    The input documents are left untouched: each result gets its own
    metadata dict rather than sharing (and mutating) the caller's.

    Args:
        documents: Sequence of objects exposing ``page_content`` and a
            dict-like ``metadata``.

    Returns:
        A new list of ``Document`` objects, one per input, in the same order.
    """
    fixed_documents = []
    total = len(documents)
    for count, document in enumerate(documents, start=1):
        print(f'{count}/{total}', "Обработка чанка")
        # Build a filtered copy instead of pop()-ing from the source dict,
        # so re-running the cell or reusing the inputs sees unmodified data.
        metadata = {
            key: value
            for key, value in document.metadata.items()
            if key != 'rotated_page_angles'
        }
        fixed_documents.append(
            Document(page_content=document.page_content, metadata=metadata)
        )
    return fixed_documents

In [7]:
import os
import shutil

def save_to_chroma(chunks: list):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is deleted first, then a new
    store is created from the given documents using ``get_embeddings()``.

    Args:
        chunks: Non-empty list of documents to embed and store.

    Raises:
        ValueError: If ``chunks`` is empty. Raised before anything on disk
            is touched, so an existing store is never wiped for nothing.
    """
    if not chunks:
        raise ValueError("No chunks to save; existing Chroma store left untouched.")

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, get_embeddings(), persist_directory=CHROMA_PATH
    )

    # NOTE(review): the captured notebook output shows a warning pointing at
    # the persist() call; newer Chroma wrappers presumably persist on their
    # own and may drop the method — confirm. Call it only when it exists.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [8]:
chunks = load_documents()

1/646 Loading raw document...data\clinrec\100_1.pdf
Splitting text...
2/646 Loading raw document...data\clinrec\103_3.pdf
Splitting text...
3/646 Loading raw document...data\clinrec\106_2.pdf
Splitting text...
4/646 Loading raw document...data\clinrec\109_2.pdf
Splitting text...
5/646 Loading raw document...data\clinrec\10_4.pdf
Splitting text...
6/646 Loading raw document...data\clinrec\112_2.pdf
Splitting text...
7/646 Loading raw document...data\clinrec\114_3.pdf
Splitting text...
8/646 Loading raw document...data\clinrec\115_2.pdf
Splitting text...
9/646 Loading raw document...data\clinrec\116_2.pdf
Splitting text...
10/646 Loading raw document...data\clinrec\117_2.pdf
Splitting text...
11/646 Loading raw document...data\clinrec\119_2.pdf
Splitting text...
12/646 Loading raw document...data\clinrec\11_3.pdf
Splitting text...
13/646 Loading raw document...data\clinrec\121_2.pdf
Splitting text...
14/646 Loading raw document...data\clinrec\127_2.pdf
Splitting text...
15/646 Loading ra

In [9]:
fixed_chunks=get_fixed_documents(chunks)

1/10322 Обработка чанка
2/10322 Обработка чанка
3/10322 Обработка чанка
4/10322 Обработка чанка
5/10322 Обработка чанка
6/10322 Обработка чанка
7/10322 Обработка чанка
8/10322 Обработка чанка
9/10322 Обработка чанка
10/10322 Обработка чанка
11/10322 Обработка чанка
12/10322 Обработка чанка
13/10322 Обработка чанка
14/10322 Обработка чанка
15/10322 Обработка чанка
16/10322 Обработка чанка
17/10322 Обработка чанка
18/10322 Обработка чанка
19/10322 Обработка чанка
20/10322 Обработка чанка
21/10322 Обработка чанка
22/10322 Обработка чанка
23/10322 Обработка чанка
24/10322 Обработка чанка
25/10322 Обработка чанка
26/10322 Обработка чанка
27/10322 Обработка чанка
28/10322 Обработка чанка
29/10322 Обработка чанка
30/10322 Обработка чанка
31/10322 Обработка чанка
32/10322 Обработка чанка
33/10322 Обработка чанка
34/10322 Обработка чанка
35/10322 Обработка чанка
36/10322 Обработка чанка
37/10322 Обработка чанка
38/10322 Обработка чанка
39/10322 Обработка чанка
40/10322 Обработка чанка
41/10322 

In [10]:
save_to_chroma(fixed_chunks)

  embeddings_hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Saved 10322 chunks to clinrec_chroma.


  db.persist()
