In [1]:
# Directory where the persisted Chroma vector store is written.
CHROMA_PATH = "clinrec_chroma_new"
# Folder containing the source PDF files. Written as a raw string because
# "\c" is not a valid escape sequence; the resulting value is unchanged.
DATA_PATH = r"data\clinrec"
# Maximum chunk length handed to the text splitter (measured with len()).
CHUNK_SIZE = 3000
# Number of characters shared between consecutive chunks.
CHUNK_OVERLAP = 300

In [2]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [3]:
def get_embeddings(model_name='intfloat/multilingual-e5-large', device='cuda'):
    """Create the HuggingFace embedding model used for indexing.

    Args:
        model_name: Name of the HuggingFace model to load. Defaults to the
            multilingual E5 large model this notebook was written for.
        device: Device string forwarded to the model through
            ``model_kwargs`` (for example ``'cuda'`` or ``'cpu'``).

    Returns:
        A configured ``HuggingFaceEmbeddings`` instance.
    """
    # NOTE(review): the E5 model family is documented as expecting
    # "query: " / "passage: " prefixes on its inputs -- confirm whether the
    # retrieval side of this project adds them.
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
    )

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import DedocPDFLoader

from langchain_community.document_loaders import PyPDFLoader

import glob
import os

# Collect the PDF files located directly inside DATA_PATH.
# - os.path.join supplies the platform's path separator instead of a
#   hard-coded backslash.
# - recursive=True was dropped: it only affects patterns containing "**",
#   so it never had any effect on this pattern.
# - sorted() makes the processing order deterministic instead of relying on
#   the order in which the filesystem lists entries.
files = sorted(glob.glob(os.path.join(DATA_PATH, '*.pdf')))


In [5]:
def load_documents(file_paths=None):
    """Load PDF files and split them into overlapping text chunks.

    Args:
        file_paths: Optional iterable of PDF paths to process. When omitted,
            the module-level ``files`` list is used, so existing
            zero-argument calls behave as before.

    Returns:
        A single flat list containing the chunks of every file, in the order
        the files were processed.
    """
    paths = files if file_paths is None else file_paths
    # All loaders are created before any file is read, as in the original
    # implementation.
    loaders = [PyPDFLoader(fp) for fp in paths]

    # The splitter configuration does not depend on the file, so it is built
    # once instead of once per document.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=True,
    )

    all_documents = []
    total = len(loaders)
    for count, loader in enumerate(loaders, start=1):
        print(f'{count}/{total}', "Loading raw document..." + loader.file_path)
        raw_documents = loader.load()

        print("Splitting text...")
        all_documents.extend(text_splitter.split_documents(raw_documents))

    return all_documents

In [6]:
from langchain.docstore.document import Document

def get_fixed_documents(documents):
    """Return copies of the chunks without the 'rotated_page_angles' metadata key.

    The input documents are left untouched: every returned ``Document`` gets
    its own metadata dict rather than sharing (and mutating) the dict of the
    source chunk.

    Args:
        documents: Sequence of chunks exposing ``page_content`` and
            ``metadata`` attributes.

    Returns:
        A new list of ``Document`` objects, one per input chunk, in the same
        order.
    """
    fixed_documents = []
    total = len(documents)
    for count, document in enumerate(documents, start=1):
        print(f'{count}/{total}', "Обработка чанка")
        # Build a filtered copy instead of pop()-ing from the caller's dict.
        metadata = {
            key: value
            for key, value in document.metadata.items()
            if key != 'rotated_page_angles'
        }
        fixed_documents.append(
            Document(page_content=document.page_content, metadata=metadata)
        )
    return fixed_documents

In [7]:
import os
import shutil

def save_to_chroma(chunks: list):
    """Rebuild the Chroma database at CHROMA_PATH from the given chunks.

    Any existing database directory is removed and replaced. The input check
    and the embedding model creation both happen *before* the old directory
    is deleted, so a failure in either leaves the previous database intact.

    Args:
        chunks: Documents to embed and store.

    Raises:
        ValueError: If ``chunks`` is empty; nothing on disk is touched.
    """
    if not chunks:
        raise ValueError(
            f"No chunks to save; existing database at {CHROMA_PATH} left untouched."
        )

    # Create the embedding model first: if this fails, the old DB survives.
    embeddings = get_embeddings()

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    # NOTE(review): this call produces a warning in the notebook output;
    # recent Chroma versions appear to persist automatically -- confirm
    # before removing it.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [9]:
chunks = load_documents()

1/560 Loading raw document...data\clinrec\100_1.pdf
Splitting text...
2/560 Loading raw document...data\clinrec\103_3.pdf
Splitting text...
3/560 Loading raw document...data\clinrec\106_2.pdf
Splitting text...
4/560 Loading raw document...data\clinrec\109_2.pdf
Splitting text...
5/560 Loading raw document...data\clinrec\10_4.pdf
Splitting text...
6/560 Loading raw document...data\clinrec\112_2.pdf
Splitting text...
7/560 Loading raw document...data\clinrec\114_3.pdf
Splitting text...
8/560 Loading raw document...data\clinrec\115_2.pdf
Splitting text...
9/560 Loading raw document...data\clinrec\116_2.pdf
Splitting text...
10/560 Loading raw document...data\clinrec\117_2.pdf
Splitting text...
11/560 Loading raw document...data\clinrec\119_2.pdf
Splitting text...
12/560 Loading raw document...data\clinrec\11_3.pdf
Splitting text...
13/560 Loading raw document...data\clinrec\121_2.pdf
Splitting text...
14/560 Loading raw document...data\clinrec\127_2.pdf
Splitting text...
15/560 Loading ra

In [10]:
fixed_chunks=get_fixed_documents(chunks)

1/29708 Обработка чанка
2/29708 Обработка чанка
3/29708 Обработка чанка
4/29708 Обработка чанка
5/29708 Обработка чанка
6/29708 Обработка чанка
7/29708 Обработка чанка
8/29708 Обработка чанка
9/29708 Обработка чанка
10/29708 Обработка чанка
11/29708 Обработка чанка
12/29708 Обработка чанка
13/29708 Обработка чанка
14/29708 Обработка чанка
15/29708 Обработка чанка
16/29708 Обработка чанка
17/29708 Обработка чанка
18/29708 Обработка чанка
19/29708 Обработка чанка
20/29708 Обработка чанка
21/29708 Обработка чанка
22/29708 Обработка чанка
23/29708 Обработка чанка
24/29708 Обработка чанка
25/29708 Обработка чанка
26/29708 Обработка чанка
27/29708 Обработка чанка
28/29708 Обработка чанка
29/29708 Обработка чанка
30/29708 Обработка чанка
31/29708 Обработка чанка
32/29708 Обработка чанка
33/29708 Обработка чанка
34/29708 Обработка чанка
35/29708 Обработка чанка
36/29708 Обработка чанка
37/29708 Обработка чанка
38/29708 Обработка чанка
39/29708 Обработка чанка
40/29708 Обработка чанка
41/29708 

In [11]:
save_to_chroma(fixed_chunks)

  embeddings_hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Saved 29708 chunks to clinrec_chroma_new.


  db.persist()
