In [1]:
# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "clinrec_chroma"
# Folder with the source PDFs. Written as a raw string because "\c" is not a
# valid escape sequence; the resulting value is unchanged.
DATA_PATH = r"data\clinrec_lm"
# Text splitter settings, measured in characters.
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 300

In [2]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [3]:
def get_embeddings(device=None):
    """Create the HuggingFace embedding model used for the vector store.

    Args:
        device: Device name passed to the model (e.g. 'cuda' or 'cpu').
            When omitted, 'cuda' is used if PyTorch reports a usable GPU,
            otherwise 'cpu'.

    Returns:
        A HuggingFaceEmbeddings instance for 'intfloat/multilingual-e5-large'.
    """
    if device is None:
        try:
            import torch

            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        except ImportError:
            # Without PyTorch there is no way to probe for a GPU.
            device = 'cpu'

    return HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large',
        model_kwargs={'device': device},
    )

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import DedocPDFLoader

from langchain_community.document_loaders import PyPDFLoader

import glob 
# PDFs located directly inside DATA_PATH. No recursive flag is passed because
# the pattern contains no "**"; sorting makes the processing order (and thus
# the chunk order) reproducible, since glob returns matches in arbitrary order.
files = sorted(glob.glob(DATA_PATH + r'\*.pdf'))


In [5]:
def load_documents(file_paths=None):
    """Load each PDF and split it into overlapping text chunks.

    Args:
        file_paths: Iterable of PDF paths to process. Defaults to the
            module-level ``files`` list.

    Returns:
        A flat list containing the chunks of every document, in input order.
        An empty list when there are no paths to process.
    """
    paths = list(files if file_paths is None else file_paths)
    if not paths:
        return []

    # The splitter settings do not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=True,
    )

    all_documents = []
    total = len(paths)
    for count, path in enumerate(paths, start=1):
        loader = DedocPDFLoader(path, language='rus+eng')
        print(f'{count}/{total}', "Loading raw document..." + loader.file_path)
        raw_documents = loader.load()

        print("Splitting text...")
        all_documents.extend(text_splitter.split_documents(raw_documents))

    return all_documents

In [6]:
# Prompt (in Russian) sent to the LLM for every chunk. It asks the model to
# paraphrase the text, remove stray symbols such as "**" and "#" and superfluous
# details, keep the meaning exact because the document is medical, and reply
# with the corrected text only. {context} is replaced with the chunk text.
PROMPT_TEMPLATE = """
Пожалуйста, перефразируйте следующий текст, удаляя ненужные символы (например, "**", "#", и т.д.) и лишние детали, 
при этом сохраняя всю основную информацию без потери смысла. Так как это медицинский документ, важно быть максимально точным и сохранить исходный смысл.
В ответе запиши только исправленный текст
{context}
"""

In [7]:
import argparse
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import Chroma



def get_fixed_text(query_text):
    """Ask the local LLM to paraphrase and clean up one piece of text.

    Args:
        query_text: Text substituted for ``{context}`` in PROMPT_TEMPLATE.

    Returns:
        The text content of the model's reply.
    """
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    # Build chat messages instead of a flattened string: the string produced
    # by ChatPromptTemplate.format() starts with a "Human: " role prefix,
    # which would otherwise be sent to the model as part of the text.
    messages = prompt_template.format_messages(context=query_text)

    # Connect to LM Studio and send the request.
    model = ChatOpenAI(temperature=0.7, base_url="http://localhost:1234/v1", api_key="not-needed")
    response = model.invoke(messages)

    return response.content

In [8]:
from langchain.docstore.document import Document

def get_fixed_documents(documents):
    """Run every chunk through get_fixed_text and return new documents.

    Args:
        documents: Sequence of chunk documents exposing ``page_content`` and
            ``metadata``.

    Returns:
        A list of new Document objects holding the rewritten text and a copy
        of the original metadata without the 'rotated_page_angles' key.
    """
    fixed_documents = []
    total = len(documents)
    for count, document in enumerate(documents, start=1):
        print(f'{count}/{total}', "Обработка чанка")
        # Work on a copy so the input documents keep their metadata intact.
        metadata = dict(document.metadata)
        # NOTE(review): presumably dropped because the vector store cannot
        # hold this value as metadata - confirm.
        metadata.pop('rotated_page_angles', None)
        fixed_text = get_fixed_text(document.page_content)
        fixed_documents.append(Document(page_content=fixed_text, metadata=metadata))
    return fixed_documents

In [9]:
import os
import shutil

def save_to_chroma(chunks: list):
    """Replace the Chroma store at CHROMA_PATH with one built from ``chunks``.

    Args:
        chunks: Documents to embed and store.

    Raises:
        ValueError: If ``chunks`` is empty. The existing store is not touched
            in that case.
    """
    if not chunks:
        raise ValueError("No chunks to save; the existing Chroma store was left untouched.")

    # Build the embedder before deleting anything, so a failure to load the
    # model does not cost us the existing store.
    embeddings = get_embeddings()

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    # NOTE(review): newer Chroma versions may persist automatically and warn
    # on this call - confirm against the installed version.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [10]:
# Load every source PDF and split it into chunks.
chunks = load_documents()

1/6 Loading raw document...data\clinrec_lm\КР103.pdf


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Splitting text...
2/6 Loading raw document...data\clinrec_lm\КР206.pdf




Splitting text...
3/6 Loading raw document...data\clinrec_lm\КР223.pdf




Splitting text...
4/6 Loading raw document...data\clinrec_lm\КР396.pdf




Splitting text...
5/6 Loading raw document...data\clinrec_lm\КР630.pdf




Splitting text...
6/6 Loading raw document...data\clinrec_lm\КР751.pdf




Splitting text...


In [11]:
# Rewrite each chunk through the LLM. This is slow: in the recorded run the
# requests for 202 chunks completed roughly one to two minutes apart.
fixed_chunks=get_fixed_documents(chunks)

1/202 Обработка чанка




2024-11-13 01:36:30,342 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\httpx\_client.py - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
2/202 Обработка чанка
2024-11-13 01:38:05,774 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\httpx\_client.py - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
3/202 Обработка чанка
2024-11-13 01:38:50,988 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\httpx\_client.py - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
4/202 Обработка чанка
2024-11-13 01:40:10,006 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\httpx\_client.py - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
5/202 Обработка чанка
2024-11-13 01:42:05,741 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\httpx\_client.py - INFO - HTTP Request: POST http://localhost:1234/v1/chat/complet

In [12]:
# Rebuild the vector store from the rewritten chunks; any existing store at
# CHROMA_PATH is deleted first.
save_to_chroma(fixed_chunks)



2024-11-13 06:08:48,800 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\sentence_transformers\SentenceTransformer.py - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large
2024-11-13 06:08:54,470 - c:\Users\kopys\.conda\envs\ollama-pdf\lib\site-packages\chromadb\telemetry\product\posthog.py - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.




Saved 202 chunks to clinrec_chroma.


