In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
import os
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from docx import Document as DocxDocument
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <A09E200C-3620-3399-800F-6831D2DFDFC0> /opt/anaconda3/envs/med277project/lib/python3.12/site-packages/torchvision/image.so
  warn(


In [2]:
class DocumentProcessor:
    def __init__(self, folder_path, chunk_size=1000, chunk_overlap=200):
        self.folder_path = folder_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
    
    def extract_text_from_docx(self, docx_path):
        doc = DocxDocument(docx_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return '\n'.join(full_text)

    def load_and_split_documents(self):
        all_chunks = []
        # Iterate over all files in the directory
        for filename in os.listdir(self.folder_path):
            file_path = os.path.join(self.folder_path, filename)
            if filename.endswith('.pdf'):
                loader = PyPDFLoader(file_path)
                documents = loader.load()
                
                # Split documents into chunks
                splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
                chunks = splitter.split_documents(documents)
                all_chunks.extend(chunks)

            elif filename.endswith('.docx'):
                print(file_path)
                document_text = self.extract_text_from_docx(file_path)

                # Create a single Document object with all text from the .docx file
                document_object = Document(
                    page_content=document_text,
                    metadata={
                        'source': file_path,
                        'page': 1  # Treat the entire document as one page for metadata
                    }
                )

                # Use RecursiveCharacterTextSplitter to split the full document text into chunks
                splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
                chunks = splitter.split_documents([document_object])  # Pass a list containing the single document object
                
                all_chunks.extend(chunks)

        return all_chunks

class VectorDatabase:
    def __init__(self, persist_directory):
        self.persist_directory = persist_directory
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'mps'})
        self.db = None

    def create_vector_store(self, documents):
        # Create a Chroma vector store from the document chunks
        self.db = Chroma.from_documents(documents, self.embeddings, persist_directory=self.persist_directory)


In [3]:
journals_folder = 'Journals'
persist_directory = './chroma_db'
doc_processor = DocumentProcessor(folder_path=journals_folder)
document_chunks = doc_processor.load_and_split_documents()


Journals/KOP_guidelines_2024.docx


In [4]:
vector_db = VectorDatabase(persist_directory=persist_directory)
vector_db.create_vector_store(document_chunks)
# vector_db.persist()

In [22]:
document_chunks[0]

Document(metadata={'source': 'Journals/1-s2.0-S1521689623000034-main.pdf', 'page': 0}, page_content='7\nPatient selection in ambulatory surgery\nJohn A. Hodgson, MD, Associate Professora,\nKyle L. Cyr, MD, Assistant Professora,\nBobbieJean Sweitzer, MD, FACP, SAMBA-F, FASA, Professorb, *\na Walter Reed National Military Medical Center and Uniformed Services University, 8901 Wisconsin Avenue,\nBethesda, MD, 20889, United States\nb Medical Education, University of Virginia, Systems Director, Preoperative Medicine, Inova Health, 3300\nGallows Road, Falls Church, VA, 22042, United States\nKeywords:\nambulatory\nanesthesia\nsurgery\noutpatient\nofﬁce-based\npreoperative\ncomorbidities\nambulatory surgicenters\nPatient selection is important for ambulatory surgical practices.\nProper patient selection for ambulatory practices will optimize\nresources and lead to increased patient and provider satisfaction.\nAs the number and complexity of procedures in ambulatory sur-\ngical centers increase

In [5]:
document_set = set()
for document_chunk in document_chunks:
    document_set.add(document_chunk.metadata['source'])
    if document_chunk.metadata['source'] == 'Journals/KOP_guidelines_2024.docx':
        print(document_chunk.page_content)
# document_set.add(document_chunk.metadata)

UCSD Koman Outpatient Pavilion Case and Patient Selection Criteria

Patient Selection The list below is not necessarily exclusion criteria (unless specifically stated as such), but rather cases that may require further MD review and potential workup before receiving clearance to have surgery at KOP. Please contact APC regarding questions for KOP clearance.
General Theme – What determines if an outpatient surgery should be done at KOP versus main OR? This is not always black and white and will require clinical judgement. The following guidelines aim to set some expectations but does not fill all the holes. In general, the thought process is that if the MD feels the following, surgery may need to move to the main hospital:
patient will be very high risk for admission, patient should be done in the main operating room. This is also dependent on the type of surgery and anesthesia plan (not just patient comorbidities)
patient will require more invasive monitoring for safe anesthesia (i.e. a

In [7]:
document_set

{'Journals/1-s2.0-S0883540324011811-main.pdf',
 'Journals/1-s2.0-S1521689622000568-main.pdf',
 'Journals/1-s2.0-S1521689623000034-main.pdf',
 'Journals/ACO.0000000000000919.pdf',
 'Journals/KOP_guidelines_2024.docx',
 'Journals/aco.0000000000000266.pdf',
 'Journals/j.anclin.2019.01.001.pdf',
 'Journals/patient_selection_for_adult_ambulatory_surgery__a.10.pdf',
 'Journals/pro_con_debate__are_patients_with_a_cardiovascular.7.pdf',
 'Journals/society_for_ambulatory_anesthesia_updated.4.pdf'}