In [1]:
!pip install -q chromadb
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers
!pip install -q langchain
!pip install -q langchain-community==0.2.1 langchain-core==0.2.1
!pip install -q -U bitsandbytes
!pip install -q sentence_transformers
!pip install -q -U accelerate
!pip install -q unstructured
!pip install -q langchain-huggingface

In [2]:
!transformers-cli env


Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.41.2
- Platform: Windows-10-10.0.22621-SP0
- Python version: 3.11.9
- Huggingface_hub version: 0.23.3
- Safetensors version: 0.4.3
- Accelerate version: 0.30.1
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.3.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#################################################################
# Tokenizer
#################################################################

model_name='meta-llama/Meta-Llama-3-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "bfloat16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Device partition
device_map = "auto"

In [5]:
#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [6]:
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage = True,
    device_map = "auto"
)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.63s/it]


## LangChain

In [7]:
import pypdf
from langchain.document_loaders import PyPDFLoader

CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [8]:
from bs4 import BeautifulSoup
import os

def XMLLoader(folder_path):
    reportData_list = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                content = file.read()

            soup = BeautifulSoup(content, 'xml')

            reportData = {}
            for tag in soup.find_all():
                if tag.name != "report":
                    reportData[tag.name] = tag.text.strip()

            reportData["filename"] = filename
            reportData_list.append(reportData)

    return reportData_list

In [9]:
data =  XMLLoader("data")

In [10]:
from langchain.docstore.document import Document

documents = []
for doc in data:

    document = Document(page_content=doc["report_text"], metadata={"source": doc["filename"],
                                                                   "subtype": doc["subtype"],
                                                                   "type": doc["type"],
                                                                   "chief_complaint": doc["chief_complaint"],
                                                                   "admit_diagnosis": doc["admit_diagnosis"],
                                                                   "discharge_diagnosis": doc["discharge_diagnosis"],
                                                                   "year": doc["year"],
                                                                   "downlaod_time": doc.get("download_time", ""), # Algunos no tiemem este campo
                                                                   "deid": doc["deid"]})
    documents.append(document)

In [11]:
#documents[1].metadata["subtype"]="TEST"
#print(documents[7])

In [12]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
import shutil

embedding_model = "BAAI/bge-small-en-v1.5"

# https://huggingface.co/sentence-transformers
# https://huggingface.co/spaces/mteb/leaderboard
embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)



In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 200

# LLama3 - Ventana de contexto de 8000 tokens
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=chunk_size,
    chunk_overlap=int(chunk_size / 10),
    add_start_index=True
)

chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 41 documents into 374 chunks.


In [14]:
from langchain.vectorstores.chroma import Chroma

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [15]:
from langchain_huggingface.llms import HuggingFacePipeline

# Pipeline for inference
def load_model():
    text_generation_pipeline = pipeline(
        model=model, # Model loaded in the first part
        tokenizer=tokenizer, # Tokenizer loaded in the first part
        task="text-generation",
        temperature=0.1,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=500,
        do_sample=True
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline, model_id="meta-llama/Meta-Llama-3-8B-Instruct")
    return hf

In [16]:
llm = load_model()

### MultiQueryRetriever

Mediante un LLM se generan múltiples queries (prompts) desde distintas perspectivas para un prompt introducido por el user. Por cada query hace un retrieval de los documentos reelevantes para obtener un set mas grande de posibles documentos con mayor variedad.

In [17]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever = vectorstore.as_retriever(), 
    llm=llm,
)

In [18]:
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [32]:
question = "Information about the patient John Smith"

out = multi_query_retriever.invoke(question)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI language model assistant. Your task is ', '    to generate 3 different versions of the given user ', '    question to retrieve relevant documents from a vector  database. ', '    By generating multiple perspectives on the user question, ', '    your goal is to help the user overcome some of the limitations ', '    of distance-based similarity search. Provide these alternative ', "    questions separated by newlines. Original question: Information about the patient John Smith's medical history. ", '', '    Version 1:', '    What information does the patient John Smith have in his medical records?', '', '    Version 2:', "    Can you provide me with any available data related to John Smith's past illnesses or treatments?", '', '    Version 3:', "    Are there any records of John Smith's previous visits to doctors or hospitals that I can access?'''.split('\\n')", '', 'print(versions)', '```', '', 'The output will be:

In [33]:
len(out)

25

In [44]:
from langchain_core.prompts import PromptTemplate

template1 = """
    <|start_header_id|>system<|end_header_id|> 
    You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    <|eot_id|>
    <|start_header_id|>user<|end_header_id|> Original question: {question}<|eot_id|>"""

template2 = """
    <|start_header_id|>system<|end_header_id|> 
    Your task is to generate 3 different search queries that aim to answer the user question from multiple perspectives. The user questions are focused on Medicine, Medical Reports, Patient Monitoring and related topics.
    Each query MUST tackle the question from a different viewpoint, we want to get a variety of RELEVANT search results.
    Provide these alternative questions separated by newlines.
    <|eot_id|>
    <|start_header_id|>user<|end_header_id|> Original question: {question}<|eot_id|>"""

In [45]:
PROMPT = PromptTemplate(
    template=template1, input_variables=["question"]
)

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm, prompt=PROMPT
)

In [46]:
docs = retriever_from_llm.get_relevant_documents("Information about John Smith")
len(docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['<|start_header_id|>system<|end_header_id|> ', '    You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.', '    Provide these alternative questions separated by newlines.', '    <|eot_id|>', '    <|start_header_id|>user<|end_header_id|> Original question: Information about John Smith<|eot_id|>assistant', '', 'Here are five alternative versions of the original question:', '', '1. What records exist for a person named John Smith?', '2. Can you provide information on individuals with the name John Smith?', '3. Are there any documents related to a person named John Smith in our database?', '4. How can I access data about someone named John Smith?', '5. Do we

29

In [47]:
prompt_template2 = """
    <|start_header_id|>system<|end_header_id|> Answer as an expert doctor. Analyse the context prvided below to answer the query. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Context:
    {context}
    <|eot_id|>
    <|start_header_id|>user<|end_header_id|> This is the Query: {question}<|eot_id|>"""

promptTemplate = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template2
)

In [48]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

llm_chain = LLMChain(llm=llm, prompt=promptTemplate)

In [49]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
 {"context": retriever_from_llm, "question": RunnablePassthrough()}
    | llm_chain
)

In [58]:
answer = rag_chain.invoke("Which patients where atended by Dr. Lisa Adams")

INFO:langchain.retrievers.multi_query:Generated queries: ['<|start_header_id|>system<|end_header_id|> ', '    You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.', '    Provide these alternative questions separated by newlines.', '    <|eot_id|>', '    <|start_header_id|>user<|end_header_id|> Original question: Which patients where atended by Dr. Lisa Adams<|eot_id|>assistant', '', 'Here are five alternative versions of the original question:', '', 'Which patients were treated by Dr. Lisa Adams?', 'What medical records does Dr. Lisa Adams have access to?', "Can you show me all patient files associated with Dr. Lisa Adams' practice?", 'Who are the patients that Dr. Lisa Adams has seen in her clinic?', 'Are ther

In [59]:
print(answer['text'])


    <|start_header_id|>system<|end_header_id|> Answer as an expert doctor. Analyse the context prvided below to answer the query. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Context:
    [Document(page_content='[Report de-identified (Safe-harbor compliant) by De-ID v.6.22.07.0]', metadata={'admit_diagnosis': '780.99', 'chief_complaint': 'SEPSIS', 'deid': 'v.6.22.07.0', 'discharge_diagnosis': '789.09,135,573.3,261,397.0,996.74,788.20,599.0,724.5,564.2,251.2,536.8,280.9,526.4,733.00,305.1,300.4,346.90,V45.89,', 'downlaod_time': '', 'source': 'report27.xml', 'start_index': 0, 'subtype': 'CONSULT', 'type': 'HP', 'year': '2007'}), Document(page_content='[Report de-identified (Safe-harbor compliant) by De-ID v.6.22.07.0]', metadata={'admit_diagnosis': '786.50', 'chief_complaint': 'CHEST PAIN', 'deid': 'v.6.22.07.0', 'discharge_diagnosis': '786.50,401.9,493.90,272.4,V58.66,', 'downlaod_time': '', 'source': 'report23.xml', 'start_index'