In [19]:
!pip install -q chromadb
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers
!pip install -q langchain
!pip install -q langchain-community==0.2.1 langchain-core==0.2.1
!pip install -q -U bitsandbytes
!pip install -q sentence_transformers
!pip install -q -U accelerate
!pip install -q unstructured
!pip install -q langchain-huggingface

In [2]:
!transformers-cli env


Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.41.2
- Platform: Windows-10-10.0.22621-SP0
- Python version: 3.11.9
- Huggingface_hub version: 0.23.3
- Safetensors version: 0.4.3
- Accelerate version: 0.30.1
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.3.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#################################################################
# Tokenizer
#################################################################

model_name='meta-llama/Meta-Llama-3-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "bfloat16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Device partition
device_map = "auto"

In [4]:
#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [5]:
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config=bnb_config,
    low_cpu_mem_usage = True,
    device_map = "auto"
)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.87s/it]


## LangChain

In [6]:
import pypdf
from langchain.document_loaders import PyPDFLoader

CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [7]:
from bs4 import BeautifulSoup
import os

def XMLLoader(folder_path):
    reportData_list = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                content = file.read()

            soup = BeautifulSoup(content, 'xml')

            reportData = {}
            for tag in soup.find_all():
                if tag.name != "report":
                    reportData[tag.name] = tag.text.strip()

            reportData["filename"] = filename
            reportData_list.append(reportData)

    return reportData_list

In [8]:
data =  XMLLoader("data")

In [9]:
from langchain.docstore.document import Document

documents = []
for doc in data:

    document = Document(page_content=doc["report_text"], metadata={"source": doc["filename"],
                                                                   "subtype": doc["subtype"],
                                                                   "type": doc["type"],
                                                                   "chief_complaint": doc["chief_complaint"],
                                                                   "admit_diagnosis": doc["admit_diagnosis"],
                                                                   "discharge_diagnosis": doc["discharge_diagnosis"],
                                                                   "year": doc["year"],
                                                                   "downlaod_time": doc.get("download_time", ""), # Algunos no tiemem este campo
                                                                   "deid": doc["deid"]})
    documents.append(document)

In [10]:
#documents[1].metadata["subtype"]="TEST"
#print(documents[7])

In [11]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
import shutil

embedding_model = "BAAI/bge-small-en-v1.5"

# https://huggingface.co/sentence-transformers
# https://huggingface.co/spaces/mteb/leaderboard
embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)



In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 500

# LLama3 - Ventana de contexto de 8000 tokens
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=chunk_size,
    chunk_overlap=int(chunk_size / 10),
    add_start_index=True
)

chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 41 documents into 187 chunks.


In [13]:
from langchain.vectorstores.chroma import Chroma

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [14]:
from langchain_huggingface.llms import HuggingFacePipeline

# Pipeline for inference
def load_model():
    text_generation_pipeline = pipeline(
        model=model, # Model loaded in the first part
        tokenizer=tokenizer, # Tokenizer loaded in the first part
        task="text-generation",
        temperature=0.1,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=500,
        do_sample=True
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline, model_id="meta-llama/Meta-Llama-3-8B-Instruct")
    return hf

In [15]:
llm = load_model()

### MultiQueryRetriever

Mediante un LLM se generan múltiples queries (prompts) desde distintas perspectivas para un prompt introducido por el user. Por cada query hace un retrieval de los documentos reelevantes para obtener un set mas grande de posibles documentos con mayor variedad.

In [22]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever = vectorstore.as_retriever(), 
    llm=llm,
)

In [17]:
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [18]:
question = "Retrieve all the names of patients with Chronic obstructive pulmonary disease"

out = multi_query_retriever.invoke(question)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI language model assistant. Your task is ', '    to generate 3 different versions of the given user ', '    question to retrieve relevant documents from a vector  database. ', '    By generating multiple perspectives on the user question, ', '    your goal is to help the user overcome some of the limitations ', '    of distance-based similarity search. Provide these alternative ', '    questions separated by newlines. Original question: Retrieve all the names of patients with Chronic obstructive pulmonary disease (COPD) who have undergone lung transplantation in the past year.', '', "    Here's my attempt:", '', '    What medical conditions do patients with COPD typically undergo treatment for?', '    Which patients with COPD have received organ transplants within the last 12 months?', '    Can you provide a list of patient names who have been diagnosed with COPD and have undergone lung transplant surgery?', '', "  

In [19]:
print(out)

[Document(page_content='Trans:    **DATE[Mar 08 2007] 09:18:23\n**INITIALS\n \nR:  **DATE[Mar 08 2007] 09:18:23/ks', metadata={'admit_diagnosis': '786.50', 'chief_complaint': 'CHEST PAIN', 'deid': 'v.6.22.07.0', 'discharge_diagnosis': '786.50,401.9,493.90,272.4,V58.66,', 'downlaod_time': '', 'source': 'report23.xml', 'start_index': 3575, 'subtype': '', 'type': 'HP', 'year': '2007'}), Document(page_content='CC:\n \n \n \n \n*** Dictated By: -**NAME[XXX, TTT] ***\nElectronically Signed by **NAME[WWW XXX]  **DATE[Apr 03 2007] 05:40:23 AM', metadata={'admit_diagnosis': '436', 'chief_complaint': 'STROKE', 'deid': 'v.6.22.08.0', 'discharge_diagnosis': '433.11,410.71,998.12,427.31,427.32,424.0,787.2,401.9,250.00,414.01,414.02,', 'downlaod_time': '', 'source': 'report24.xml', 'start_index': 2552, 'subtype': 'CCM ATTEND', 'type': 'PGN', 'year': '2007'}), Document(page_content='T:  **DATE[Mar 02 2007] 08:55:15\nR:  **DATE[Mar 02 2007] 08:55:15/hmj\nJob ID:  328095/**ID-NUM\nCc:\n \n*** Dictated 

In [20]:
len(out)

25

In [23]:
from langchain_core.prompts import PromptTemplate

template1="""
You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}
"""

template2 = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Medicine, Medical Reports, Patient Monitoring and related
topics.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template1,
)

In [25]:
template2 = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Medicine, Medical Reports, Patient Monitoring and related
topics.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

PROMPT = PromptTemplate(
    template=template2, input_variables=["question"]
)

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm, prompt=PROMPT
)

In [27]:
docs = retriever_from_llm.invoke("Retrieve all the names of patients with Chronic obstructive pulmonary disease")
len(docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['Your task is to generate 3 different search queries that aim to', 'answer the user question from multiple perspectives. The user questions', 'are focused on Medicine, Medical Reports, Patient Monitoring and related', 'topics.', 'Each query MUST tackle the question from a different viewpoint, we', 'want to get a variety of RELEVANT search results.', 'Provide these alternative questions separated by newlines.', 'Original question: Retrieve all the names of patients with Chronic obstructive pulmonary disease', '(COPD) who have been hospitalized in the last year.', '', 'Alternative Search Queries:', '1. What are the most common chronic respiratory diseases that require hospitalization?', '2. Which patient demographics (age, sex, etc.) are more prone to COPD-related hospitalizations?', '3. Can you provide a list of hospitals that have treated patients with COPD in the past year?', '', 'Please note that I will be evaluating your resp

28

Añadimos un prompt template

Otra opción para este RAG, con un custom prompt para realizar la generación de queries.

In [35]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

template1="""
You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}
"""

template2 = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Medicine, Medical Reports, Patient Monitoring and related
topics.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template1,
)

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "Patients with Chronic obstructive pulmonary disease"

  warn_deprecated(


In [36]:
# Run
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.invoke(input=question)
len(unique_docs)

OutputParserException: Invalid json output: You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: Patients with Chronic obstructive pulmonary disease
    (COPD) often experience respiratory symptoms such as wheezing and coughing.
    What are the most effective treatments for COPD patients who exhibit these
    symptoms?''


```python
import re

def generate_alternative_questions(question):
    # Tokenize the original question
    tokens = re.split(r'\W+', question)

    # Generate alternative questions by modifying the original question
    alternatives = []

    # 1. Change the order of words
    alternatives.append(' '.join([tokens[i] for i in range(len(tokens)-2, -1, -1)]))

    # 2. Replace specific keywords
    alternatives.append(re.sub(r'COPD', 'Chronic bronchitis', question))
    alternatives.append(re.sub(r'wheezing','shortness of breath', question))

    # 3. Add or remove adjectives
    alternatives.append(re.sub(r'symptoms','severe symptoms', question))
    alternatives.append(re.sub(r'most effective', '', question))

    # 4. Use synonyms
    alternatives.append(re.sub(r'respiratory', 'pulmonary', question))
    alternatives.append(re.sub(r'treatments','medications', question))

    return '\n'.join(alternatives)


original_question = "Patients with Chronic obstructive pulmonary disease (COPD) often experience respiratory symptoms such as wheezing and coughing. What are the most effective treatments for COPD patients who exhibit these symptoms?"
print(generate_alternative_questions(original_question))
```

Output:
```
What are the treatments for COPD patients experiencing severe wheezing and coughing?
What are the medications for patients with chronic bronchitis who have shortness of breath?
What are the best ways to manage COPD symptoms like wheezing and coughing?
What are the treatment options for patients with chronic lung disease who experience respiratory problems?
What are the therapies for COPD patients who show signs of respiratory distress?
```

These alternative questions can be used to retrieve relevant documents from the vector database, which may not be retrieved using the original question due to its specificity. The generated questions provide different perspectives on the original question, allowing the user to explore related topics and potentially discover more relevant information.