In [5]:
!pip install -q chromadb
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers
!pip install -q langchain
!pip install -q -U langchain-community
!pip install -q git+https://github.com/huggingface/peft.git
#!pip install -q bitsandbytes
!pip install -q trl
!pip install -q sentence_transformers
!pip install -q pypdf
!pip install -q unstructured

In [10]:
!transformers-cli env


Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.41.1
- Platform: Windows-10-10.0.22621-SP0
- Python version: 3.11.9
- Huggingface_hub version: 0.23.1
- Safetensors version: 0.4.3
- Accelerate version: 0.30.1
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.3.0+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



In [13]:
import os
import torch
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

In [15]:
#os.environ['TRANSFORMERS_OFFLINE']='0'

In [17]:
#################################################################
# Tokenizer
#################################################################

model_name='meta-llama/Meta-Llama-3-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Meta-Llama-3-8B-Instruct is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [48]:
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config=bnb_config,
    low_cpu_mem_usage = True,
    device_map = device_map
)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 3/3 [02:09<00:00, 43.06s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.10s/it]


In [1]:
#################################################################
# Base inference
#################################################################

#prompt = """En que consiste el Artículo 7 de la ley 11/2022 del BOE de españa"""

# CUDA: Para programar directamente la GPU
#model_input = tokenizer(prompt, return_tensors="pt").to("cuda")

#model.eval()
#with torch.no_grad():
#    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

## LangChain

In [6]:
import pypdf
from langchain.document_loaders import PyPDFLoader

CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [7]:
from bs4 import BeautifulSoup
import os

def XMLLoader(folder_path):
    reportData_list = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                content = file.read()

            soup = BeautifulSoup(content, 'xml')

            reportData = {}
            for tag in soup.find_all():
                if tag.name != "report":
                    reportData[tag.name] = tag.text.strip()

            reportData["filename"] = filename
            reportData_list.append(reportData)

    return reportData_list

In [8]:
data =  XMLLoader("data")

In [9]:
from langchain.docstore.document import Document

documents = []
for doc in data:
    
    document = Document(page_content=doc["report_text"], metadata={"source": doc["filename"], 
                                                                   "subtype": doc["subtype"], 
                                                                   "type": doc["type"], 
                                                                   "chief_complaint": doc["chief_complaint"], 
                                                                   "admit_diagnosis": doc["admit_diagnosis"], 
                                                                   "discharge_diagnosis": doc["discharge_diagnosis"], 
                                                                   "year": doc["year"], 
                                                                   "downlaod_time": doc["downlaod_time"], 
                                                                   "deid": doc["deid"]})
    documents.append(document)

In [42]:
#documents[1].metadata["subtype"]="TEST"
#print(documents[7])

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True
    )

In [None]:
splits = text_splitter.split_documents(documents)

In [None]:
from langchain.vectorstores.chroma import Chroma

vectorstore = Chroma(documents=splits, embedding_function=embeddings, persist_directory="medicalReports_DB")

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

def load_model():
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.1,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=300,
        do_sample=True
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)
    return hf

In [None]:
llm = load_model()

### MultiQueryRetriever

Mediante un LLM se generan múltiples queries (prompts) desde distintas perspectivas para un prompt introducido por el user. Por cada query hace un retrieval de los documentos reelevantes para obtener un set mas grande de posibles documentos con mayor variedad.

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever = vectorstore.as_retriever(), llm=llm
)

In [None]:
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
question = ""

out = multi_query_retriever.get_relevant_documents(query=question)
len(out)

Añadimos un prompt template

In [83]:
# RAG
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

TEMPLATE = """
[INST]
You are an expert doctor. Use the medical reports provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}

[/INST]
"""

rag_template = PromptTemplate(
    input_variables = ["question", "context"]
    template=TEMPLATE
)

In [None]:
qa_chain = LLMChain(llm=llm, prompt=rag_template)

In [None]:
out_ptemplate = qa_chain(
    inputs = {
        "question": question,
        "context": "\n---\n".join([d.page_content for d in out])
    }
)

Otra opción para este RAG, con un custom prompt para realizar la generación de queries.

In [None]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

template1="""
You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}
"""

template2 = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Medicine, Medical Reports, Patient Monitoring and related
topics.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template1,
)

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "What are the approaches to Task Decomposition?"

In [None]:
# Run
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.invoke(query="What does the course say about regression?")
len(unique_docs)

LCEL test

In [84]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": multi_query_retriever })
output_parser = StrOutputParser()

# LCEL language to create the chain (de izquierda a derecha el output de uno se pasa al otro)
parent_retrieval_chain = setup_and_retrieval | QUERY_PROMPT | llm | output_parser

In [85]:
out = parent_retrieval_chain.invoke("Wich patient has a history of cholecystectomy")
print(out)

Human: 
[INST]
You are an expert doctor. Use the medical reports provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
Wich patient has a history of cholecystectomy

Context:
[Document(page_content="[Report de-identified (Safe-harbor compliant) by De-ID v.6.22.07.0]\n\n\n**INSTITUTION\nConsultation\nName: **NAME[AAA, BBB]\nAcct #: **ID-NUM\nMRN: **ID-NUM\nAdmitted: \nDate: **DATE[Mar 14 2006]\nDict: **NAME[XXX, WWW]\nAttend: **NAME[ZZZ, YYY]\nREASON FOR CONSULT:  Sarcoid, possible pneumonia.\nHISTORY OF PRESENT ILLNESS:  **NAME[BBB AAA] is a **AGE[in 40s]-year-old female who gives \nabout a 16 or 17-year history of sarcoid.  She has had chronic skin rash, and \nbiopsy showed granulomas consistent with sarcoid.  This was in the early \n1999.  These were possibly in 1990.  She later had a breast lump.  Evidently, \nthis showed granulomatous inflammation as well and it was thought to be \nsecondary to sarcoid.  She is currently a