In [2]:
!pip install -q chromadb
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers
!pip install -q langchain
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q trl
!pip install -q sentence_transformers
!pip install -q pypdf
!pip install -q unstructured

In [2]:
# Print the transformers environment report (library versions, platform, GPU availability).
!transformers-cli env


Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.39.2
- Platform: Windows-10-10.0.22621-SP0
- Python version: 3.11.8
- Huggingface_hub version: 0.22.1
- Safetensors version: 0.4.2
- Accelerate version: 0.28.0
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.2.2+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



In [1]:
import os
import torch
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
#################################################################
# Tokenizer
#################################################################

# Hugging Face Hub id of the model; used for both the tokenizer and the model weights.
model_name='mistralai/Mistral-7B-Instruct-v0.2'
#model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# NOTE(review): trust_remote_code=True lets code shipped in the model repository run
# locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad sequences on their right-hand side.
tokenizer.padding_side = "right"

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [48]:
#################################################################
# Load pre-trained config
#################################################################
# Load the causal language model identified by `model_name`.
# NOTE(review): `bnb_config` and `device_map` are not assigned in any cell visible in
# this notebook; unless they are defined elsewhere, a fresh kernel fails here with
# NameError — TODO confirm where they come from and define them above this cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage = True,
    device_map = device_map
)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 3/3 [02:09<00:00, 43.06s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.10s/it]


## LangChain

In [2]:
import pypdf
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredXMLLoader

# Directory in which the Chroma vector store is persisted.
CHROMA_PATH = "chroma"
# Name of the input data folder.
DATA_PATH = "data"

In [34]:
from bs4 import BeautifulSoup
import os

def XMLLoader(folder_path):
    """Parse every ``.xml`` file directly inside ``folder_path`` into a flat dict.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose name
            ends in ``.xml``.

    Returns:
        A list with one dict per XML file, ordered by file name. Each dict maps
        tag name -> stripped tag text for every tag except ``report``, plus a
        ``"filename"`` key holding the file's base name. When a tag name occurs
        more than once in a file, the last occurrence wins.
    """
    reportData_list = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the result
    # (and anything indexed from it later) reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Read raw bytes instead of text: a text-mode open() decodes with the
        # platform default encoding (e.g. cp1252 on Windows) and ignores the
        # encoding declared in the XML prolog. Given bytes, BeautifulSoup
        # detects/honours the document's own encoding.
        with open(file_path, 'rb') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'xml')

        # Flatten the document: every tag except the <report> wrapper becomes a key.
        reportData = {
            tag.name: tag.text.strip()
            for tag in soup.find_all()
            if tag.name != "report"
        }
        reportData["filename"] = filename
        reportData_list.append(reportData)

    return reportData_list

In [35]:
# Parse all XML reports from the configured data folder (DATA_PATH is set in the
# constants cell) instead of repeating the folder name as a literal.
data = XMLLoader(DATA_PATH)

In [49]:
from langchain.docstore.document import Document

# Turn every parsed report dict into a LangChain Document: the report text is the
# content, everything else travels along as metadata.
documents = []
for doc in data:
    page_content = doc["report_text"]

    # "source" first, then the report attributes in a fixed order.
    # The key "downlaod_time" is spelled exactly as the code has always read it —
    # presumably it mirrors the tag name in the data; verify before "fixing" it.
    metadata = {"source": doc["filename"]}
    metadata.update({
        field: doc[field]
        for field in (
            "subtype",
            "type",
            "chief_complaint",
            "admit_diagnosis",
            "discharge_diagnosis",
            "year",
            "downlaod_time",
            "deid",
        )
    })

    document = Document(page_content=page_content, metadata=metadata)
    documents.append(document)

In [42]:
#documents[1].metadata["subtype"]="TEST"
#print(documents[7])

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(documents):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    The splitter is configured with a chunk size of 500 (measured with ``len``),
    no overlap between chunks, and ``add_start_index=True``.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.

    Returns:
        The chunks returned by ``split_documents``. A one-line summary with the
        document and chunk counts is printed as a side effect.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 0,
        "length_function": len,
        "add_start_index": True,
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

In [15]:
chunks = split_text(documents)

# Inspect one sample chunk (fixed index 10, so at least 11 chunks must exist)
document = chunks[10]
print(document.page_content) # Print its text content
print(document.metadata) # And its metadata (source file and where the chunk starts, 'start_index')

Split 1 documents into 15 chunks.
jugular venous distention.  ABDOMEN: Belly is soft, nontender, nondistended.  
EXTREMITIES: No edema.  JOINT:  No gross joint deformities.  NEUROLOGIC: He 
is alert and appropriate.   
 
ASSESSMENT/RECOMMENDATIONS 
1. Acute on chronic renal failure versus progression of his underlying stage 
IV CKD and I suspect it is the latter. 
a) Check urine studies.
b) Check renal ultrasound with postvoid residuals to rule out easily
{'source': 'data/report10.xml', 'start_index': 4528}


In [132]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
import shutil

def save_to_chroma(chunks):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Anything already present at ``CHROMA_PATH`` is deleted first, then the chunks
    are embedded with the ``all-MiniLM-L6-v2`` sentence-transformers model and
    written to a new store that is persisted to that directory.

    Args:
        chunks: Documents to index.

    Returns:
        The ``SentenceTransformerEmbeddings`` instance used for indexing.
    """
    # Start from a clean slate: drop a previously persisted database, if any.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embedding models: https://huggingface.co/sentence-transformers
    # (alternative tried earlier: 'sentence-transformers/all-mpnet-base-v2')
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Index the chunks into a fresh store and flush it to disk in one go.
    Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    ).persist()

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    return embeddings

In [133]:
# Build the vector store from the chunks; keep the returned embedding object for later cells.
embeddings = save_to_chroma(chunks)

Saved 1492 chunks to chroma4.


### Evaluator (For testing purposes)

In [134]:
# Embed a single word to inspect the resulting vector
from langchain.evaluation import load_evaluator

vector = embeddings.embed_query("almendras")
#print(f"Vector for 'almendras': {vector}") # Some vectors can be very long
print(f"Vector length: {len(vector)}")

# Compare the vectors of two words (distance between the two words).
# This helps us judge whether two words are related:
# comparing two identical words should give a distance of practically 0.
# load_evaluator uses OpenAI embeddings by default, so our embeddings are passed explicitly.
evaluator = load_evaluator("embedding_distance", embeddings=embeddings)

words = ("apple", "iphone")
x = evaluator.evaluate_strings(prediction=words[0], reference=words[1])
print(f"Comparing ({words[0]}, {words[1]}): {x}")

Vector length: 512
Comparing (apple, iphone): {'score': 0.30508241313171336}


In [50]:
from langchain.llms import HuggingFacePipeline

# Pipeline for inference
def load_model():
    """Wrap the notebook's loaded ``model`` and ``tokenizer`` in a LangChain LLM.

    A transformers text-generation pipeline is created with sampling enabled
    (temperature 0.1), a repetition penalty of 1.1, at most 300 new tokens, and
    ``return_full_text=True``.

    Returns:
        A ``HuggingFacePipeline`` built around that pipeline.
    """
    generation_settings = {
        "task": "text-generation",
        "temperature": 0.1,
        "repetition_penalty": 1.1,
        "return_full_text": True,
        "max_new_tokens": 300,
        "do_sample": True,
    }

    # `model` and `tokenizer` are the module-level objects loaded in earlier cells.
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=text_generation_pipeline)

In [136]:
# Create the LangChain LLM wrapper around the text-generation pipeline.
llm = load_model()

In [137]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

def query_data(embeddings, llm):
    """Build a retrieval-augmented generation chain over the persisted Chroma store.

    Args:
        embeddings: Embedding function handed to Chroma as ``embedding_function``
            when opening the store at ``CHROMA_PATH``.
        llm: LangChain LLM that receives the rendered prompt.

    Returns:
        A runnable chain. ``chain.invoke(question)`` takes the question string,
        retrieves matching chunks, fills the prompt, runs the LLM and returns
        the output parsed to a string.
    """

    def format_docs(docs):
        """Render retrieved documents as plain text, one source-labelled block each."""
        return "\n\n".join(
            f"Report: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
            for doc in docs
        )

    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)

    retriever = db.as_retriever()

    prompt_template = """
    ### [INST]
    Act as an expert doctor. Use the medical reports provided below to answer the question.

    If you do not know the answer, or are unsure, say you don't know.
    
    {context}
    
    ### QUESTION:
    {question}
    
    [/INST]
     """

    promptTemplate = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_template
    )

    # The incoming question is passed through unchanged under "question"; in
    # parallel it is sent to the retriever. The retrieved Documents are formatted
    # to plain text before reaching the prompt — otherwise the prompt would
    # contain the Python repr of a list of Document objects (escaped newlines,
    # "Document(page_content=...)" wrappers) instead of readable report text.
    setup_and_retrieval = RunnableParallel(
        {"question": RunnablePassthrough(), "context": retriever | format_docs}
    )
    output_parser = StrOutputParser()

    # LCEL chain: left to right, each step's output is fed to the next step.
    parent_retrieval_chain = setup_and_retrieval | promptTemplate | llm | output_parser

    return parent_retrieval_chain

In [None]:
# Assemble the retrieval + generation chain from the embeddings and the LLM created above.
chain = query_data(embeddings, llm)

In [None]:

# Ask the chain a question about the indexed reports and show the answer.
# (Spelling of the question fixed: the text is embedded for retrieval and sent to the model.)
out = chain.invoke("Which patient has a history of cholecystectomy?")
print(out)

In [149]:
#rag_chain = query_data(embeddings, llm)
#query = """Explícame el artículo 32. Acceso a números o servicios"""
#answer = rag_chain.invoke(query)

In [150]:
#answer['context']

[Document(page_content='mecanismos  de notificación  a que se refieren  los artículos  32, 33 y 34 del Código \nBOLETÍN OFICIAL DEL ESTADO\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91386\ncve: BOE-A-2022-10757\nVerificable en https://www.boe.es', metadata={'page': 133, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}),
 Document(page_content='que corresponda.\nBOLETÍN OFICIAL DEL ESTADO\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91287\ncve: BOE-A-2022-10757\nVerificable en https://www.boe.es', metadata={'page': 34, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}),
 Document(page_content='cualquier persona física o jurídica que desee contribuir, desinteresadamente, a la \nfinanciación de cualquier prestación propia del servicio universal.\nLos operadores sujetos a obligaciones de prestación del servicio universal recibirán \nde este fondo la cantidad correspondiente al coste neto que les supone dicha obligación, \ncalculado según el procedimient

In [151]:
#answer['text']

"\n    ### [INST] Instrucción: Eres un experto en jurisdiccion, responde a la pregunta según tus conocimientos de jurisdiccion y el siguiente contexto:\n    \n    [Document(page_content='mecanismos  de notificación  a que se refieren  los artículos  32, 33 y 34 del Código \\nBOLETÍN OFICIAL DEL ESTADO\\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91386\\ncve: BOE-A-2022-10757\\nVerificable en https://www.boe.es', metadata={'page': 133, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}), Document(page_content='que corresponda.\\nBOLETÍN OFICIAL DEL ESTADO\\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91287\\ncve: BOE-A-2022-10757\\nVerificable en https://www.boe.es', metadata={'page': 34, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}), Document(page_content='cualquier persona física o jurídica que desee contribuir, desinteresadamente, a la \\nfinanciación de cualquier prestación propia del servicio universal.\\nLos operadores sujetos a obligaciones 

In [158]:
def pretty_print_dict(mapping, indent=0):
    """Print ``mapping`` as ``key: value`` lines, recursing into nested dicts.

    Args:
        mapping: Dict to print.
        indent: Current nesting depth; each level is indented by two spaces.
    """
    for key, value in mapping.items():
        if isinstance(value, dict):
            print('  ' * indent + f'{key}:')
            pretty_print_dict(value, indent + 1)
        else:
            print('  ' * indent + f'{key}: {value}')


indent = 0
# `answer` is only assigned by a cell that is currently commented out in this
# notebook, so guard the call instead of failing with NameError on a fresh kernel.
if "answer" in globals():
    pretty_print_dict(answer, indent)
else:
    print("No `answer` available: run the cell that assigns it first.")

context: [Document(page_content='mecanismos  de notificación  a que se refieren  los artículos  32, 33 y 34 del Código \nBOLETÍN OFICIAL DEL ESTADO\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91386\ncve: BOE-A-2022-10757\nVerificable en https://www.boe.es', metadata={'page': 133, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}), Document(page_content='que corresponda.\nBOLETÍN OFICIAL DEL ESTADO\nNúm. 155 Miércoles 29 de junio de 2022 Sec. I.   Pág. 91287\ncve: BOE-A-2022-10757\nVerificable en https://www.boe.es', metadata={'page': 34, 'source': 'data/BOE-A-2022-10757.pdf', 'start_index': 0}), Document(page_content='cualquier persona física o jurídica que desee contribuir, desinteresadamente, a la \nfinanciación de cualquier prestación propia del servicio universal.\nLos operadores sujetos a obligaciones de prestación del servicio universal recibirán \nde este fondo la cantidad correspondiente al coste neto que les supone dicha obligación, \ncalculado según el proce