In [1]:
# pip install -q datasets sentence-transformers faiss-cpu accelerate langchain langchain-community

In [2]:
# pip install git+https://github.com/huggingface/transformers
#

In [3]:
# pip install transformers torch accelerate bitsandbytes

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from transformers import LlamaForCausalLM, LlamaTokenizerFast       # LLM for report classificuing
from sentence_transformers import SentenceTransformer       # for embedding model
from sklearn.model_selection import train_test_split
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline#, AutoModelForSeq2SeqGeneration
# from transformers import AutoModelForSeq2SeqGeneration

In [5]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

In [6]:
from transformers import AutoModelForSeq2SeqLM


In [7]:
filename = "data/labeled_data_combined_reports.csv"

In [8]:
# load data in df format
df_reports = pd.read_csv(filename)
df_reports.head(2)

Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...


In [None]:
df_reports["report_and_frac_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nFracture classification:\n" + 
    df_reports["image_ct___1"]
    # df_reports["image_ct___1"].apply(lambda x: "Positive" if float(x) > 0 else "Negative")
)

df_reports["report_and_mets_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nMetastases classification:\n" + 
    df_reports["image_ct___1"]
    # df_reports["image_ct___2"].apply(lambda x: "Positive" if float(x) > 0 else "Negative")
)

# drop reports that have NaN in reports column
df_reports = df_reports.dropna(subset=["report_and_frac_label", "report_and_mets_label"])

df_reports.head(2)


Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports,report_and_frac_label,report_and_mets_label
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...


In [10]:
class RAGPipeline:
    def __init__(self, 
                 embedding_model_name="sentence-transformers/all-mpnet-base-v2",
                 llm_model_name="google/flan-t5-large",  # Example seq2seq model
                 device="cuda" if torch.cuda.is_available() else "cpu"):
        
        print(f"Initializing RAG system on {device}")
        
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            llm_model_name,
            trust_remote_code=True
        )
        
        # load LLM
        print("Loading language model...")
        model = AutoModelForSeq2SeqLM.from_pretrained(
            llm_model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto",
            trust_remote_code=True,
            load_in_8bit=True if device == "cuda" else False
        )
        
        # text gen pipeline
        pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=self.tokenizer,
            max_length=512,
            temperature=0.7,
            top_p=0.95,
            # device=device,
            do_sample=True
        )
        
        self.llm = HuggingFacePipeline(pipeline=pipe)
        self.vectorstore = None
        self.qa_chain = None

    def _create_prompt(self, context, question):
        return f"""Answer the following question based on the given context.

Context: {context}

Question: {question}

Answer:"""

    def load_data(self, df, text_column):

        loader = DataFrameLoader(df, page_content_column=text_column)
        documents = loader.load()
        
        texts = self.text_splitter.split_documents(documents)
        self.vectorstore = FAISS.from_documents(texts, self.embeddings)
        
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 3}
            )
        )
        
        return f"Loaded {len(texts)} text chunks into the vector store"

    def query(self, question):
        """query RAG system"""
        if self.qa_chain is None:
            raise ValueError("Please load data first")
        
        # retrieve relevant docs
        docs = self.vectorstore.similarity_search(question, k=3)
        context = "\n\n".join([doc.page_content for doc in docs])
        
        prompt = self._create_prompt(context, question)
        
        response = self.llm(prompt)
        
        return response[0]['generated_text'] if isinstance(response, list) else response

    def batch_query(self, questions):
        """be able to handle multiple reports at once"""
        return [self.query(q) for q in questions]

    def similarity_search(self, query, k=3):
        if self.vectorstore is None:
            raise ValueError("Please load data first")
        return self.vectorstore.similarity_search(query, k=k)

    def save_vectorstore(self, path):
        """save FAISS vector database locally"""
        if self.vectorstore:
            self.vectorstore.save_local(path)
            print(f"Vector database saved to {path}")
        else:
            raise ValueError("No vector store to save")

    def load_vectorstore(self, path):
        """Load a saved FAISS vector store"""
        print(f"Loading vector store from {path}")
        self.vectorstore = FAISS.load_local(path, self.embeddings)
        
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 1}
            )
        )

In [20]:
### GENERIC USAGE EXAMPLE
rag = RAGPipeline(
        # embedding_model_name="sentence-transformers/all-mpnet-base-v2",
        # llm_model_name="meta-llama/Llama-2-7b-chat-hf"  # Or any other causal LM
    )
    
input = "..."
print(f"Input: {input}")
output = rag.query(input)
print(f"LLM Output: {output}")

# batch reports & LLM labels
inputs = [
    "..."
]
outputs = rag.batch_query(inputs)

for i, o in zip(inputs, outputs):
    print(f"Input: {i}")
    print(f"Output: {o}")


Initializing RAG system on cuda


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading language model...
Input: ...


ValueError: Please load data first

In [12]:
### TESTING WITH REPORTS
rag = RAGPipeline(
    # embedding_model_name="sentence-transformers/all-mpnet-base-v2",
    # llm_model_name="meta-llama/Llama-2-7b-chat-hf"  # Or any other causal LM
)

Initializing RAG system on cuda


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading language model...


In [None]:
df_reports_sample = df_reports, 

In [13]:
print("Loading data...")
result = rag.load_data(df_reports, text_column='report_and_frac_label')
print(result)

Loading data...
Loaded 22227 text chunks into the vector store


In [14]:
fracture_prompt = f"""
We need help classifying whether this radiology report indicates a fracture. 
Instructions:
- You will be given an input report. 
- Please respond with "Positive" if the report indicates a fracture, or "Negative" if it does not.
- Please ONLY say one word, "Positive" or "Negative".
- If you are unsure if a report indicates a fracture, say "Positive".
- Please use the provided examples to inform your decision.

Here is the report to classify\n\n:
"""

In [None]:
df_reports.head(2)

Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports,report_and_frac_label,report_and_mets_label
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...


In [15]:
texts = df_reports['combined_reports'].tolist()
frac_labels = df_reports['image_ct___1'].astype(float).round().astype(int).values.tolist()
mets_labels = df_reports['image_ct___2'].astype(float).round().astype(int).values.tolist()

frac_train_texts, frac_test_texts, frac_train_labels, frac_test_labels = train_test_split(
    texts, frac_labels, test_size=0.2, random_state=42
)

mets_train_texts, mets_test_texts, mets_train_labels, mets_test_labels = train_test_split(
    texts, mets_labels, test_size=0.2, random_state=42
)

In [16]:
print("Generating predictions...")
predictions = []
true_labels = []

Generating predictions...


In [19]:
outputs = rag.batch_query(frac_test_texts)

for i, o in zip(inputs, outputs):
    print(f"Input: {i}")
    print(f"Output: {o}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.06 GiB. GPU 0 has a total capacity of 23.46 GiB of which 1.28 GiB is free. Process 149413 has 18.02 GiB memory in use. Including non-PyTorch memory, this process has 4.16 GiB memory in use. Of the allocated memory 3.78 GiB is allocated by PyTorch, and 180.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
i = 0
for report, label in zip(frac_test_texts, frac_test_labels):
    print("Prediction: " + str(i))
    query = fracture_prompt + report
    prediction = rag.query(query)
    predictions.append(prediction == "Positive")
    true_labels.append(label > 0)
    print("\nPredictions: " + str(predictions))
    print("\nTrue labels: " + str(true_labels))
    i = i + 1
    if i == 20:
        break

Prediction: 0


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.07 GiB. GPU 0 has a total capacity of 23.46 GiB of which 213.00 MiB is free. Process 149413 has 17.96 GiB memory in use. Including non-PyTorch memory, this process has 5.29 GiB memory in use. Of the allocated memory 4.79 GiB is allocated by PyTorch, and 299.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
predictions = np.array(predictions)
true_labels = np.array(true_labels)

In [None]:
print("Evaluating performance...")
accuracy = accuracy_score(valid_true_labels, valid_predictions)
f1 = f1_score(valid_true_labels, valid_predictions, average="binary")  # Adjust as needed