In [15]:
pip install -q datasets sentence-transformers faiss-cpu accelerate langchain langchain-community

Note: you may need to restart the kernel to use updated packages.


In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from transformers import LlamaForCausalLM, LlamaTokenizerFast       # LLM for report classificuing
from sentence_transformers import SentenceTransformer       # for embedding model
from sklearn.model_selection import train_test_split
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline #, AutoModelForSeq2SeqGeneration

In [19]:
filename = "data/labeled_data_combined_reports.csv"

In [21]:
# load data in df format
df_reports = pd.read_csv(filename)
df_reports.head(2)

Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...


In [26]:
df_reports["report_and_frac_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nFracture classification:\n" + 
    df_reports["image_ct___1"].apply(lambda x: "Positive" if int(x) > 0 else "Negative")
)

df_reports["report_and_mets_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nMetastases classification:\n" + 
    df_reports["image_ct___2"].apply(lambda x: "Positive" if int(x) > 0 else "Negative")
)

df_reports.head(10)


ValueError: invalid literal for int() with base 10: '0.0'

In [4]:
# load data in datasets format
from datasets import load_dataset
dataset = load_dataset('csv', data_files=filename)

In [11]:
data = dataset["train"]
# data = data["combined_reports"]
data

Dataset({
    features: ['Unnamed: 0', 'patient_id', 'imaging_date', 'reports', 'image_ct___1', 'image_ct___2', 'image_ct___3', 'combined_reports'],
    num_rows: 942
})

In [None]:
data.add_faiss_index("embeddings")

In [5]:
def embed(batch, embedder):
    """
    adds a column to the dataset called 'embeddings'
    """
    # or you can combine multiple columns here
    # For example the title and the text
    information = batch["text"]
    return {"embeddings" : embedder.encode(information)}

In [6]:
# embedding model
# TODO: play around with embedding model type
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:


class RAGSystem:
    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2",
                 llm_model="google/flan-t5-base"):
        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        
        # Initialize LLM
        tokenizer = AutoTokenizer.from_pretrained(llm_model)
        model = AutoModelForSeq2SeqGeneration.from_pretrained(llm_model)
        
        self.llm = HuggingFacePipeline(
            pipeline=pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=512
            )
        )
        
        self.vectorstore = None
        self.qa_chain = None

    def load_data(self, df, text_column):
        """Load data from pandas DataFrame"""
        # Convert DataFrame to documents
        loader = DataFrameLoader(df, page_content_column=text_column)
        documents = loader.load()
        
        # Split documents into chunks
        texts = self.text_splitter.split_documents(documents)
        
        # Create vector store
        self.vectorstore = FAISS.from_documents(texts, self.embeddings)
        
        # Create QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 3}
            )
        )
        
        return f"Loaded {len(texts)} text chunks into the vector store"

    def save_vectorstore(self, path):
        """Save the FAISS vector store"""
        self.vectorstore.save_local(path)

    def load_vectorstore(self, path):
        """Load a saved FAISS vector store"""
        self.vectorstore = FAISS.load_local(path, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 3}
            )
        )

    def query(self, question):
        """Query the RAG system"""
        if self.qa_chain is None:
            raise ValueError("Please load data first")
        
        return self.qa_chain.run(question)

    def similarity_search(self, query, k=3):
        """Perform similarity search without LLM"""
        if self.vectorstore is None:
            raise ValueError("Please load data first")
            
        return self.vectorstore.similarity_search(query, k=k)