In [1]:
pip install -q datasets sentence-transformers faiss-cpu accelerate langchain langchain-community

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from transformers import LlamaForCausalLM, LlamaTokenizerFast       # LLM for report classificuing
from sentence_transformers import SentenceTransformer       # for embedding model
from sklearn.model_selection import train_test_split
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline #, AutoModelForSeq2SeqGeneration

In [3]:
filename = "data/labeled_data_combined_reports.csv"

In [4]:
# load data in df format
df_reports = pd.read_csv(filename)
df_reports.head(2)

Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...


In [5]:
df_reports["report_and_frac_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nFracture classification:\n" + 
    df_reports["image_ct___1"]
    # df_reports["image_ct___1"].apply(lambda x: "Positive" if float(x) > 0 else "Negative")
)

df_reports["report_and_mets_label"] = (
    "Report:\n" + 
    df_reports["combined_reports"] + 
    "\n\nMetastases classification:\n" + 
    df_reports["image_ct___1"]
    # df_reports["image_ct___2"].apply(lambda x: "Positive" if float(x) > 0 else "Negative")
)

# drop reports that have NaN in reports column
df_reports = df_reports.dropna(subset=["report_and_frac_label", "report_and_mets_label"])

df_reports.head(10)


Unnamed: 0.1,Unnamed: 0,patient_id,imaging_date,reports,image_ct___1,image_ct___2,image_ct___3,combined_reports,report_and_frac_label,report_and_mets_label
0,0,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2010-09-21,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
1,1,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-01-13,CT Chest History:\rFollow-up scan for OZM-011 ...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
2,2,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-05-27,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
16,4,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-11-03,Bone Scan Whole Body+Extra Views+Flow+Spect TE...,0.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
17,5,SHSC-134CJ-PV3YY-9L6O6-PLRA9OVTHG-4JZ2M-UR0UO-...,2011-12-16,X-Ray Pelvis -Hip & Femur : RT PELVIS AND RIGH...,1.0,1.0,0.0,Bone Scan(Whole Body)Nuc Med TECHNETIUM MDP BO...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...,Report:\nBone Scan(Whole Body)Nuc Med TECHNETI...
18,6,SHSC-16KG4-P6VT6-Z3FQN-U9WHYAE4HP-0QTGU-PDOMW-...,2010-04-29,Bone Scan Whole Body+Extra Views+Flow TECHNETI...,1.0,1.0,0.0,Bone Scan Whole Body+Extra Views+Flow TECHNETI...,Report:\nBone Scan Whole Body+Extra Views+Flow...,Report:\nBone Scan Whole Body+Extra Views+Flow...
19,7,SHSC-16KG4-P6VT6-Z3FQN-U9WHYAE4HP-0QTGU-PDOMW-...,2011-11-24,Abdomen + Pelvis CT with oral C+ TECHNIQUE: V...,1.0,1.0,0.0,Bone Scan Whole Body+Extra Views+Flow TECHNETI...,Report:\nBone Scan Whole Body+Extra Views+Flow...,Report:\nBone Scan Whole Body+Extra Views+Flow...
37,9,SHSC-172GA-8TFUS-1WNL6-BUVSWZ4RWB-4CJ89-J5Q06-...,2015-09-08,CT Chest History: 65 yo Male. CA PROSTATE; ly...,0.0,0.0,1.0,Bone Scan Whole Body+Extra Views+Flow+Spect TE...,Report:\nBone Scan Whole Body+Extra Views+Flow...,Report:\nBone Scan Whole Body+Extra Views+Flow...
38,10,SHSC-172GA-8TFUS-1WNL6-BUVSWZ4RWB-4CJ89-J5Q06-...,2016-01-15,Bone Scan Whole Body+Extra Views+Flow+Spect TE...,0.0,1.0,0.0,Bone Scan Whole Body+Extra Views+Flow+Spect TE...,Report:\nBone Scan Whole Body+Extra Views+Flow...,Report:\nBone Scan Whole Body+Extra Views+Flow...
39,11,SHSC-178WS-35GE9-4QOB6-413RR0A6Y8-J4L41-6GS34-...,2008-12-04,Bone Scan Whole Body+Extra Views+Flow TECHNETI...,0.0,0.0,1.0,Bone Scan Whole Body+Extra Views+Flow TECHNETI...,Report:\nBone Scan Whole Body+Extra Views+Flow...,Report:\nBone Scan Whole Body+Extra Views+Flow...


In [6]:
# load data in datasets format
from datasets import load_dataset
dataset = load_dataset('csv', data_files=filename)

In [7]:
data = dataset["train"]
# data = data["combined_reports"]
data

Dataset({
    features: ['Unnamed: 0', 'patient_id', 'imaging_date', 'reports', 'image_ct___1', 'image_ct___2', 'image_ct___3', 'combined_reports'],
    num_rows: 942
})

In [8]:
data.add_faiss_index("embeddings")

ValueError: Columns ['embeddings'] not in the dataset. Current columns in the dataset: ['Unnamed: 0', 'patient_id', 'imaging_date', 'reports', 'image_ct___1', 'image_ct___2', 'image_ct___3', 'combined_reports']

In [None]:
def embed(batch, embedder):
    """
    adds a column to the dataset called 'embeddings'
    """
    # or you can combine multiple columns here
    # For example the title and the text
    information = batch["text"]
    return {"embeddings" : embedder.encode(information)}

In [None]:
# embedding model
# TODO: play around with embedding model type
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]