## Imports

In [1]:
from dotenv import load_dotenv
import json
import os
import pandas as pd
import pprint
import requests as req
import urllib.request

import chromadb

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.nomic import NomicEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

import openai

from utils_15C import extract_from_json

In [2]:
%reload_ext watermark
%watermark -v -p llama_index.core
# Python implementation: CPython
# Python version       : 3.11.7
# IPython version      : 8.20.0
# llama_index.core: 0.10.12

Python implementation: CPython
Python version       : 3.11.7
IPython version      : 8.20.0

llama_index.core: 0.10.12



In [3]:
# Versions:  chromadb and llama-index 
!pip list | grep chromadb
!pip list | grep llama
# chromadb                                 0.4.23
# llama_cpp_python                         0.2.53
# llama-index-core                         0.10.12
# llama-index-embeddings-huggingface       0.1.4
# llama-index-embeddings-nomic             0.1.6
# llama-index-embeddings-openai            0.1.6
# llama-index-llms-huggingface             0.1.3
# llama-index-llms-llama-cpp               0.1.3
# llama-index-llms-openai                  0.1.6
# llama-index-vector-stores-chroma         0.0.1
# llamaindex-py-client                     0.1.13

chromadb                                 0.4.23
llama_cpp_python                         0.2.53
llama-index-core                         0.10.12
llama-index-embeddings-huggingface       0.1.4
llama-index-embeddings-nomic             0.1.6
llama-index-embeddings-openai            0.1.6
llama-index-llms-huggingface             0.1.3
llama-index-llms-llama-cpp               0.1.3
llama-index-llms-openai                  0.1.6
llama-index-vector-stores-chroma         0.0.1
llamaindex-py-client                     0.1.13


## Verify API tokens are available

In [4]:
# Load variables from a local .env file (if one exists) into the process environment
load_dotenv()
# nomic_api_key = os.getenv("NOMIC_API_KEY")

openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message instead of an opaque authentication error
# on the first embedding / LLM call further down the notebook.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file before running this notebook."
    )

## (Optional) Remove previous JSON files and Chroma DB before starting
<span style="color: darkred; font-size: 18px;">using macOS/Linux %%bash</span>

In [5]:
%%bash
# Delete only the trial files this notebook writes (NCT<id>.json and
# NCT<id>_extracted.json) in the current directory. The previous pattern
# ("*.json", recursive) also removed unrelated JSON files in every subdirectory.
find . -maxdepth 1 -type f -name "NCT*.json" -delete

In [6]:
%%bash
rm -rf chroma_db

## Fetch data corresponding to Pfizer PLS
source:  https://www.pfizer.com/science/clinical-trials/plain-language-study-results-summaries/

In [7]:
def get_trial(nct_id, timeout=30):
    """
    Fetch a single study record from the ClinicalTrials.gov v2 API.

    Args:
        nct_id: NCT identifier of the study, e.g. "NCT01720524".
        timeout: seconds to wait for the server before giving up.

    Return: the JSON data for a clinical trial given its NCT ID.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
            (for example an unknown NCT ID).
        requests.Timeout: if the server does not answer within `timeout` seconds.
    """
    trial = req.get(
        f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
        timeout=timeout,
    )
    # Surface HTTP errors here rather than failing later while decoding
    # (or silently saving) an error payload.
    trial.raise_for_status()
    return trial.json()

In [8]:
def get_downloaded_json(list_of_nct_id):
    """
    Fetch every trial in `list_of_nct_id` and keep a local copy of each.

    Each record is written to "<NCT ID>.json" in the current working
    directory (indented for readability) and also collected in memory.

    Return: list of the fetched JSON records, in the order of the input IDs.
    """
    fetched_trials = []
    for trial_id in list_of_nct_id:
        record = get_trial(trial_id)
        fetched_trials.append(record)
        target_path = f"{trial_id}.json"
        with open(target_path, "w") as out_file:
            json.dump(record, out_file, indent=4)
    return fetched_trials

### Nine trials to consider (has Pfizer PLS, phase 3, completed with results, 2 arms):

In [9]:
# "NCT01720524" A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn
# "NCT01942135" Palbociclib (PD-0332991) Combined With Fulvestrant In Hormone Receptor+ HER2-Negative Metastatic Breast Cancer After Endocrine Failure (PALOMA-3)
# "NCT01945775" A Study Evaluating Talazoparib (BMN 673), a PARP Inhibitor, in Advanced and/​or Metastatic Breast Cancer Patients With BRCA Mutation (EMBRACA Study) (EMBRACA)
# "NCT01964716" 13vPnC Multidose Vial Safety, Tolerability and Immunogenicity Study in Healthy Infants.
# "NCT02130557" A Multicenter Phase 3, Open-Label Study of Bosutinib Versus Imatinib in Adult Patients With Newly Diagnosed Chronic Phase Chronic Myelogenous Leukemia
# "NCT02187744" A Study Of PF-05280014 Or Trastuzumab Plus Taxotere® And Carboplatin In HER2 Positive Breast Cancer In The Neoadjuvant Setting (REFLECTIONS B327-04)
# "NCT02367456" A Combination Study of PF-04449913 (Glasdegib) and Azacitidine In Untreated MDS, AML and CMML Patients (BRIGHT 1012)
# "NCT02603432" A Study Of Avelumab In Patients With Locally Advanced Or Metastatic Urothelial Cancer (JAVELIN Bladder 100)
# "NCT03090191" Clostridium Difficile Vaccine Efficacy Trial (Clover)

In [10]:
# list of NCT IDs for the trials
list_of_nct_id = [
    "NCT01720524",
    "NCT01942135",
    "NCT01945775",
    "NCT01964716",
    "NCT02130557",
    "NCT02187744",
    "NCT02367456",
    "NCT02603432",
    "NCT03090191",
]

In [11]:
# fetch the JSON data for the trials
downloaded_json = get_downloaded_json(list_of_nct_id)
# downloaded_json[0] # check
# len(downloaded_json) # check

## For each trial, extract a  subset of the data, save to a list
The **extract_from_json()** function is imported from the local `utils_15C` module (it is a work in progress and can be improved).

In [12]:
def list_from_extracted_json(downloaded_json):
    """
    Run extract_from_json() on every downloaded trial record.

    Each extracted result is also written to "<NCT ID>_extracted.json" in the
    current working directory so it can be reviewed by hand.

    Return: a list of documents (one extracted result per input record).
    """
    documents_list = []
    for raw_trial in downloaded_json:
        extracted = extract_from_json(raw_trial)
        # The NCT ID is read from the raw record, not from the extracted one
        identification = raw_trial['protocolSection']['identificationModule']
        with open(f"{identification['nctId']}_extracted.json", "w") as out_file:
            json.dump(extracted, out_file, indent=4)
        documents_list.append(extracted)
    return documents_list

In [13]:
# create a list of documents from the extracted JSON data 
documents_list = list_from_extracted_json(downloaded_json)
len(documents_list) # check

9

### Metadata fields
Note: JSON files from ClinicalTrials.gov are heterogeneous, and each may contain a different number of fields.

In [14]:
def max_keys(documents_list):
    """
    Find the dictionary with the most keys and report its keys.

    When several documents tie for the largest key count, the first one in
    the list wins. An empty list raises ValueError (from max()).

    Return: a list of keys from the document with the maximum number of keys.
    """
    richest_document = max(documents_list, key=lambda doc: len(doc.keys()))
    return list(richest_document.keys())

In [15]:
# get the keys from the document that has the most keys
all_keys = max_keys(documents_list)
len(all_keys) # check
# all_keys # check

124

#### Select metadata to index

In [16]:
def adjust_metadata_keys(all_keys, keys_to_include):
    """
    Work out which metadata keys should be hidden.

    Order (and any duplicates) of `all_keys` is preserved in the result.

    Return: keys to exclude, i.e. every entry of all_keys that is not in keys_to_include.
    """
    keys_to_exclude = []
    for candidate in all_keys:
        if candidate in keys_to_include:
            continue
        keys_to_exclude.append(candidate)
    return keys_to_exclude

In [17]:
# WIP — need to explore how these affect RAG retrieval
# Metadata keys the LLM is allowed to see.
# NOTE(review): the variable name is misspelled ("incude"); it is left unchanged
# here because other cells may reference it by this name.
llm_keys_to_incude = [
    "Brief title",
    "National Clinical Identification NCT ID",
    "Lead sponsor",
    "Arms group 0 intervention names",
    "Enrollment count",
    ""  # NOTE(review): empty-string entry looks like a leftover placeholder — confirm
]

# every key in all_keys that is not listed above is excluded from what the LLM sees
llm_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(llm_keys_to_exclude) # check

119

In [18]:
# WIP — need to explore how these affect RAG retrieval
# for simplicity, this list has the same content as llm_keys_to_incude (in this example)
embedding_keys_to_incude = [
    "Brief title",
    "National Clinical Identification NCT ID",
    "Lead sponsor",
    "Arms group 0 intervention names",
    "Enrollment count",
    ""
]

# to exclude the keys not used by embedding
# (derived from the embedding list defined above, so editing that list actually
# takes effect; previously the LLM list was passed here by mistake)
embedding_keys_to_exclude = adjust_metadata_keys(all_keys, embedding_keys_to_incude)
len(embedding_keys_to_exclude) # check

119

## Llama index

### embedding type

In [19]:
# using this as baseline standard
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Possible alternative (TBD)
# embed_model = NomicEmbedding(
#     api_key=nomic_api_key,
#     dimensionality=128,
#     model_name="nomic-embed-text-v1.5",
# )
# embedding = embed_model.get_text_embedding("Nomic Embeddings")

### model type

In [20]:
# using this as baseline standard
llm = OpenAI(temperature=0.001, model="gpt-3.5-turbo", max_tokens=512)

### global settings

In [21]:
Settings.llm = llm
Settings.embed_model = embed_model

### add metadata

In [22]:
# NOTE: metadata values must be one of (str, int, float, None)
def create_llama_docs(documents_list):
    """
    Converts a list of trial documents into LlamaIndex Document objects.

    The document text is the space-joined content of a fixed set of trial
    fields (ID, organization, title, summaries, condition, the two arm
    interventions and the primary outcome). The whole trial dict is attached
    as metadata; the module-level `llm_keys_to_exclude` and
    `embedding_keys_to_exclude` lists decide which metadata keys the LLM and
    the embedding model get to see.

    Trial records do not all carry the same fields, so a content field that
    is missing (or None) is skipped instead of raising KeyError, and
    non-string values are converted with str() before being joined.

    Args:
        documents_list: list of flat dicts, one per trial.

    Return: list of Document objects, one per input trial, in the same order.
    """
    # Fields whose values make up the document text (same for every trial)
    content_keys = (
        "National Clinical Identification NCT ID",
        "Organization",
        "Official title",
        "Brief summary",
        "Detailed description",
        "Condition",
        "Arms group 0 intervention name",
        "Arms group 1 intervention name",
        "Primary outcome",
    )

    llama_documents = []
    for trial in documents_list:
        content_parts = []
        for key in content_keys:
            value = trial.get(key)
            if value is None:
                continue  # field absent for this trial
            # strip('"') removes leading/trailing double quotes left on the value
            content_parts.append(str(value).strip('"'))
        content_text = " ".join(content_parts)

        llama_document = Document(
            text=content_text,
            metadata=trial,
            excluded_llm_metadata_keys=llm_keys_to_exclude, #<== adjust?, TBD
            excluded_embed_metadata_keys=embedding_keys_to_exclude, #<== adjust?, TBD
            metadata_template="{key}=>{value}",
            text_template="Metadata:\n{metadata_str}\n===========================\nContent: \n{content}"
        )
        llama_documents.append(llama_document)
    return llama_documents

In [23]:
# Create LlamaIndex Document objects
llama_documents = create_llama_docs(documents_list)
# len(llama_documents) # check

In [24]:
# Example — LLM sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata:
National Clinical Identification NCT ID=>NCT01720524
Brief title=>A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn
Lead sponsor=>Pfizer's Upjohn has merged with Mylan to form Viatris Inc.
Enrollment count=>59
Arms group 0 intervention names=>['Drug: placebo']
Content: 
NCT01720524 Pfizer A MULTI-CENTRE, RANDOMIZED, PLACEBO-CONTROLLED, DOUBLE-BLIND, TWO-ARMED, PARALLEL GROUP STUDY TO EVALUATE EFFICACY AND SAFETY OF IV SILDENAFIL IN THE TREATMENT OF NEONATES WITH PERSISTENT PULMONARY HYPERTENSION OF THE NEWBORN (PPHN) OR HYPOXIC RESPIRATORY FAILURE AND AT RISK FOR PPHN, WITH A LONG TERM FOLLOW-UP INVESTIGATION OF DEVELOPMENTAL PROGRESS 12 AND 24 MONTHS AFTER COMPLETION OF STUDY TREATMENT This study will evaluate whether IV sildenafil can reduce the time on inhaled nitric oxide treatment and reduce the failure rate of available treatments for persistent pulmonary hypertension of the newbor

In [25]:
# Example — Embedding sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
National Clinical Identification NCT ID=>NCT01720524
Brief title=>A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn
Lead sponsor=>Pfizer's Upjohn has merged with Mylan to form Viatris Inc.
Enrollment count=>59
Arms group 0 intervention names=>['Drug: placebo']
Content: 
NCT01720524 Pfizer A MULTI-CENTRE, RANDOMIZED, PLACEBO-CONTROLLED, DOUBLE-BLIND, TWO-ARMED, PARALLEL GROUP STUDY TO EVALUATE EFFICACY AND SAFETY OF IV SILDENAFIL IN THE TREATMENT OF NEONATES WITH PERSISTENT PULMONARY HYPERTENSION OF THE NEWBORN (PPHN) OR HYPOXIC RESPIRATORY FAILURE AND AT RISK FOR PPHN, WITH A LONG TERM FOLLOW-UP INVESTIGATION OF DEVELOPMENTAL PROGRESS 12 AND 24 MONTHS AFTER COMPLETION OF STUDY TREATMENT This study will evaluate whether IV sildenafil can reduce the time on inhaled nitric oxide treatment and reduce the failure rate of available treatments for persistent pulmonary hypertension of the newbor

## Create LlamaIndex RAG Nodes

In [26]:
def create_nodes(llama_documents, chunk_size=1024, chunk_overlap=100):
    """
    Generates and embeds nodes from Llama documents.

    Documents are split with SentenceSplitter, then every node is embedded
    with the module-level `embed_model` using the text the embedding model is
    meant to see (MetadataMode.EMBED). Embeddings are requested in one batch
    call instead of one request per node.

    Args:
        llama_documents: list of LlamaIndex Document objects.
        chunk_size: chunk size passed to SentenceSplitter.
        chunk_overlap: chunk overlap passed to SentenceSplitter.

    Return: list of nodes, each with its `embedding` attribute set.
    """
    parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = parser.get_nodes_from_documents(llama_documents)

    embed_texts = [
        node.get_content(metadata_mode=MetadataMode.EMBED) for node in nodes
    ]
    # One batched call; the result list is aligned with embed_texts / nodes
    embeddings = embed_model.get_text_embedding_batch(embed_texts)
    for node, node_embedding in zip(nodes, embeddings):
        node.embedding = node_embedding
    return nodes

In [27]:
# create nodes from Llama documents
nodes = create_nodes(llama_documents)

## Chroma

In [28]:
# Chroma DB collection name (stays a plain string so this cell can be re-run)
COLLECTION_NAME = "CLINICAL_RAG"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )
existing_collection_names = [col.name for col in db.list_collections()]

# Both branches define the same names (chroma_collection, vector_store, index)
# so later cells do not depend on which branch ran.
if COLLECTION_NAME not in existing_collection_names:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print("Creating vector store index")
    index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    chroma_collection = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )

record_count = chroma_collection.count()
print(f"record count: {record_count}")

Looking for the CLINICAL_RAG collection in the database...
CLINICAL_RAG collection WAS NOT FOUND in Chroma DB, creating...
Creating vector store...
Creating vector store index
record count: 9


###  retrieve from Chroma DB and Query

In [29]:
index = VectorStoreIndex.from_vector_store(vector_store)  

#### Q1

In [30]:
query_engine = index.as_query_engine(similarity_top_k=record_count) # <== set k=total number of records

query = "List the different NCT ID in ascending order. How many is that?"
response = query_engine.query(query)
display_response(response)


**`Final Response:`** NCT01720524, NCT01942135, NCT01945775, NCT01964716, NCT02130557, NCT02187744, NCT02367456, NCT02603432, NCT03090191. There are 9 different NCT IDs in total.

In [31]:
# True answer
list_of_nct_id = [
    "NCT01720524",
    "NCT01942135",
    "NCT01945775",
    "NCT01964716",
    "NCT02130557",
    "NCT02187744",
    "NCT02367456",
    "NCT02603432",
    "NCT03090191",
]
# response: High Precision - all correct, High Recall - all retrieved

#### Q2

In [32]:
query_engine = index.as_query_engine(similarity_top_k=3) 
query = "How many NCT ID related to Leukemia? Which are they?"
response = query_engine.query(query)
display_response(response)


**`Final Response:`** There are two NCT IDs related to Leukemia. They are NCT02130557 and NCT02367456.

In [33]:
# True answer
# "NCT02130557",  "NCT02367456",
# response: correct

### Focus on one study

#### Q3

In [34]:
query_engine = index.as_query_engine(similarity_top_k=3)

query = "Who was the lead sponsor of NCT02130557?"
response = query_engine.query(query)
display_response(response)

**`Final Response:`** The lead sponsor of NCT02130557 was Pfizer.

In [35]:
# Correct, but too easy, right?

#### Q4

In [36]:
query_engine = index.as_query_engine(similarity_top_k=6) # <==note k

query = "What intervention was studied in NCT02130557?"
response = query_engine.query(query)
display_response(response)

**`Final Response:`** Bosutinib

In [37]:
# True answer: Bosutinib
# response:  correct
# in a previous iteration, wrong answers with these embeddings:
# ("text-embedding-3-small", dimensions=256)
# ("Nomic Embeddings")

#### Q5

In [38]:
query_engine = index.as_query_engine(similarity_top_k=7) # <==note k

query = "What was the enrollment count for NCT02130557?"
response = query_engine.query(query)
display_response(response)

**`Final Response:`** 536

In [39]:
# True answer: 536
# response:  correct

## Enhanced Prompts

In [40]:
PLS_prompt = "Using everyday language to make the clinical results of a study meaningful and understandable to a lay person, rephrase this: "
expert_prompt = "Emulate a PhD scientist and expert statistician to elaborate on the following: "

In [41]:
def get_response(query, prompt_1, prompt_2, similarity_top_k=7):
    """
    Queries an index and prints responses to two prompts.

    The query is answered first. That answer is then appended to `prompt_1`
    and to `prompt_2`, and each combined prompt is sent through the same query
    engine (built from the module-level `index`). All three answers are
    printed; nothing is returned.

    Args:
        query: the question to ask the index.
        prompt_1: instruction prefixed to the first answer; its result is
            printed under "Plain Language Summary".
        prompt_2: instruction prefixed to the first answer; its result is
            printed under "Expert Analysis".
        similarity_top_k: number of nodes retrieved per query.
    """
    query_engine = index.as_query_engine(similarity_top_k=similarity_top_k)

    response = query_engine.query(query)
    print(f"Original response:\n{response}")

    plain_language_response = query_engine.query(prompt_1 + response.response)
    print(f"\nPlain Language Summary:\n{plain_language_response}")

    expert_response = query_engine.query(prompt_2 + response.response)
    print(f"\nExpert Analysis:\n{expert_response}")

In [42]:
query = "What was the purpose of study NCT02130557?"
get_response(query, PLS_prompt, expert_prompt)

Original response:
The purpose of study NCT02130557 was to compare the efficacy of Bosutinib versus Imatinib in adult patients with newly diagnosed Chronic Phase Chronic Myelogenous Leukemia.

Plain Language Summary:
The study NCT02130557 aimed to see if Bosutinib works better than Imatinib in adults who were recently diagnosed with Chronic Phase Chronic Myelogenous Leukemia.

Expert Analysis:
The study NCT02130557 aimed to assess the effectiveness of Bosutinib compared to Imatinib in adult patients diagnosed with newly diagnosed Chronic Phase Chronic Myelogenous Leukemia. The primary focus was to investigate the impact of these two treatments on the percentage of participants achieving Major Molecular Response (MMR) at Month 12. The study design involved a Phase 3, open-label trial where patients were randomized to receive either Bosutinib or Imatinib for the duration of the study. The study's primary outcome group descriptions emphasized the importance of achieving a total sample siz

## add record(s)
no error thrown if records are dupes

In [43]:
print(f"record_count as is: {record_count}")

record_count as is: 9


In [44]:
# new, unrelated trials
list_of_nct_id = [
    "NCT00094887", # Anemia, Sickle Cell
    "NCT00108953", # Carcinoma, Hepatocellular
]

downloaded_json = get_downloaded_json(list_of_nct_id)
documents_list = list_from_extracted_json(downloaded_json)

In [45]:
# using previous functions 
llama_documents = create_llama_docs(documents_list)
nodes = create_nodes(llama_documents)
index.insert_nodes(nodes)

In [46]:
# Re-count the records after the insert.
# `chroma_collection` may be undefined when the collection was restored rather
# than created; in that case fall back to the handle held in COLLECTION_NAME.
# Only NameError is caught so genuine database errors are not hidden.
try:
    record_count = chroma_collection.count()
    print(f"new DB record count: {record_count}")
except NameError:
    record_count = COLLECTION_NAME.count()
    print(f"established DB record count: {record_count}")
print(record_count)

new DB record count: 11
11


In [47]:
# Retrieve every record: use the live count instead of a hard-coded number so
# this cell stays correct when more trials are added.
query_engine = index.as_query_engine(similarity_top_k=record_count) # <== set k=total number of records

query = "List the different NCT ID in ascending order. How many is that?"
response = query_engine.query(query)
display_response(response)


**`Final Response:`** NCT00094887, NCT01942135, NCT01945775. There are 3 different NCT IDs in total.

## Testing RAG vs ChatGPT (GPT-4) and Google Gemini

In [49]:
query_engine = index.as_query_engine(similarity_top_k=4) #<== sensitive, adjusted to get the right answer

query = "What disease was studied in trial NCT02603432?"
response = query_engine.query(query)
display_response(response)

**`Final Response:`** Urothelial Cancer

In [50]:
# True answer: locally advanced or metastatic urothelial cancer 
# response:  correct

ChatGPT (wrong)<br>
As of my last update in April 2023, the clinical trial with the identifier NCT02603432 was focused on Duchenne Muscular Dystrophy (DMD). This trial, titled "A Phase 3 Study to Evaluate the Efficacy and Safety of Eteplirsen in Duchenne Muscular Dystrophy," was designed to assess the efficacy and safety of eteplirsen in treating patients with DMD. Eteplirsen aims to increase dystrophin production by skipping exon 51 of the dystrophin gene, which can potentially provide a therapeutic benefit to DMD patients with a confirmed mutation amenable to exon 51 skipping. Duchenne Muscular Dystrophy is a severe type of muscular dystrophy that affects boys, leading to muscle degeneration and weakness.

Gemini (correct and excellent)<br>
The disease studied in trial NCT02603432 was urothelial carcinoma, which is a type of cancer that forms in the lining of the urinary tract. The urinary tract is the system that removes waste from the body. It includes the kidneys, ureters, bladder, and urethra. 

In [None]:
STOP HERE

## Using CallbackManager & LlamaDebugHandler

In [None]:
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

# initialize the debug handler and register it globally through Settings
llm_debug = LlamaDebugHandler()
callback_manager = CallbackManager([llm_debug])
Settings.callback_manager = callback_manager

In [None]:
# helper function
def print_in_out(in_out):
    """
    Print the first two messages recorded for the first LLM call.

    Reads `payload['messages']` from the first event of the first entry in
    `in_out` and prints message 0 and message 1, separated by blank lines.
    """
    messages = in_out[0][0].payload['messages']
    print(messages[0])
    print("\n\n")
    print(messages[1])

# whenever query_engine.query() is called
query_engine = index.as_query_engine()
query = "How many NCT ID related to Leukemia? Which are they?"
response = query_engine.query(query)
print(response)

# print LLM debug info 
in_out = llm_debug.get_llm_inputs_outputs()
print_in_out(in_out)