<span style="color: blue; font-size: 30px;">Demonstration Notebook</span><br>
**Purpose**:  A demonstration based on the notebooks we used to develop a workflow for<br>a Retrieval Augmented Generation (RAG) application supporting a ChatBot</span><br>
**Scope**: Clinical Trials — *Phase 3, Study Completed, Reported p-values, Sponsor is Pfizer*

## Dependencies

In [1]:
import json
import os
import re
import requests as req
import sys
import urllib.request
from dotenv import load_dotenv
from getpass import getpass
from IPython.display import display, Markdown

import chromadb

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core import __version__ as llama_index_version
from llama_index.core.evaluation import BatchEvalRunner
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

import openai
import pandas as pd
import random
import time

from utils import extract_from_json, flatten_dict, replace_double_newline, format_flattened_dict, pfizer_ncts

### Environment

In [2]:
print(f"Python version: {sys.version.split(' ')[0]}")
print(f"llama_index.core: {llama_index_version}")
print(f"chromadb: {chromadb.__version__}")

Python version: 3.10.9
llama_index.core: 0.10.21
chromadb: 0.4.24


### Verify API tokens are available

In [3]:
load_dotenv() 
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if OPENAI_API_KEY is None:
  OPENAI_API_KEY = getpass(prompt="Please input your OpenAI API Key and Enter: ")

Please input your OpenAI API Key and Enter: ········


<span style="color: blue; font-size: 30px;">Data Ingestion</span><br>
Fetch data corresponding to scope: *Phase 3, Study Completed, Reported p-values, Sponsor is Pfizer*

In [4]:
def get_trial(nct_id):
    """
    Return: the JSON data for a clinical trial given its NCT ID.
    """
    trial = req.get(f"https://clinicaltrials.gov/api/v2/studies/{nct_id}")
    trial_json = trial.json()
    return trial_json

In [5]:
def get_downloaded_json(list_of_nct_id):
    """
    Downloads JSON file(s)
    """
    downloaded_json = []
    for nct_id in list_of_nct_id:
        trial = get_trial(nct_id)
        downloaded_json.append(trial)
    return downloaded_json

In [6]:
# get a list of NCT IDs from utils (hard-coded for convenience)
list_of_nct_id = pfizer_ncts

In [7]:
# fetch the JSON data for the trials
downloaded_json = get_downloaded_json(list_of_nct_id)
# downloaded_json[0] # check
len(downloaded_json) # check

247

<span style="color: blue; font-size: 30px;">Data Manipulation

In [8]:
# For each trial, extract a  subset of the data, save to a list
# the function extract_from_json is imported from utils 

def list_from_extracted_json(downloaded_json): 
    """
    Processes and saves extracted JSON file(s) locally for review
    Return: a list of documents.
    """
    documents_list  = []
    for json_file in downloaded_json:
        extracted_json = extract_from_json(json_file)
        nct_id = json_file['protocolSection']['identificationModule']['nctId']
        save_path = f"{nct_id}_extracted.json"
        # uncomment to save JSON file(s) locally
        # with open(save_path, "w") as f:
        #     json.dump(extracted_json, f, indent=4)
        
        documents_list.append(extracted_json)
    return documents_list

In [9]:
# create a list of documents from the extracted JSON data 
documents_list = list_from_extracted_json(downloaded_json)
len(documents_list) # check

247

### Metadata
Note: JSON files clinicaltrials.gov are heterogenous and each may contain different number of fields.

In [10]:
def max_keys(documents_list):
    """
    Identifies the document with the maximum number of keys in a list of dictionaries.
    Return: a list of keys from the document with the maximum number of keys. 
    """
    max_index, _ = max(enumerate(documents_list), key=lambda x: len(x[1].keys()))
    all_keys = list(documents_list[max_index].keys())
    return all_keys

In [11]:
# get the keys from the document that has the most keys
# use "all_keys" during creation Llama Documents
all_keys = max_keys(documents_list)
len(all_keys) # check
# all_keys # check

68

In [12]:
def adjust_metadata_keys(all_keys, keys_to_include):
    """
    To adjust the metadata keys used.
    Return: keys to exclude from list of all_keys not in list of keys_to_include.
    """
    keys_to_exclude = [key for key in all_keys if key not in keys_to_include]
    return keys_to_exclude

##### Select metadata for LLM

In [13]:
# WIP — need to explore the how these affect RAG Retrieval
llm_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
    "p-value",
    "Statistical Method",
]

# to exclude the keys not used by LLM
llm_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(llm_keys_to_exclude) # check

60

##### Select metadata for Embedding

In [14]:
# WIP — need to explore the how these affect RAG Retrieval 
# for simplicity, this is the same as llm_keys_to_exclude (in this example)
embedding_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
    "p-value",
    "Statistical Method",
]

# to exclude the keys not used by embedding
embedding_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(embedding_keys_to_exclude) # check

60

<span style="color: blue; font-size: 30px;">Prepare for *Vectorizing* Data

##### Select Embedding Model

In [15]:
# using this as baseline standard
embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)

##### Select LLM  type

In [16]:
# using this as baseline standard
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo-0125", max_tokens=512, api_key=OPENAI_API_KEY)

##### Global settings (Embedding & LLM)

In [17]:
Settings.llm = llm
Settings.embed_model = embed_model

### **Creating & Loading** *LlamaIndex* Documents

##### Example of format to load into LlamaIndex

In [18]:
# NOTE:  metata data must be one of (str, int, float, None)
def create_llama_docs(documents_list):
    """
    Converts a list of trial documents into LlamaIndex Document objects.
    """
    
    llama_documents = []  
    for trial in documents_list:
        # apply functions from utils to flatten JSON and create content similar to the example above
        content_text = format_flattened_dict(flatten_dict(trial))

        llama_document = Document(
            text=content_text, 
            metadata=trial, 
            excluded_llm_metadata_keys=llm_keys_to_exclude, #<== adjust?, TBD
            excluded_embed_metadata_keys=embedding_keys_to_exclude , #<== adjust?, TBD
            metadata_template="{key}=>{value}",
            text_template="Metadata:\n{metadata_str}\n===========================\nContent: \n{content}"
        )
        llama_documents.append(llama_document)  
    return llama_documents

In [19]:
# Create LlamaIndex Document objects
llama_documents = create_llama_docs(documents_list)
len(llama_documents) # check

247

In [20]:
# Example — LLM sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata:
National Clinical Identification NCT ID=>NCT00036270
Brief title=>Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen for 2.5 to 3 Years Followed By Exemestane
Lead sponsor=>Pfizer
Condition=>['Breast Neoplasms']
Conditions keywords=>oof, this data not available
Arms group 0 intervention names=>['Drug: exemestane (Aromasin)']
p-value=>0.118
Statistical Method=>Log Rank
Content: 
"National Clinical Identification NCT ID": "NCT00036270",
"Organization study identification": "971-ONC-0028-081",
"EudraCT number": "A5991026",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen for 2.5 to 3 Years Followed By Exemestane",
"Official title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen For 2.5- 3 Years Followed By Exemestane (Aromasin) For A Total Of 5 Years As Adjuvant Therapy For Postmenopausal, Receptor Positive, 

In [21]:
# Example — Embedding sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
National Clinical Identification NCT ID=>NCT00036270
Brief title=>Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen for 2.5 to 3 Years Followed By Exemestane
Lead sponsor=>Pfizer
Condition=>['Breast Neoplasms']
Conditions keywords=>oof, this data not available
Arms group 0 intervention names=>['Drug: exemestane (Aromasin)']
p-value=>0.118
Statistical Method=>Log Rank
Content: 
"National Clinical Identification NCT ID": "NCT00036270",
"Organization study identification": "971-ONC-0028-081",
"EudraCT number": "A5991026",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen for 2.5 to 3 Years Followed By Exemestane",
"Official title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen For 2.5- 3 Years Followed By Exemestane (Aromasin) For A Total Of 5 Years As Adjuvant Therapy For Postmenopausal, Receptor Positive, 

### **Chunking** — creating *LlamaIndex* Nodes

In [22]:
def create_nodes(llama_documents):
    """
    Generates and embeds nodes from Llama documents.
    """
    parser = SentenceSplitter(chunk_size=8190,chunk_overlap=0) # <== adjust
    nodes = parser.get_nodes_from_documents(llama_documents)
    for node in nodes:
        node_embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode=MetadataMode.EMBED)
        )
        node.embedding = node_embedding
    return nodes

In [23]:
# create nodes from Llama documents
nodes = create_nodes(llama_documents)

### **Storing** Nodes — with *Chroma Vector Database*

In [24]:
# Chroma DB collection name
COLLECTION_NAME = "RAG_OPENAI"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )
if COLLECTION_NAME not in [col.name for col in db.list_collections()]:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print("Creating vector store index")
    VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
    record_count = chroma_collection.count()
    print(f"record count: {record_count}")   
    
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    COLLECTION_NAME = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=COLLECTION_NAME)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )
    record_count = COLLECTION_NAME.count()
    print(f"record count: {record_count}")

Looking for the RAG_OPENAI collection in the database...
RAG_OPENAI collection WAS NOT FOUND in Chroma DB, creating...
Creating vector store...
Creating vector store index
record count: 248


###  **Creating Vector Index** from stored Nodes

In [25]:
index = VectorStoreIndex.from_vector_store(vector_store)  

<span style="color: blue; font-size: 30px">Querying with the ChatBot

In [26]:
memory = ChatMemoryBuffer.from_defaults(token_limit=10000) #<== adjust
# chat_engine.reset() 

chat_engine = index.as_chat_engine(
    similarity_top_k=3, #<== adjust
    chat_mode="condense_question",

    memory=memory,
    llm=llm,
    context_prompt=(
        """
        You are a chatbot which is expert in parsing information.
        When asked a question, provide a complete response, concisely.
        """
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use only the context above or this chat history to respond."
    ),
    verbose=False,
)

In [27]:
def ask_chatbot(query):
    response = chat_engine.chat(query)
    display(Markdown(chat_engine.chat(query).response))
    # return response

In [28]:
start_time = time.time()

In [29]:
query = "Provide brief title(s) for studies involving newborn babies and Sildenafil?"
ask_chatbot(query)
# True answer: A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn

A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn

In [30]:
query = "Provide the NCT ID for this study involving infants."
ask_chatbot(query)
# True answer: NCT01720524

The NCT ID for the study titled "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" is NCT01720524.

In [31]:
query = "Provide a plain language summary of this study?"
ask_chatbot(query)

The aim of the study was to evaluate whether intravenous sildenafil could reduce the time on inhaled nitric oxide treatment and decrease the failure rate of available treatments for persistent pulmonary hypertension of the newborn.

In [32]:
query = "What was the enrollment count for this trial?"
# query = "What was the enrollment count of the trial with Sildenafil involving newborn babies?"
ask_chatbot(query)
# True answer: 59

The enrollment count for the study titled "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" with NCT ID NCT01720524 was 59.

In [33]:
query = "What was the time frame for the primary oucome of this trial?"
ask_chatbot(query)
# True answer: 
# 14 days from the initiation of IV study drug or hospital discharge, whichever occurred first. 

The time frame for the primary outcome of the study "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" with NCT ID NCT01720524 was 14 days from the initiation of IV study drug or hospital discharge, whichever occurred first, with a maximum of 14 days.

In [34]:
query = "What was EudraCT number of this trial?"
ask_chatbot(query)
# True answer: "2012-002619-24"

The EudraCT number for the study titled "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" with NCT ID NCT01720524 is 2012-002619-24.

In [35]:
query = "Briefly, what conditions are mentioned in this trial involving Sildenafil?"
ask_chatbot(query)
# True answer: 	"Pulmonary Hypertension, Familial Persistent, of the Newborn"

The conditions mentioned in the trial "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" are 'persistent pulmonary hypertension', 'newborn', 'neonates', 'iv sildenafil', and 'hypoxic respiratory failure and at risk of persistent pulmonary hypertension of the newborn'.

In [36]:
query = "How did participants receive the treatment?"
ask_chatbot(query)
# True answer: 	"Part A: Participants received sildenafil intravenously based on their body weight

Participants in the study "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn" received the treatment through intravenous administration of sildenafil. The treatment involved a loading dose of 0.1 mg/kg over 30 minutes followed by a maintenance dose of 0.03 mg/kg/h. The infusion was required to continue for a minimum of 48 hours and a maximum of 14 days.

In [37]:
query = "What statistical method used this trial?"
# query = "What statistical method used this trial involving  Sildenafil involving newborn babies??"
ask_chatbot(query)
# True answer: "ANCOVA"

ANCOVA

In [38]:
query = "What p-value was reported for this trial with Sildenafil for newborn babies?"
ask_chatbot(query)
# True answer: 	0.9850

The p-value reported for the trial involving newborn babies and Sildenafil was 0.9850.

In [39]:
query = "What does this p-value suggest about the study in question? Provide an overview of how researchers use p-value in such studies."
ask_chatbot(query)
# True answer: treatment not effective

A p-value of 0.9850 suggests that there is no statistically significant difference between the groups being compared in the study involving newborn babies and Sildenafil. Researchers typically use p-values to determine the statistical significance of their findings. In this case, a p-value of 0.9850 indicates that the results are not likely due to chance and do not provide enough evidence to reject the null hypothesis, which suggests that the intervention (IV Sildenafil) did not have a significant effect compared to the control group (placebo) in the treatment of neonates with persistent pulmonary hypertension of the newborn.

In [40]:
query = "Based on the p-value, would you say this treatment was successful?"
ask_chatbot(query)
# True answer: not successful

The treatment with IV Sildenafil in the study involving newborn babies with persistent pulmonary hypertension of the newborn was not successful based on the reported p-value of 0.9850.

In [41]:
end_time = time.time()
print(f"runtime: {end_time - start_time:.0f} seconds")

runtime: 79 seconds


##### Reset the ChatBot

In [42]:
chat_engine.reset() 
memory = ChatMemoryBuffer.from_defaults(token_limit=10000) #<== adjust
history = chat_engine.chat_history
# history

In [43]:
query = "What does the acronym NCT mean?"
ask_chatbot(query)

National Clinical Identification

In [44]:
query = "Were Japanese participants involved in any trial? If, so please provide the NCT ID."
ask_chatbot(query)
# True answer: Yes, Japanese participants were involved in the trial with the National Clinical Identification NCT00445770.

Yes, Japanese participants were involved in the National Clinical Identification trial with the NCT ID NCT00445770.

In [45]:
query = "What was the p-value of this trial?"
ask_chatbot(query)
# True answer: <.0001

The p-value of the National Clinical Identification trial involving Japanese participants with the NCT ID NCT00445770 was less than 0.0001.

In [46]:
query = "What does this p-value suggest about this trial? Phrase your explanation of p-value very clearly, so that anyone can understand it."
ask_chatbot(query)
# True answer: results show strong statistical significance 

The p-value less than 0.0001 in the National Clinical Identification trial suggests that there is a statistically significant difference in the efficacy and safety of Etanercept and Methotrexate in Japanese subjects with rheumatoid arthritis.

In [47]:
query = "Based on the p-value, would you say this treatment was successful?"
ask_chatbot(query)
# True answer: successful

Yes, the treatment was successful based on the p-value of the National Clinical Identification trial involving Japanese participants with the NCT ID NCT00445770.

##### Reset the ChatBot

In [48]:
chat_engine.reset() 
memory = ChatMemoryBuffer.from_defaults(token_limit=10000) #<== adjust
history = chat_engine.chat_history
# history

<span style="color: blue; font-size: 30px;">Evaluation

### Select LLM and Query Engine

In [49]:
llm3 = llm
llm4 = OpenAI(model="gpt-4")
vector_index = VectorStoreIndex.from_vector_store(vector_store)  
query_engine = vector_index.as_query_engine(llm=llm3)

### Creating a Question / Context Dataset for Evaluation|

In [50]:
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm3,
    num_questions_per_chunk=3
)

100%|██████████| 248/248 [06:55<00:00,  1.68s/it]


### Hit Rate & Mean Reciprocal Rank 

In [51]:
retriever = vector_index.as_retriever(similarity_top_k=3)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
metric_dicts = []
for eval_result in eval_results:
    metric_dict = eval_result.metric_vals_dict
    metric_dicts.append(metric_dict)

full_df = pd.DataFrame(metric_dicts)
hit_rate = round(full_df["hit_rate"].mean(), 2)
mrr = round(full_df["mrr"].mean(), 2)

print(f"Hit Rate: {hit_rate:.2f}") 
print(f"Mean Reciprocal Rank (MRR): {mrr:.2f}")

Hit Rate: 0.60
Mean Reciprocal Rank (MRR): 0.52


Hit Rate: 0.60<br>
Mean Reciprocal Rank (MRR): 0.52

### Faithfullness & Relevancy

In [52]:
queries = list(qa_dataset.queries.values())
faithfulness_gpt4 = FaithfulnessEvaluator(llm=llm3)
relevancy_gpt4 = RelevancyEvaluator(llm=llm3)

In [54]:
num_samples = 10
batch_eval_queries = random.sample(queries, num_samples)

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=1,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries

)

faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
print(f"Faithfulness Score: {faithfulness_score:.2f}")

relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])
print(f"Relevancy Score: {relevancy_score:.2f}")

Faithfulness Score: 0.90
Relevancy Score: 0.80


Faithfulness Score: 0.90<br>
Relevancy Score: 0.80

### Summary of Metrics<br>
(1) “**Hit Rate**” —the percentage of times the most relevant document falls within the top-k retrieved documents by the RAG system. This helps us gauge capability of the retrieval.<br>
(2) “**Mean Reciprocal Rank**” (MRR) — a perfect MRR score of 1 indicates the most relevant document is always retrieved first. But if the RAG returns the most relevant document as its 3rd choice that would be MRR=0.33 (so, higher is better).<br>
(3) “**Faithfulness**” — how well the response matches the retrievable context — here a very low score indicates hallucination.<br>
(4) “**Relevancy**” — measures if the response and context match up nicely with the query. A low score suggests the answer might be off-topic or not addressing the specific question (like students who answer what they know, rather than what the teacher asked).<br>

In [55]:
columns = {"Hit Rate": [hit_rate], "Mean Reciprocal Rank (MRR)": [mrr], 'Faithfulness Score': [faithfulness_score], 'Relevancy Score ': [relevancy_score]}
metric_df = pd.DataFrame(columns)
display(metric_df)

Unnamed: 0,Hit Rate,Mean Reciprocal Rank (MRR),Faithfulness Score,Relevancy Score
0,0.6,0.52,0.9,0.8


### END