## Importing Libraries

In [1]:
import json
import os
import re
import requests as req
import urllib.request
from dotenv import load_dotenv
from IPython.display import display, Markdown

import chromadb

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.nomic import NomicEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

import openai

from utils_17 import extract_from_json, flatten_dict, replace_double_newline, format_flattened_dict, pfizer_ncts

### Environment

In [2]:
%reload_ext watermark
%watermark -v -p llama_index.core
# Python implementation: CPython
# Python version       : 3.11.7
# IPython version      : 8.20.0
# llama_index.core: 0.10.12

Python implementation: CPython
Python version       : 3.11.7
IPython version      : 8.20.0

llama_index.core: 0.10.18.post1



In [3]:
# Versions:  chromadb and llama-index 
!pip list | grep chromadb
!pip list | grep llama
# chromadb                                 0.4.23
# llama_cpp_python                         0.2.53
# llama-index                              0.10.18
# llama-index-agent-openai                 0.1.5
# llama-index-cli                          0.1.8
# llama-index-core                         0.10.18.post1
# llama-index-embeddings-huggingface       0.1.4
# llama-index-embeddings-nomic             0.1.6
# llama-index-embeddings-openai            0.1.6
# llama-index-indices-managed-llama-cloud  0.1.3
# llama-index-legacy                       0.9.48
# llama-index-llms-huggingface             0.1.3
# llama-index-llms-llama-cpp               0.1.3
# llama-index-llms-ollama                  0.1.2
# llama-index-llms-openai                  0.1.6
# llama-index-multi-modal-llms-openai      0.1.4
# llama-index-program-openai               0.1.4
# llama-index-question-gen-openai          0.1.3
# llama-index-readers-file                 0.1.9
# llama-index-readers-llama-parse          0.1.3
# llama-index-vector-stores-chroma         0.1.6
# llama-index-vector-stores-elasticsearch  0.1.5
# llama-parse                              0.3.8
# llamaindex-py-client                     0.1.13

chromadb                                 0.4.23
llama_cpp_python                         0.2.53
llama-index                              0.10.18
llama-index-agent-openai                 0.1.5
llama-index-cli                          0.1.8
llama-index-core                         0.10.18.post1
llama-index-embeddings-huggingface       0.1.4
llama-index-embeddings-nomic             0.1.6
llama-index-embeddings-openai            0.1.6
llama-index-indices-managed-llama-cloud  0.1.3
llama-index-legacy                       0.9.48
llama-index-llms-huggingface             0.1.3
llama-index-llms-llama-cpp               0.1.3
llama-index-llms-ollama                  0.1.2
llama-index-llms-openai                  0.1.6
llama-index-multi-modal-llms-openai      0.1.4
llama-index-program-openai               0.1.4
llama-index-question-gen-openai          0.1.3
llama-index-readers-file                 0.1.9
llama-index-readers-llama-parse          0.1.3
llama-index-vector-stores-chroma         0.1.6


### Verify API tokens are available

In [4]:
load_dotenv() 
# nomic_api_key = os.getenv("NOMIC_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

### (Optional) Remove previous JSON files and Chroma DB before starting
using macOS/Linux %%bash

In [5]:
%%bash
find ./ -type f -name "*.json" -delete

In [6]:
%%bash
rm -rf chroma_db

<span style="color: blue; font-size: 30px;">Data Collection

##### Fetch data corresponding to Pfizer PLS<br>source:  https://www.pfizer.com/science/clinical-trials/plain-language-study-results-summaries/

In [7]:
def get_trial(nct_id):
    """
    Return: the JSON data for a clinical trial given its NCT ID.
    """
    trial = req.get(f"https://clinicaltrials.gov/api/v2/studies/{nct_id}")
    trial_json = trial.json()
    return trial_json

In [8]:
def get_downloaded_json(list_of_nct_id):
    """
    Downloads and saves JSON file(s) locally for reference.
    """
    downloaded_json = []
    for nct_id in list_of_nct_id:
        trial = get_trial(nct_id)
        downloaded_json.append(trial)
        with open(f"{nct_id}.json", "w") as f:
            json.dump(trial, f, indent=4)
    return downloaded_json

##### Trials to consider (Pfizer, phase 3, with p-value)

In [10]:
# list of NCT IDs for the trials
list_of_nct_id = pfizer_ncts

In [11]:
# fetch the JSON data for the trials
downloaded_json = get_downloaded_json(list_of_nct_id)
# downloaded_json[0] # check
len(downloaded_json) # check

59

<span style="color: blue; font-size: 30px;">Data Exploration and Cleaning

## Data Cleaning
For each trial, extract a  subset of the data, save to a list<br>the **extract_from_json()** function is found in imported utils **(this function is WIP and can be improved)**

In [12]:
def list_from_extracted_json(downloaded_json): 
    """
    Processes and saves extracted JSON file(s) locally for review
    Return: a list of documents.
    """
    documents_list  = []
    for json_file in downloaded_json:
        extracted_json = extract_from_json(json_file)
        nct_id = json_file['protocolSection']['identificationModule']['nctId']
        save_path = f"{nct_id}_extracted.json"
        with open(save_path, "w") as f:
            json.dump(extracted_json, f, indent=4)
        documents_list.append(extracted_json)
    return documents_list

In [13]:
# create a list of documents from the extracted JSON data 
documents_list = list_from_extracted_json(downloaded_json)
len(documents_list) # check

59

### Metadata
Note: JSON files clinicaltrials.gov are heterogenous and each may contain different number of fields.

In [14]:
def max_keys(documents_list):
    """
    Identifies the document with the maximum number of keys in a list of dictionaries.
    Return: a list of keys from the document with the maximum number of keys. 
    """
    max_index, _ = max(enumerate(documents_list), key=lambda x: len(x[1].keys()))
    all_keys = list(documents_list[max_index].keys())
    return all_keys

In [15]:
# get the keys from the document that has the most keys
# use "all_keys" during creation Llama Documents
all_keys = max_keys(documents_list)
len(all_keys) # check
# all_keys # check

68

In [16]:
# all_keys

In [17]:
def adjust_metadata_keys(all_keys, keys_to_include):
    """
    To adjust the metadata keys used.
    Return: keys to exclude from list of all_keys not in list of keys_to_include.
    """
    keys_to_exclude = [key for key in all_keys if key not in keys_to_include]
    return keys_to_exclude

##### Select metadata for LLM

In [18]:
# WIP — need to explore the how these affect RAG Retrieval
llm_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
]

# to exclude the keys not used by LLM
llm_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(llm_keys_to_exclude) # check

62

##### Select metadata for Embedding

In [19]:
# WIP — need to explore the how these affect RAG Retrieval 
# for simplicity, this is the same as llm_keys_to_exclude (in this example)
embedding_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
]

# to exclude the keys not used by embedding
embedding_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(embedding_keys_to_exclude) # check

62

<span style="color: blue; font-size: 30px;">LlamaIndex

##### Select Embedding type

In [20]:
# using this as baseline standard
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Possible alternative (TBD)
# embed_model = NomicEmbedding(
#     api_key=nomic_api_key,
#     dimensionality=128,
#     model_name="nomic-embed-text-v1.5",
# )
# embedding = embed_model.get_text_embedding("Nomic Embeddings")

##### Select LLM  type

In [21]:
# using this as baseline standard
llm = OpenAI(temperature=0.001, model="gpt-3.5-turbo", max_tokens=512)

##### Global settings (Embedding & LLM)

In [22]:
Settings.llm = llm
Settings.embed_model = embed_model

### **Creating & Loading LlamaIndex Documents**

In [23]:
# the functions below appear in utils_16F

In [24]:
# def flatten_dict(data, parent_key='', sep=' '):
#     """
#     Flattens a nested dictionary, including dictionaries nested within lists,
#     into a flat dictionary with concatenated keys.

#     Args:
#         data: The nested dictionary to flatten.
#         parent_key: The base key to use for the current level of recursion (default '').
#         sep: The separator between parent and child keys (default ' ').

#     Returns:
#         A flat dictionary with concatenated keys representing the structure of the nested input.
#     """
#     items = []
#     if isinstance(data, list):
#         # Handle lists by merging them into their parent key without indexing
#         for i, element in enumerate(data):
#             new_key = f"{parent_key}{sep if parent_key else ''}{i}"
#             items.extend(flatten_dict(element, new_key, sep=sep).items())
#     elif isinstance(data, dict):
#         for key, value in data.items():
#             new_key = f"{parent_key}{sep}{key}" if parent_key else key
#             if isinstance(value, (dict, list)):
#                 items.extend(flatten_dict(value, new_key, sep=sep).items())
#             else:
#                 items.append((new_key, value))
#     else:
#         items.append((parent_key, data))

#     return dict(items)


# def replace_double_newline(text):
#     """Replaces all occurrences of two or more \n with a single \n, 
#     including when preceded or followed by other characters."""
#     text = re.sub(r"\n{2,}", "\n", text)
#     text = text.replace(";", "&")
#     return text


# def format_flattened_dict(flat_dict):
#     """
#     Formats a flattened dictionary into a string representation where each key-value pair is on its own line.

#     Args:
#         flat_dict: The flat dictionary to format.

#     Returns:
#         A string representation of the flat dictionary.
#     """
#     lines = []
#     for key, value in flat_dict.items():
#         # For list elements, remove the numerical index from the key
#         formatted_key = key.rsplit(' ', 1)[0] if key[-1].isdigit() else key
#         line = f'"{formatted_key}": "{value}"' if isinstance(value, str) else f'"{formatted_key}": {value}'
#         line = replace_double_newline(line)
#         lines.append(line)
#     return ",\n".join(lines)

##### Example of format to load into LlamaIndex

In [25]:
# example = documents_list[0]
# # apply functions from utils
# processed_example = format_flattened_dict(flatten_dict(example))
# display(Markdown(processed_example)) # check

In [26]:
# example # check

In [27]:
# NOTE:  metata data must be one of (str, int, float, None)
def create_llama_docs(documents_list):
    """
    Converts a list of trial documents into LlamaIndex Document objects.
    """
    
    llama_documents = []  
    for trial in documents_list:
        # apply functions from utils to flatten JSON and create content similar to the example above
        content_text = format_flattened_dict(flatten_dict(trial))

        llama_document = Document(
            text=content_text, 
            metadata=trial, 
            excluded_llm_metadata_keys=llm_keys_to_exclude, #<== adjust?, TBD
            excluded_embed_metadata_keys=embedding_keys_to_exclude , #<== adjust?, TBD
            metadata_template="{key}=>{value}",
            text_template="Metadata:\n{metadata_str}\n===========================\nContent: \n{content}"
        )
        llama_documents.append(llama_document)  
    return llama_documents

In [28]:
# Create LlamaIndex Document objects
llama_documents = create_llama_docs(documents_list)
# len(llama_documents) # check

In [29]:
# Example — LLM sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata:
National Clinical Identification NCT ID=>NCT01720524
Brief title=>A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn
Lead sponsor=>Pfizer's Upjohn has merged with Mylan to form Viatris Inc.
Condition=>['Pulmonary Hypertension, Familial Persistent, of the Newborn']
Conditions keywords=>['persistent pulmonary hypertension', 'newborn', 'neonates', 'iv sildenafil', 'hypoxic respiratory failure and at risk of persistent pulmonary hypertension of the newborn']
Arms group 0 intervention names=>['Drug: placebo']
Content: 
"National Clinical Identification NCT ID": "NCT01720524",
"Organization study identification": "A1481316",
"EudraCT number": "2012-002619-24",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn",
"Official title": "A

In [30]:
# Example — Embedding sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
National Clinical Identification NCT ID=>NCT01720524
Brief title=>A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn
Lead sponsor=>Pfizer's Upjohn has merged with Mylan to form Viatris Inc.
Condition=>['Pulmonary Hypertension, Familial Persistent, of the Newborn']
Conditions keywords=>['persistent pulmonary hypertension', 'newborn', 'neonates', 'iv sildenafil', 'hypoxic respiratory failure and at risk of persistent pulmonary hypertension of the newborn']
Arms group 0 intervention names=>['Drug: placebo']
Content: 
"National Clinical Identification NCT ID": "NCT01720524",
"Organization study identification": "A1481316",
"EudraCT number": "2012-002619-24",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn",
"Official title": "A

### **Chunking** — creating LlamaIndex *Nodes*

In [31]:
def create_nodes(llama_documents):
    """
    Generates and embeds nodes from Llama documents.
    """
    parser = SentenceSplitter(chunk_size=1024,chunk_overlap=20) # <== adjust from default
    nodes = parser.get_nodes_from_documents(llama_documents)
    for node in nodes:
        node_embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode=MetadataMode.EMBED)
        )
        node.embedding = node_embedding
    return nodes

In [32]:
# create nodes from Llama documents
nodes = create_nodes(llama_documents)

### **Storing** Nodes — example with **Chroma Vector Database**

In [33]:
# Chroma DB collection name
COLLECTION_NAME = "CLINICAL_RAG"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )
if COLLECTION_NAME not in [col.name for col in db.list_collections()]:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print("Creating vector store index")
    VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
    record_count = chroma_collection.count()
    print(f"record count: {record_count}")   
    
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    COLLECTION_NAME = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=COLLECTION_NAME)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )
    record_count = COLLECTION_NAME.count()
    print(f"record count: {record_count}")

Looking for the CLINICAL_RAG collection in the database...
CLINICAL_RAG collection WAS NOT FOUND in Chroma DB, creating...
Creating vector store...
Creating vector store index
record count: 201


###  **Creating Index** from stored Nodes

In [34]:
index = VectorStoreIndex.from_vector_store(vector_store)  

### (optional) add record(s)
no error thrown if records are dupes

In [35]:
print(f"record_count as is: {record_count}")

record_count as is: 201


In [36]:
# new unrelateds trial
list_of_nct_id = [
    "NCT00108953", # Carcinoma, Hepatocellular
]

downloaded_json = get_downloaded_json(list_of_nct_id)
documents_list = list_from_extracted_json(downloaded_json)

In [37]:
# using previous functions 
llama_documents = create_llama_docs(documents_list)
nodes = create_nodes(llama_documents)
index.insert_nodes(nodes)

In [38]:
try:
    print(f"new DB record count: {chroma_collection.count()}")
    record_count = chroma_collection.count()
except:
    print(f"established DB record count: {COLLECTION_NAME.count()}")
    record_count = COLLECTION_NAME.count()

new DB record count: 204


### **Querying** — example with ChatBot

In [39]:
memory = ChatMemoryBuffer.from_defaults(token_limit=10_000) #<== adjust
# chat_engine.reset() 

chat_engine = index.as_chat_engine(
    similarity_top_k=3, #<== adjust
    chat_mode="condense_plus_context", 
    memory=memory,
    llm=llm,
    context_prompt=(
        """
        You are a chatbot which is expert in parsing information.
        When asked a question, provide a complete response, concisely.
        """
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use only the context above or this chat history to respond."
    ),
    verbose=False,
)

In [40]:
# chat_engine.reset() 

In [41]:
def ask_chatbot(query):
    response = chat_engine.chat(query)
    display(Markdown(chat_engine.chat(query).response))
    # return response

In [42]:
query = "Provide brief title(s) for studies involving newborn babies?"
ask_chatbot(query)

1. Protocol to Monitor the Neurological Development of Infants With Exposure in Utero From Birth to 15 Months in Tanezumab Clinical Studies
2. A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn

In [43]:
query = "What was the enrollment count of the trial with Sildenafil?"
ask_chatbot(query)
# True answer: 59

The enrollment count of the trial with Sildenafil was 59 participants.

In [44]:
query = "What was the time frame for the primary oucome of this trial?"
ask_chatbot(query)
# True answer: 
# Time in days, on iNO treatment, for participants without iNO treatment failure, was calculated 14 days from the initiation of IV study drug or hospital discharge, whichever occurred first. 
# iNO treatment failure was defined as need for additional treatment targeting PPHN, need for extra corporeal membrane oxygenation (ECMO), or death during the study.

The time frame for the primary outcome of the trial was 14 days from the initiation of the IV study drug or hospital discharge, whichever occurred first.

In [45]:
query = "What was EudraCT number of this trial?"
ask_chatbot(query)
# True answer: "2012-002619-24"

The EudraCT number of this trial was 2012-002619-24.

In [46]:
query = "What conditions are mentioned in this trial invoving Sildenafil?"
ask_chatbot(query)
# True answer: 	"Pulmonary Hypertension, Familial Persistent, of the Newborn"

The conditions mentioned in this trial involving Sildenafil are "Pulmonary Hypertension, Familial Persistent, of the Newborn."

In [47]:
query = "How did participants receive the treatment?"
ask_chatbot(query)
# True answer: 	"Part A: Participants received sildenafil intravenously based on their body weight at a loading dose of 0.1 milligrams per kg (mg/kg) for 30 minutes on Day 1 
# followed by a maintenance dose of 0.03 milligrams per kg per hour (mg/kg/hr), for a minimum of 2 days and maximum of 14 days. 
# Infusion continuation was upon investigator discretion in view of participants' safety and well-being. 
# Part B: Participants who started Part A (not necessarily completed Part A) and who were eligible and consented, continued to be followed up in part B of the study."

Participants received the treatment either as IV Sildenafil or placebo intravenously based on their body weight, with a loading dose followed by a maintenance dose for a minimum of 2 days and a maximum of 14 days.

In [48]:
query = "What statistical method used this trial invoving Sildenafil?"
ask_chatbot(query)
# True answer: "ANCOVA"
# ???

The statistical method used in this trial involving Sildenafil was ANCOVA (Analysis of Covariance).

In [49]:
query = "What p-value was reported for this statistical method?"
ask_chatbot(query)
# True answer: 	0.9850
# ???

The reported p-value for the statistical method ANCOVA in this trial was 0.9850.

##### Reset the ChatBot

In [50]:
chat_engine.reset() 

In [51]:
query = "Do any trials mention Leukemia? Which are they?"
ask_chatbot(query)
# True answer: "NCT02130557",  "NCT02367456", possibly more

Yes, two trials mention Leukemia:
1. A Multicenter Phase 3, Open-Label Study of Bosutinib Versus Imatinib in Adult Patients With Newly Diagnosed Chronic Phase Chronic Myelogenous Leukemia (NCT02130557)
2. A Study Evaluating Intensive Chemotherapy With or Without Glasdegib or Azacitidine With or Without Glasdegib In Patients With Previously Untreated Acute Myeloid Leukemia (NCT03416179)

In [52]:
query = "What is the NCT ID of the trial mentioning Chronic Myelogenous Leukemia?"
ask_chatbot(query)
# True answer: "NCT02130557"

The NCT ID of the trial mentioning Chronic Myelogenous Leukemia is NCT02130557.

In [53]:
query = "Provide a detailed description of this specific trial."
ask_chatbot(query)


The trial with NCT ID NCT02130557 is a multicenter Phase 3, open-label study comparing Bosutinib versus Imatinib in adult patients with newly diagnosed Chronic Phase Chronic Myelogenous Leukemia. The study aims to evaluate the efficacy and safety of Bosutinib, a tyrosine kinase inhibitor, compared to Imatinib, a standard treatment for Chronic Myelogenous Leukemia, in this patient population. Patients enrolled in the trial will be randomly assigned to receive either Bosutinib or Imatinib and will be monitored for the response to treatment and any adverse effects. The primary objective is to assess the major molecular response rate at 12 months. This trial is crucial for determining the optimal treatment approach for patients with Chronic Phase Chronic Myelogenous Leukemia.

In [54]:
query = "Who was the lead sponsor of NCT02130557?"
ask_chatbot(query)
# True answer: Pfizer

The lead sponsor of the trial with NCT ID NCT02130557, which compares Bosutinib versus Imatinib in adult patients with Chronic Phase Chronic Myelogenous Leukemia, is Pfizer.

In [55]:
query = "What intervention was studied in NCT02130557?"
ask_chatbot(query)
# True answer: Bosutinib

In the trial with NCT ID NCT02130557, the intervention studied was the comparison of Bosutinib versus Imatinib in adult patients with newly diagnosed Chronic Phase Chronic Myelogenous Leukemia.

In [56]:
query = "What was the enrollment count this trial with Bosutinib?"
ask_chatbot(query)
# True answer: 536

The enrollment count for the trial with Bosutinib in NCT02130557 was 536 participants.

##### Reset the ChatBot

In [57]:
chat_engine.reset() 

In [58]:
query = "Were Japanese participants involved in any trial?"
ask_chatbot(query)
# True answer: Yes, Japanese participants were involved in the trial with the National Clinical Identification NCT ID NCT04350606.

Yes, Japanese participants were involved in the trial with National Clinical Identification NCT ID NCT04350606. The study assessed the efficacy and safety of PF-06462700 in Japanese participants with moderate and above aplastic anemia.

In [59]:
query = "What was the p-value of the trial with Japanese participants?"
ask_chatbot(query)
# True answer: The p-value for the trial with Japanese participants (National Clinical Identification NCT ID NCT04350606) is not available in the provided information.

The p-value for the trial with Japanese participants is not available in the provided information.

<span style="color: blue; font-size: 30px;">Evaluation

### Using **CallbackManager & LlamaDebugHandler**

In [60]:
STOPPING HERE 

SyntaxError: invalid syntax (141466560.py, line 1)

In [None]:
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

# initiaalize debug handler and save to Settings
llm_debug = LlamaDebugHandler()
callback_manager = CallbackManager([llm_debug])
Settings.callback_manager = callback_manager

In [None]:
# helper function
def print_in_out(in_out):
	print(in_out[0][0].payload['messages'][0])
	print("\n\n")
	print(in_out[0][0].payload['messages'][1])

# whenever query_engine.query() is called
query_engine = index.as_query_engine()
query = "How many NCT ID related to Leukemia? Which are they?"
response = query_engine.query(query)
print(response)

# print LLM debug info 
in_out = llm_debug.get_llm_inputs_outputs()
print_in_out(in_out)