## Imports

In [1]:
from dotenv import load_dotenv
import json
import os
import pandas as pd
import pprint
import requests as req

import chromadb

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

from utils_4 import extract_from_json

In [2]:
%reload_ext watermark
%watermark -v -p llama_index.core

# Python implementation: CPython
# Python version       : 3.11.7
# IPython version      : 8.20.0

# llama_index.core: 0.10.12

Python implementation: CPython
Python version       : 3.11.7
IPython version      : 8.20.0

llama_index.core: 0.10.12



## Verify API tokens are available

In [3]:
load_dotenv()  # Load environment variables from the local .env file

True

## Fetch data from "specific" clinicaltrials.gov
<span style="color: darkred; font-size: 18px;">source: https://drive.google.com/file/d/1HOsN3v8DLzwoMOXOr_Mfb1Hn6XNwlZ72/view?usp=sharing</span>

In [4]:
# Some trials to consider (interventional, completed):
# nct_id = "NCT00094887"
# nct_id = "NCT00108953"
# nct_id = "NCT00177671" 
# nct_id = "NCT00281918"
# nct_id = "NCT00404079"
# nct_id = "NCT00426751"
# nct_id = "NCT01865747" #<== good one 


In [5]:
def get_trial(nct_id, timeout=30):
    """Download one study record from the ClinicalTrials.gov v2 API.

    Args:
        nct_id: Study identifier interpolated into the request URL,
            e.g. "NCT01865747".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example an unknown NCT ID).
        requests.RequestException: On connection errors or timeouts.
    """
    response = req.get(
        f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error payload
    # to the extraction and indexing steps as if it were a trial.
    response.raise_for_status()
    return response.json()

In [6]:
# Identifiers of the studies to download (the same IDs listed as candidates above).
list_of_nct_id = [
    "NCT00094887",
    "NCT00108953",
    "NCT00177671",
    "NCT00281918",
    "NCT00404079",
    "NCT00426751",
    "NCT01865747",
]

# Raw API payloads, appended in the same order as list_of_nct_id.
downloaded_json = []
for nct_id in list_of_nct_id:
    trial = get_trial(nct_id)
    downloaded_json.append(trial)
    # Save the raw payload as <nct_id>.json in the working directory for reference.
    with open(f"{nct_id}.json", "w") as f:
        json.dump(trial, f, indent=4)

# downloaded_json[1]

## Extract a subset of the data

In [7]:
# Extracted records, one per downloaded trial, in download order.
documents_list  = []
for json_file in downloaded_json:
    # extract_from_json comes from utils_4; presumably it returns a flat,
    # JSON-serializable dict (it is dumped with json.dump below) — confirm in utils_4.
    extracted_json = extract_from_json(json_file)
    # The NCT ID is read from the raw payload and used only to name the output file.
    nct_id = json_file['protocolSection']['identificationModule']['nctId']
    # Save the extracted JSON to disk for review.
    save_path = f"{nct_id}_extracted.json"
    with open(save_path, "w") as f:
        json.dump(extracted_json, f, indent=4)
    # Keep the extracted record for indexing.
    documents_list.append(extracted_json)

In [8]:
# documents_list[0]

## Llama index

In [9]:
# Embedding model used for the documents and, later, the queries.
# `dimensions=256` is passed to the embedding constructor; presumably it
# requests shortened 256-dimensional vectors — confirm against the library docs.
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=256)

# No model name is given, so the library's default is used.
# TODO: pin the model explicitly so results are reproducible.
llm = OpenAI()

# Register both models on the llama-index global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

### add metadata
first try

In [10]:
# NOTE: "metadata" for nations is now a string, whereas countries was an actual list
# NOTE: metadata values must be one of (str, int, float, None)
# use json.dumps() to convert lists and dictionaries into strings

def create_llama_docs(documents_list):
    """Build one llama-index ``Document`` per extracted trial record.

    Selected fields are serialized with ``json.dumps`` before being stored as
    metadata. The serialization is done on a shallow copy of each record, so
    the dicts in ``documents_list`` are left untouched and calling this
    function (or re-running the cell) twice produces the same result instead
    of double-encoding the fields.

    Args:
        documents_list: Iterable of dicts, one per trial. Each dict must
            contain every key listed in ``json_encoded_keys`` below.

    Returns:
        A list of ``Document`` objects, in the same order as the input. The
        JSON-encoded brief title is the document text; the whole record is
        attached as metadata.

    Raises:
        KeyError: If a record lacks one of the JSON-encoded keys.
    """
    # Fields converted to JSON strings before being used as metadata.
    json_encoded_keys = (
        "Brief title",
        "Official title",
        "Brief summary",
        "Detailed description",
        "Arms group 0 intervention name",
        "Arms group 1 intervention name",
        "Eligibility minimum age",
        "Organization",
    )
    # Metadata keys hidden from both the LLM and the embedding model.
    hidden_keys = [
        "Brief title",
        "Detailed description",
        "Eligibility criteria: Inclusion Criteria",
    ]

    llama_documents = []
    for trial in documents_list:
        # Shallow copy: key order is preserved and the caller's dict is not mutated.
        metadata = dict(trial)
        for key in json_encoded_keys:
            metadata[key] = json.dumps(trial[key])

        llama_document = Document(
            text=metadata["Brief title"],
            metadata=metadata,
            excluded_llm_metadata_keys=list(hidden_keys),
            excluded_embed_metadata_keys=list(hidden_keys),
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
        )
        llama_documents.append(llama_document)

    return llama_documents

# Build the llama-index documents from the extracted trial records.
llama_documents = create_llama_docs(documents_list)

In [11]:
# Example —LLM sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata: National Clinical Identification NCT ID=>NCT00094887
Organization study identification=>INOT 36
EudraCT number=>None
Organization=>"Mallinckrodt"
Organization class=>INDUSTRY
Official title=>"A Prospective, Multicenter, Double-Blind, Randomized, Placebo-Controlled Study of Nitric Oxide for Inhalation in the Acute Treatment of Sickle Cell Pain Crisis"
Overall status=>COMPLETED
Start date=>2004-10
Primary completion date=>2008-12
Completion date=>2008-12
Verification date=>2017-10
Study first submitted date=>2004-10-28
Results first submitted date=>2009-12-04
Last update submitted date=>2020-01-17
Last update posted date=>2020-02-05
Lead sponsor=>Mallinckrodt
Lead sponsor class=>INDUSTRY
Brief summary=>"This study will examine whether nitric oxide (NO) gas can reduce the time it takes for pain to go away in patients who are in sickle cell crisis. NO is important in regulating blood vessel dilation, and consequently, blood flow. The gas is continuously produced by cells that lin

In [12]:
# Example — Embedding model sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata: National Clinical Identification NCT ID=>NCT00094887
Organization study identification=>INOT 36
EudraCT number=>None
Organization=>"Mallinckrodt"
Organization class=>INDUSTRY
Official title=>"A Prospective, Multicenter, Double-Blind, Randomized, Placebo-Controlled Study of Nitric Oxide for Inhalation in the Acute Treatment of Sickle Cell Pain Crisis"
Overall status=>COMPLETED
Start date=>2004-10
Primary completion date=>2008-12
Completion date=>2008-12
Verification date=>2017-10
Study first submitted date=>2004-10-28
Results first submitted date=>2009-12-04
Last update submitted date=>2020-01-17
Last update posted date=>2020-02-05
Lead sponsor=>Mallinckrodt
Lead sponsor class=>INDUSTRY
Brief summary=>"This study will examine whether nitric oxide (NO) gas can reduce the time it takes for pain to go away in patients who are in sickle cell crisis. NO is important in regulating blood vessel dilation, and consequently, blood flow. The gas is continuously produced by cells that lin

## Embedding

In [13]:
 def create_nodes(llama_documents, chunk_size=2560, chunk_overlap=32):
     """Split documents into nodes and attach an embedding to each node.

     Args:
         llama_documents: Documents to split.
         chunk_size: Chunk size passed to ``SentenceSplitter``.
         chunk_overlap: Chunk overlap passed to ``SentenceSplitter``.

     Returns:
         The list of nodes produced by the splitter, each with its
         ``embedding`` attribute set.
     """
     parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     nodes = parser.get_nodes_from_documents(llama_documents)
     if not nodes:
         return nodes

     # Embed exactly the text the embedding model is meant to see
     # (content plus the metadata that is not excluded for embedding).
     texts = [node.get_content(metadata_mode=MetadataMode.EMBED) for node in nodes]
     # One batched call instead of one embedding request per node.
     embeddings = embed_model.get_text_embedding_batch(texts)
     for node, embedding in zip(nodes, embeddings):
         node.embedding = embedding

     return nodes

# Split the documents into nodes and embed them; `nodes` is indexed in the Chroma step below.
nodes = create_nodes(llama_documents)

## Chroma

In [14]:
# Chroma DB collection name (kept as a plain string for the whole notebook).
COLLECTION_NAME = "CLINICAL_RAG"

# Persistent on-disk Chroma database in ./chroma_db.
db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )

# Some chromadb releases return Collection objects from list_collections() and
# others return plain names; getattr() accepts both — verify against the pinned version.
existing_collections = {getattr(col, "name", col) for col in db.list_collections()}

if COLLECTION_NAME not in existing_collections:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Re-apply the global model settings before indexing (same values as configured earlier).
    Settings.embed_model = embed_model
    Settings.llm = OpenAI()
    print("Creating vector store index")
    # Keep the handle so `index` is defined whichever branch runs.
    index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    # Bind the collection to its own name instead of overwriting COLLECTION_NAME,
    # so the constant stays a string and the cell can be re-run safely.
    chroma_collection = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )

print(f"record count: {chroma_collection.count()}")

Looking for the CLINICAL_RAG collection in the database...
CLINICAL_RAG collection WAS FOUND in Chroma DB
Restoring vector store index from the collection...
record count: 7


### Retrieve from Chroma DB and query

In [15]:
# (Re)create the index handle directly from the Chroma-backed vector store.
index = VectorStoreIndex.from_vector_store(vector_store)  

In [16]:
# Query engine over the index; similarity_top_k=3 is passed as the retrieval setting.
query_engine = index.as_query_engine(similarity_top_k=3) # <== adjust

# Ask a question and render the answer in the notebook.
query = """Which trial studied liver disease?"""
response = query_engine.query(query)
display_response(response)


**`Final Response:`** The trial that studied liver disease is the one titled "A Randomized Controlled Study of BAY43-9006 in Combination With Doxorubicin Versus Doxorubicin in Patients With Advanced Hepatocellular Carcinoma."

In [17]:
# Follow-up question; reuses the `query_engine` created in the previous cell.
query = """What was the NCT ID of tha trial which studied liver disease?"""
response = query_engine.query(query)
display_response(response)


**`Final Response:`** NCT ID: NCT00108953

In [18]:
# Re-creates the query engine with the same similarity_top_k=3 used earlier.
query_engine = index.as_query_engine(similarity_top_k=3)

# Two questions in one query: which trial, and its ID.
query = """Which trial studied anemia? What is the ID?"""
response = query_engine.query(query)
display_response(response)

**`Final Response:`** The trial that studied anemia is the one titled "A Prospective, Multicenter, Double-Blind, Randomized, Placebo-Controlled Study of Nitric Oxide for Inhalation in the Acute Treatment of Sickle Cell Pain Crisis" with the ID NCT00094887.

In [24]:
# Re-creates the query engine with the same similarity_top_k=3 used earlier.
query_engine = index.as_query_engine(similarity_top_k=3)

# NOTE: results for this query are not reproducible between runs:
#       the first run of this cell gave a poor response, while a second run
#       sometimes gave a better one.
#       Cause not established here — possibly LLM sampling; TODO investigate.
query = """Provide the NCT ID and a detailed description of a trial that studied depression?"""

response = query_engine.query(query)
display_response(response)

**`Final Response:`** I'm sorry, but I cannot provide the NCT ID and a detailed description of a trial that studied depression.

## Enhanced Prompts

In [20]:
# Prompt prefixes; each one is prepended to a previous answer to form a follow-up query.
# PLS = Plain Language Summary.
PLS_prompt = """Using everyday language to make the clinical results of a study meaningful and understandable to a lay person, rephrase this: """
expert_prompt = """Emulate a PhD scientist and expert statistician to elaborate on the following: """

In [21]:
def get_response(query, prompt_1, prompt_2):
    """Answer ``query``, then print two re-phrasings of that answer.

    The notebook-level ``query_engine`` is queried three times: once with
    ``query`` itself, then once per prompt prefix with the prefix prepended
    to the text of the first answer. All three answers are printed under
    fixed headings; nothing is returned.

    Args:
        query: Question sent to the query engine.
        prompt_1: Prefix for the follow-up printed as "Plain Language Summary".
        prompt_2: Prefix for the follow-up printed as "Expert Analysis".
    """
    original = query_engine.query(query)
    print(f"Original response:\n{original}")

    # Both follow-ups build on the text of the first answer, not on each other.
    answer_text = original.response
    follow_ups = (
        ("Plain Language Summary", prompt_1),
        ("Expert Analysis", prompt_2),
    )
    for heading, prefix in follow_ups:
        rephrased = query_engine.query(prefix + answer_text)
        print(f"\n{heading}:\n{rephrased}")


In [22]:
# Run the base query, then print the plain-language and expert re-phrasings of its answer.
query_1 = "Provide the NCT ID and a summary of a trial on the liver."
get_response(query_1, PLS_prompt, expert_prompt)

Original response:
NCT ID: NCT00108953
Summary: The trial focused on assessing the safety and efficacy of combining BAY43-9006 with doxorubicin versus using doxorubicin alone in patients with advanced hepatocellular carcinoma, aiming to evaluate Time to Progression (TTP), Overall Survival, Progression Free Survival (PFS), and other relevant outcomes in individuals with advanced liver cancer.

Plain Language Summary:
The trial examined the effectiveness of combining BAY43-9006 with doxorubicin compared to using doxorubicin alone in individuals with advanced liver cancer. The focus was on evaluating Time to Progression (TTP), Overall Survival, Progression Free Survival (PFS), and other key outcomes in patients with advanced hepatocellular carcinoma.

Expert Analysis:
The study with NCT ID NCT00108953 aimed to assess the safety and effectiveness of combining BAY43-9006 with doxorubicin versus using doxorubicin alone in patients with advanced hepatocellular carcinoma. The primary focus was