## Imports

In [1]:
from dotenv import load_dotenv
import json
import os
import pandas as pd
import pprint
import requests as req
import urllib.request

import chromadb

import openai

from llama_cpp import Llama
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import PromptTemplate
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.nomic import NomicEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.vector_stores.chroma import ChromaVectorStore

from utils_15B import extract_from_json

In [2]:
%reload_ext watermark
%watermark -v -p llama_index.core

# Python implementation: CPython
# Python version       : 3.11.7
# IPython version      : 8.20.0

# llama_index.core: 0.10.12

Python implementation: CPython
Python version       : 3.10.11
IPython version      : 8.22.0

llama_index.core: 0.10.12


In [3]:
! pip list | grep ^l

# llama_cpp_python                         0.2.53
# llama-index-core                         0.10.12
# llama-index-embeddings-openai            0.1.6
# llama-index-llms-llama-cpp               0.1.3
# llama-index-llms-openai                  0.1.6
# llama-index-vector-stores-chroma         0.0.1
# llamaindex-py-client                     0.1.13

'grep' is not recognized as an internal or external command,
operable program or batch file.


## Verify API tokens are available

In [2]:
load_dotenv()  # This loads the variables from .envz
nomic_api_key = os.getenv("NOMIC_API_KEY")
# print(nomic_api_key)

## (Optional) Remove previous JSON files and Chroma DB before starting
<span style="color: darkred; font-size: 18px;">using macOS/Linux %%bash

In [None]:
%%bash
rm -rf chroma_db

In [None]:
%%bash
find ./ -type f -name "*.json" -delete

## Model

###  LlamaIndex helper callback functions to add the system prompt and wrap in special tokens

In [3]:
from typing import List
from llama_index.core.base.llms.types import MessageRole

BOS, EOS = "<s>", "</s>"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
- Generate human readable output, avoid creating output with gibberish text.
- Generate only the requested output, don't include any other language before or after the requested output.
- Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
- Generate professional language typically used in business documents in North America.
- Never generate offensive or foul language.
"""


def messages_to_prompt_callback(messages, system_prompt=None) -> str:
    string_messages: List[str] = []
    if messages[0].role == MessageRole.SYSTEM:
        # pull out the system message (if it exists in messages)
        system_message_str = messages[0].content or ""
        messages = messages[1:]
    else:
        system_message_str = system_prompt or DEFAULT_SYSTEM_PROMPT

    system_message_str = f"{B_SYS} {system_message_str.strip()} {E_SYS}"

    for i in range(0, len(messages), 2):
        # first message should always be a user
        user_message = messages[i]
        assert user_message.role == MessageRole.USER

        if i == 0:
            # make sure system prompt is included at the start
            str_message = f"{BOS} {B_INST} {system_message_str} "
        else:
            # end previous user-assistant interaction
            string_messages[-1] += f" {EOS}"
            # no need to include system prompt
            str_message = f"{BOS} {B_INST} "

        # include user message content
        str_message += f"{user_message.content} {E_INST}"

        if len(messages) > (i + 1):
            # if assistant message exists, add to str_message
            assistant_message = messages[i + 1]
            assert assistant_message.role == MessageRole.ASSISTANT
            str_message += f" {assistant_message.content}"

        string_messages.append(str_message)

    return "".join(string_messages)

def completion_to_prompt_callback(completion, system_prompt=None) -> str:
    system_prompt_str = system_prompt or DEFAULT_SYSTEM_PROMPT

    return (
        f"{BOS} {B_INST} {B_SYS} {system_prompt_str.strip()} {E_SYS} "
        f"{completion.strip()} {E_INST}"
    )

### Llama 2 model

#### Dmitry's CPP code

In [4]:
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt
from llama_index.llms.llama_cpp import LlamaCPP

LLAMA2_7B_CHAT = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf"

selected_model = LLAMA2_7B_CHAT

llm = LlamaCPP(
    model_url=selected_model,
    temperature=0.1,
    max_new_tokens=2048,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    messages_to_prompt=messages_to_prompt_callback,
    completion_to_prompt=completion_to_prompt_callback
)

llm.verbose = False

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from C:\Users\dimson\AppData\Local\llama_index\models\llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                

### using LlamaIndex reference

## Embedding

In [5]:
# using this as baseline standard
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [6]:
Settings.llm = llm
Settings.embed_model = embed_model

## Fetch data from "specific" clinicaltrials.gov
<span style="color: darkred; font-size: 18px;"> source: https://drive.google.com/file/d/1HOsN3v8DLzwoMOXOr_Mfb1Hn6XNwlZ72/view?usp=sharing

In [7]:
def get_trial(nct_id):
    trial = req.get(f"https://clinicaltrials.gov/api/v2/studies/{nct_id}")
    trial_json = trial.json()
    return trial_json

In [8]:
# Some trials to consider (interventional, completed):
# nct_id = "NCT00094887"
# nct_id = "NCT00108953"
# nct_id = "NCT00177671" 
# nct_id = "NCT00281918"
# nct_id = "NCT00404079"
# nct_id = "NCT00426751"
# nct_id = "NCT01865747" #<== good one 


### Use just one one trial


In [9]:
# list_of_nct_id = [
#     "NCT00094887",
#     "NCT00108953",
#     "NCT00177671",
#     "NCT00281918",
#     "NCT00404079",
#     "NCT00426751",
#     "NCT01865747",
# ]

list_of_nct_id = [
    "NCT00108953",
]

In [10]:
def get_downloaded_json(list_of_nct_id):
    downloaded_json = []
    for nct_id in list_of_nct_id:
        trial = get_trial(nct_id)
        downloaded_json.append(trial)
        # save locally for reference
        with open(f"{nct_id}.json", "w") as f:
            json.dump(trial, f, indent=4)
    return downloaded_json

downloaded_json = get_downloaded_json(list_of_nct_id)
# downloaded_json[3]

## Extract a  subset of the data

In [11]:
def list_from_extracted_json(downloaded_json):    
    documents_list  = []
    for json_file in downloaded_json:
        extracted_json = extract_from_json(json_file)
        nct_id = json_file['protocolSection']['identificationModule']['nctId']
        # save manipulated JSON file to disk for review
        save_path = f"{nct_id}_extracted.json"
        with open(save_path, "w") as f:
            json.dump(extracted_json, f, indent=4)
        # prepare for indexing
        documents_list.append(extracted_json)
    return documents_list
    
documents_list = list_from_extracted_json(downloaded_json)
# len(documents_list)

In [12]:
# documents_list[0].keys() # useful later to adjust metadata

## Llama index

### add metadata

In [13]:
# all the keys (for metadata)
all_keys = list(documents_list[0].keys())
# all_keys

In [14]:
# to adjust the metadata keys used
llm_keys_to_incude = [
    "Brief title",
    "National Clinical Identification NCT ID",
    "Lead sponsor",
    "Arms group 0 intervention names",
    "Enrollment count",
]

llm_keys_to_exclude = [key for key in all_keys if key not in llm_keys_to_incude]

# for simplicity, do the same for embedding_keys_to_exclude (in this example)
embedding_keys_to_exclude = llm_keys_to_exclude

In [15]:
# NOTE:  metata data must be one of (str, int, float, None)
# use json.dumps() to convert lists and dictionaries into strings

def create_llama_docs(documents_list):
    llama_documents = []

    for trial in documents_list:
        trial["Brief title"] = json.dumps(trial["Brief title"])
        trial["Official title"] = json.dumps(trial["Official title"])
        trial["Brief summary"] = json.dumps(trial["Brief summary"])
        trial["Detailed description"] = json.dumps(trial["Detailed description"])
        trial["Arms group 0 intervention name"] = json.dumps(trial["Arms group 0 intervention name"])
        trial["Arms group 1 intervention name"] = json.dumps(trial["Arms group 1 intervention name"])
        trial["Eligibility minimum age"] = json.dumps(trial["Eligibility minimum age"])
        trial["Organization"] = json.dumps(trial["Organization"])

        # create a Llama Document object 
        # with text and excluded meta data for llm and embedding model
        llama_document = Document(
            text=trial["Detailed description"],
#             text=json.dumps(trial), #<== testing
            metadata=trial,
            excluded_llm_metadata_keys=llm_keys_to_exclude,
            excluded_embed_metadata_keys=embedding_keys_to_exclude ,
            metadata_template="{key}=>{value}",
            text_template="Metadata:\n{metadata_str}\n===========================\nContent: \n{content}"
        )
        llama_documents.append(llama_document)
    
    return llama_documents

llama_documents = create_llama_docs(documents_list)

In [16]:
# Example —LLM sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))

Metadata:
National Clinical Identification NCT ID=>NCT00108953
Brief title=>"A Research Study to Treat Patients With Advanced Hepatocellular Carcinoma"
Lead sponsor=>Bayer
Enrollment count=>96
Arms group 0 intervention names=>['Drug: Sorafenib (Nexavar, BAY43-9006) plus Doxorubicin']
Content: 
"In addition to the key secondary outcome parameters the following parameters will be assessed in an exploratory manner: relative time to progression (TTP), time to symptomatic progression (TTSP), response rate (RR) and overall survival between the 2 study populations.\n\nThe possible and potential predictive assays of clinical benefit through an assessment of the correlation between the defined baseline characteristics and key clinical endpoints.\n\nThe safety and tolerability will be assessed in the adverse event section. Doxorubicin pharmacokinetics in HCC patients treated with sorafenib versus placebo will be compared and the pharmacokinetic data will be correlated with doxorubicin-related ad

In [17]:
# Example — Embedding model sees this:
print(llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
National Clinical Identification NCT ID=>NCT00108953
Brief title=>"A Research Study to Treat Patients With Advanced Hepatocellular Carcinoma"
Lead sponsor=>Bayer
Enrollment count=>96
Arms group 0 intervention names=>['Drug: Sorafenib (Nexavar, BAY43-9006) plus Doxorubicin']
Content: 
"In addition to the key secondary outcome parameters the following parameters will be assessed in an exploratory manner: relative time to progression (TTP), time to symptomatic progression (TTSP), response rate (RR) and overall survival between the 2 study populations.\n\nThe possible and potential predictive assays of clinical benefit through an assessment of the correlation between the defined baseline characteristics and key clinical endpoints.\n\nThe safety and tolerability will be assessed in the adverse event section. Doxorubicin pharmacokinetics in HCC patients treated with sorafenib versus placebo will be compared and the pharmacokinetic data will be correlated with doxorubicin-related ad

## Create Nodes
<span style="color: darkred; font-size: 15px;">adjust chunk_size, chunk_overlap</span>

In [18]:
def create_nodes(llama_documents):
    parser = SentenceSplitter(chunk_size=2048,chunk_overlap=40) # <== adjust(default = 1024/20)
    nodes = parser.get_nodes_from_documents(llama_documents)

    for node in nodes:
        node_embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode=MetadataMode.EMBED)
        )
        node.embedding = node_embedding
        
    return nodes

nodes = create_nodes(llama_documents)

## Chroma

In [19]:
# Chroma DB collection name
COLLECTION_NAME = "CLINICAL_RAG"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )
if COLLECTION_NAME not in [col.name for col in db.list_collections()]:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    print("Creating vector store index")
    index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
    print(f"record count: {chroma_collection.count()}"     
    )
    
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    COLLECTION_NAME = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=COLLECTION_NAME)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )

    print(f"record count: {COLLECTION_NAME.count()}")

Looking for the CLINICAL_RAG collection in the database...
CLINICAL_RAG collection WAS NOT FOUND in Chroma DB, creating...
Creating vector store...
Creating vector store index
record count: 1


### Query Engine

In [20]:
query_engine = index.as_query_engine()

In [21]:
response = query_engine.query("Are there any clinical trials about liver disease?")

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1

llama_print_timings:        load time =    3165.46 ms
llama_print_timings:      sample time =      37.75 ms /   242 runs   (    0.16 ms per token,  6409.92 tokens per second)
llama_print_timings: prompt eval time =    3164.95 ms /   485 tokens (    6.53 ms per token,   153.24 tokens per second)
llama_print_timings:        eval time =   42591.66 ms /   241 runs   (  176.73 ms per token,     5.66 tokens per second)
llama_print_timings:       total time =   46476.68 ms /   726 tokens
