## Imports

In [1]:
import json
import os
import time

import evaluate  # for ROUGE metric
import requests as req
from dotenv import load_dotenv
from llama_index.core import Document, Settings, VectorStoreIndex, settings
from llama_index.embeddings.nomic import NomicEmbedding

from utils_3 import extract_from_json, flatten_data, safe_get

## bioBert

In [2]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint used for both the tokenizer and the model weights.
# Model card: https://huggingface.co/bvanaken/CORe-clinical-outcome-biobert-v1
biobert_checkpoint = "bvanaken/CORe-clinical-outcome-biobert-v1"

tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
model = AutoModel.from_pretrained(biobert_checkpoint)

## Verify API tokens are available

In [3]:
# Load variables from the .env file into the process environment, then read
# the Nomic key from it (None when NOMIC_API_KEY is not set).
load_dotenv()
nomic_api_key = os.environ.get("NOMIC_API_KEY")

In [4]:
# Nomic embedding client: nomic-embed-text-v1.5 with 128-dimensional vectors.
embed_model = NomicEmbedding(
    api_key=nomic_api_key,
    dimensionality=128,
    model_name="nomic-embed-text-v1.5",
)

# Smoke test: embed one short string so a missing/invalid key fails in this
# cell rather than later during indexing.
embedding = embed_model.get_text_embedding("Nomic Embeddings")

# `model` is the BioBERT encoder loaded with AutoModel in the cell above.
# The `llm` alias is kept so any later cell referring to it still works.
llm = model

# Register the embedder on LlamaIndex's global `Settings` object.
# The lowercase `settings` name is the `llama_index.core.settings` *module*;
# assigning attributes on it only sets module attributes and leaves the
# LlamaIndex configuration untouched, so the Nomic embedder was never used.
Settings.embed_model = embed_model

# NOTE(review): BioBERT is deliberately NOT assigned to `Settings.llm`. A bare
# AutoModel is an encoder with no text-generation interface, and `Settings.llm`
# appears to accept only LlamaIndex LLM instances (confirm against the
# installed version). Answer synthesis therefore keeps using LlamaIndex's
# default LLM, which is what the previous no-op assignment resulted in anyway.

## Fetch data for a specific trial from clinicaltrials.gov
<span style="color: darkred; font-size: 18px;"> source: https://drive.google.com/file/d/1HOsN3v8DLzwoMOXOr_Mfb1Hn6XNwlZ72/view?usp=sharing </span>

In [5]:
# Specify the clinical trial (keep exactly one id uncommented).
# nct_id = "NCT00094887"
nct_id = "NCT00108953"
# nct_id = "NCT00177671"
# nct_id = "NCT00281918"
# nct_id = "NCT00404079"
# nct_id = "NCT00426751"
# nct_id = "NCT01865747" #<== good one

# Download the study record as JSON and name it "clinical_study".
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (e.g. an unknown id) into an
# exception instead of letting an error payload be parsed and saved as a study.
response = req.get(
    f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
    timeout=30,
)
response.raise_for_status()
clinical_study = response.json()

# Save the full JSON locally for review (e.g., jsonhero.io).
with open(f"{nct_id}.json", "w") as f:
    json.dump(clinical_study, f, indent=4)

## Extract a  subset of the data

In [6]:
# Reduce the full study record to the subset used downstream; the selection
# logic lives in utils_3.extract_from_json.
extracted_json = extract_from_json(clinical_study)

In [7]:
# Write the extracted subset to "<nct_id>_extracted.json" for manual review.
save_path = f"{nct_id}_extracted.json"
with open(save_path, "w") as out_file:
    json.dump(extracted_json, fp=out_file, indent=4)

In [8]:
# Prepare for indexing: flatten the nested extract into a sequence of strings
# (each entry is later written as one line and wrapped in one Document).
trial_info = flatten_data(extracted_json, path="")

In [9]:
# Save the flattened entries to disk for review, one entry per line.
with open("trial_info.txt", "w") as out_file:
    out_file.writelines(item + "\n" for item in trial_info)

## LlamaIndex RAG 

In [10]:
# Wrap every flattened entry in its own LlamaIndex Document.
documents = [Document(text=entry) for entry in trial_info]

In [11]:
# Create the vector store index that we use to find relevant documents.
index = VectorStoreIndex.from_documents(documents)

In [12]:
# Query engine over the index. similarity_top_k was hand-tuned to 5 rather
# than calling as_query_engine() with its defaults.
query_engine = index.as_query_engine(similarity_top_k=5)

### quick test

In [13]:
def get_response(query, prompt_1, prompt_2):
    """Answer `query`, then restate that answer twice, printing all three.

    The base answer comes from the module-level `query_engine`. Each prompt is
    prepended to the base answer's text and sent back through the same engine:
    `prompt_1` produces the "Plain Language Summary" and `prompt_2` the
    "Expert Analysis". Everything is printed; nothing is returned.
    """
    base_answer = query_engine.query(query)
    print(f"Original response:\n{base_answer}")

    labelled_prompts = (
        ("Plain Language Summary", prompt_1),
        ("Expert Analysis", prompt_2),
    )
    for label, prompt in labelled_prompts:
        # Both rewrites start from the *original* answer, not from each other.
        rewritten = query_engine.query(prompt + base_answer.response)
        print(f"\n{label}:\n{rewritten}")


#### enhanced prompts

In [14]:
# Instruction prefixes passed to get_response(), which prepends each one to
# the base answer before re-querying.
PLS_prompt = (
    "Using everyday language to make the clinical results of a study "
    "meaningful and understandable to a lay person, rephrase this: "
)
expert_prompt = (
    "Emulate a PhD scientist and expert statistician "
    "to elaborate on the following: "
)

#### example 1

In [15]:
# Ask which condition and treatment the study covers; prints the raw answer
# followed by the plain-language and expert rewrites.
query_1 = "For which medical condition and treatment was this study done?"
get_response(query_1, PLS_prompt, expert_prompt)

Original response:
The study was conducted for patients with advanced hepatocellular carcinoma, and the treatment involved a combination of BAY43-9006 and doxorubicin compared to doxorubicin alone.

Plain Language Summary:
The study looked at how effective and safe it was to give patients with advanced liver cancer a combination of two drugs, BAY43-9006 and doxorubicin, versus just using doxorubicin by itself.

Expert Analysis:
The study aimed to assess the safety and effectiveness of combining BAY43-9006 with doxorubicin in treating patients with advanced hepatocellular carcinoma, in comparison to using doxorubicin alone. The treatment regimen involved administering sorafenib (Nexavar, BAY43-9006) along with doxorubicin, with the intention of evaluating any potential benefits of this combination therapy over using doxorubicin as a monotherapy. The study design allowed for a direct comparison between the two treatment approaches, providing valuable insights into the potential synergist

#### example 2

In [16]:
# Ask whether a p-value was reported and, if so, its value.
query_2 = """Yes or no, was p-value reported?. What is the p-value?"""
get_response(query_2, PLS_prompt, expert_prompt)

Original response:
Yes, the p-value was reported. The p-value is 0.016.

Plain Language Summary:
Yes, the likelihood of the results occurring by chance is very low at 0.016.

Expert Analysis:
The reported p-value of 0.016 indicates that there is statistically significant evidence to reject the null hypothesis in favor of the alternative hypothesis. This suggests that the results are unlikely to have occurred by chance alone.


#### example 3 

In [17]:
# Ask whether a hazard ratio was reported and, if so, its value.
query_3 = """Yes or no, was a hazard ratio reported? If a hazard ratio was reported, what was its value?"""
get_response(query_3, PLS_prompt, expert_prompt)

Original response:
Yes, a hazard ratio was reported. Its value was 0.6.

Plain Language Summary:
Yes, the study showed that there was a 40% lower risk of the event occurring in one group compared to the other.

Expert Analysis:
Yes, a hazard ratio of 0.6 was reported, indicating that the risk of the event occurring in one group is 0.6 times the risk of the event occurring in another group. This suggests a certain level of association between the exposure and the outcome being studied.


#### example 4

In [18]:
# Ask only for the condition named in the trial record.
query_4 = """What condition is mentioned in this trial?"""
get_response(query_4, PLS_prompt, expert_prompt)

Original response:
Advanced Hepatocellular Carcinoma

Plain Language Summary:
Late-stage liver cancer

Expert Analysis:
Advanced Hepatocellular Carcinoma typically refers to a stage of liver cancer where the cancer has progressed and spread beyond the liver to other parts of the body. This stage is often characterized by a poorer prognosis and may require more aggressive treatment options such as targeted therapies, immunotherapy, or clinical trials. In advanced hepatocellular carcinoma, the focus is often on managing symptoms, improving quality of life, and potentially extending survival through personalized treatment plans based on the individual patient's health status and the specific characteristics of the cancer.
