## Imports

In [1]:
from dotenv import load_dotenv 

#import evaluate # for ROUGE metric
import json
import os
import requests as req
import time

from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.embeddings.nomic import NomicEmbedding

from utils_3 import safe_get, extract_from_json, flatten_data

## BioBERT

In [2]:
from transformers import AutoTokenizer, AutoModel

# reference:  https://huggingface.co/bvanaken/CORe-clinical-outcome-biobert-v1
# The tokenizer and the model are loaded from the same Hugging Face checkpoint,
# so the identifier is spelled out once and reused for both calls.
BIOBERT_CHECKPOINT = "bvanaken/CORe-clinical-outcome-biobert-v1"

tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

## Verify API tokens are available

In [3]:
load_dotenv()  # This loads the variables from the .env file
nomic_api_key = os.getenv("NOMIC_API_KEY")

# Actually verify the token: os.getenv returns None when the variable is unset,
# and an empty value is just as unusable. Only a warning is printed (never the
# key itself) so that the rest of the notebook can still be run if credentials
# are supplied some other way.
if not nomic_api_key:
    print("WARNING: NOMIC_API_KEY is not set; check your .env file or environment.")

In [4]:
# Configure the Nomic embedding model using the API key read from the environment.
# NOTE(review): dimensionality=128 presumably selects a reduced embedding size — confirm
# this is the intended trade-off for retrieval quality.
embed_model = NomicEmbedding(
    api_key=nomic_api_key,
    dimensionality=128,
    model_name="nomic-embed-text-v1.5",
)

# Embed a sample string. The result is not used again in the visible notebook;
# presumably this is a smoke test that the key and model work — TODO confirm.
embedding = embed_model.get_text_embedding("Nomic Embeddings")

# Bind the BioBERT model loaded earlier to the name `llm`.
llm = model

# The line that would register `llm` with LlamaIndex is commented out, so only the
# embedding model is registered on Settings below.
# NOTE(review): with Settings.llm unset, LlamaIndex presumably falls back to its
# default LLM for answering queries — confirm which model actually produces the answers.
# Settings.llm = llm
Settings.embed_model = embed_model

## Fetch data for a specific trial from ClinicalTrials.gov
<span style="color: darkred; font-size: 18px;">source: https://drive.google.com/file/d/1HOsN3v8DLzwoMOXOr_Mfb1Hn6XNwlZ72/view?usp=sharing</span>

In [5]:
# specify the clinical trial (leave exactly one nct_id uncommented)
# nct_id = "NCT00094887"
nct_id = "NCT00108953"
# nct_id = "NCT00177671"
# nct_id = "NCT00281918"
# nct_id = "NCT00404079"
# nct_id = "NCT00426751"

# Download the study record and name the parsed JSON "clinical_study".
# - timeout: requests waits indefinitely by default, which would hang the cell
#   if the server never answers.
# - raise_for_status(): stop on an HTTP error (e.g. an unknown NCT ID) instead of
#   trying to parse the error body and saving it to disk as if it were a study.
response = req.get(
    f"https://clinicaltrials.gov/api/v2/studies/{nct_id}",
    timeout=30,
)
response.raise_for_status()
clinical_study = response.json()
# clinical_study

# save the full JSON locally for review (e.g., jsonhero.io)
with open(f"{nct_id}.json", "w") as f:
    json.dump(clinical_study, f, indent=4)

## Extract a subset of the data

In [6]:
# Reduce the full study record to the subset returned by extract_from_json
# (helper imported from utils_3; its selection logic is not shown in this notebook).
extracted_json = extract_from_json(clinical_study)

In [7]:
# Write the extracted subset to "<nct_id>_extracted.json" so it can be reviewed on disk.
save_path = f"{nct_id}_extracted.json"
with open(save_path, mode="w") as extracted_file:
    json.dump(extracted_json, extracted_file, indent=4)

In [8]:
# Prepare for indexing: flatten the extracted JSON with flatten_data (from utils_3),
# starting from an empty path.
# The result is treated as an iterable of strings by the later cells, which write
# each item as a line of text and wrap each item in a Document.
trial_info =  flatten_data(extracted_json, path="")
# trial_info  # uncomment to inspect the flattened entries

In [9]:
# save text file to disk for review
# The encoding is set explicitly: without it, open() uses the platform's default
# encoding, and writing trial text that contains non-ASCII characters can raise
# UnicodeEncodeError on systems whose default is not UTF-8.
with open("trial_info.txt", "w", encoding="utf-8") as file:
    for item in trial_info:
        # one flattened entry per line
        file.write(item + "\n")

## LlamaIndex RAG 

In [10]:
# Wrap each flattened trial entry in its own LlamaIndex Document.
documents = [Document(text=entry) for entry in trial_info]

In [11]:
# Create the vector store index that is used to find the documents relevant to a query.
# It is built from the Document list created from trial_info.
index = VectorStoreIndex.from_documents(documents)

In [12]:
# Build the query engine on top of the index.
# similarity_top_k=5 is passed explicitly; this was adjusted from the earlier
# variant, which called index.as_query_engine() with no arguments.
query_engine = index.as_query_engine(similarity_top_k=5)

### quick test

In [13]:
def get_response(query, prompt_1, prompt_2, engine=None):
    """Answer a question about the trial and print two rephrasings of the answer.

    The question is sent to the query engine first. The text of that answer is
    then appended to each prompt prefix and sent through the same engine again,
    producing a "Plain Language Summary" and an "Expert Analysis".

    Args:
        query: The question to ask.
        prompt_1: Instruction prefix used for the plain-language rephrasing.
        prompt_2: Instruction prefix used for the expert-style elaboration.
        engine: Optional object with a ``query(str)`` method. When omitted, the
            notebook-level ``query_engine`` is used, as before.

    Returns:
        None. All three responses are printed, so calling this as the last
        statement of a cell does not add an extra ``Out[...]`` display.
    """
    # An explicit engine removes the hidden dependency on notebook state while
    # keeping existing three-argument calls working unchanged.
    active_engine = query_engine if engine is None else engine

    response = active_engine.query(query)
    print(f"Original response:\n{response}")

    # The response text may be None (presumably when the engine produced no
    # answer); fall back to "" so the concatenation below cannot raise TypeError.
    answer_text = response.response or ""

    follow_ups = (
        ("Plain Language Summary", prompt_1),
        ("Expert Analysis", prompt_2),
    )
    for label, prompt in follow_ups:
        follow_up = active_engine.query(prompt + answer_text)
        print(f"\n{label}:\n{follow_up}")


#### enhanced prompts

In [14]:
# Instruction prefixes passed to get_response as prompt_1 / prompt_2.
# Each ends with a trailing space because the first answer is appended directly to it.
PLS_prompt = (
    "Using everyday language to make the clinical results of a study "
    "meaningful and understandable to a lay person, rephrase this: "
)
expert_prompt = (
    "Emulate a PhD scientist and expert statistician "
    "to elaborate on the following: "
)

#### example 1

In [15]:
# Ask which condition and treatment the trial studied; get_response prints the
# original answer followed by the plain-language and expert-style versions.
query_1 = "For which medical condition and treatment was this study done?"
get_response(query_1, PLS_prompt, expert_prompt)

Original response:
The study was conducted for patients with advanced hepatocellular carcinoma (HCC) and the treatment involved evaluating the safety and efficacy of doxorubicin plus sorafenib versus doxorubicin plus placebo.

Plain Language Summary:
The study aimed to see if using a combination of doxorubicin and sorafenib is safer and more effective than using doxorubicin alone in patients with advanced liver cancer.

Expert Analysis:
The study conducted for patients with advanced hepatocellular carcinoma (HCC) aimed to assess the safety and efficacy of combining doxorubicin with sorafenib compared to doxorubicin with a placebo. The research focused on evaluating key secondary outcome parameters such as relative time to progression, time to symptomatic progression, response rate, and overall survival between the two study populations. Additionally, the study explored potential predictive assays of clinical benefit by analyzing the correlation between baseline characteristics and key 

#### example 2

In [16]:
# Ask whether a p-value was reported and, if so, what it is.
# NOTE(review): the query text contains "?." — left as-is because it is the exact
# prompt sent to the engine.
query_2 = """Yes or no, was p-value reported?. What is the p-value?"""
get_response(query_2, PLS_prompt, expert_prompt)

Original response:
Yes, the p-value was reported. The p-value is 0.016.

Plain Language Summary:
Yes, the study found that there is a statistically significant difference between the two groups with a p-value of 0.016.

Expert Analysis:
The reported p-value of 0.016 indicates that there is statistically significant evidence to reject the null hypothesis that the time to progression (TTP) is the same in both treatment groups. In other words, the probability of observing the data, or more extreme data, if the null hypothesis were true is 0.016. This suggests that there is a significant difference in TTP between the group receiving nexavar+doxorubicin and the group receiving placebo+doxorubicin.


#### example 3 

In [17]:
# Ask whether a hazard ratio was reported and, if so, for its value.
query_3 = """Yes or no, was a hazard ratio reported? If a hazard ratio was reported, what was its value?"""
get_response(query_3, PLS_prompt, expert_prompt)

Original response:
Yes, a hazard ratio was reported. The hazard ratio value was for nexavar+doxorubicin over placebo+doxorubicin.

Plain Language Summary:
Yes, the study showed that the combination of nexavar and doxorubicin had a certain advantage over using placebo and doxorubicin, as indicated by a specific ratio called the hazard ratio.

Expert Analysis:
In the analysis conducted, the hazard ratio was calculated to compare the treatment group receiving nexavar and doxorubicin to the group receiving placebo and doxorubicin. The hazard ratio provides a measure of the relative risk of an event occurring in one group compared to another over time. In this context, a hazard ratio value for nexavar+doxorubicin over placebo+doxorubicin was reported, indicating the relationship between the two treatment regimens in terms of their impact on the primary outcome of interest.


#### example 4

In [18]:
# Ask which condition the trial mentions.
query_4 = """What condition is mentioned in this trial?"""
get_response(query_4, PLS_prompt, expert_prompt)

Original response:
Hepatocellular carcinoma (HCC) is the condition mentioned in this trial.

Plain Language Summary:
This study is looking at a type of liver cancer called hepatocellular carcinoma (HCC).

Expert Analysis:
Hepatocellular carcinoma (HCC) is a primary malignancy of the liver that arises from hepatocytes. It is a significant global health issue, particularly in regions with high rates of chronic hepatitis B and C infections. HCC is known for its aggressive nature and poor prognosis, often diagnosed at advanced stages when treatment options are limited. The eligibility criteria outlined in the research study focus on selecting patients with advanced HCC who have not received prior local therapy or have shown specific responses to previous treatments. The study aims to evaluate the safety and efficacy of different treatment regimens, such as doxorubicin plus sorafenib versus doxorubicin plus placebo, in patients with advanced HCC. The study also includes exploratory assessme