## Imports

In [1]:
from dotenv import load_dotenv 

#import evaluate # for ROUGE metric
import json
import os
import requests as req
import time

from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.embeddings.nomic import NomicEmbedding

from utils_3 import safe_get, extract_from_json, flatten_data

## BioBERT

In [2]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint shared by the tokenizer and the model.
# reference:  https://huggingface.co/bvanaken/CORe-clinical-outcome-biobert-v1
BIOBERT_CHECKPOINT = "bvanaken/CORe-clinical-outcome-biobert-v1"

tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

## Verify API tokens are available

In [3]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()
nomic_api_key = os.getenv("NOMIC_API_KEY")

# Actually verify that the key is present, without ever echoing its value.
# Deliberately non-fatal: this only prints a warning and does not stop execution.
if not nomic_api_key:
    print("WARNING: NOMIC_API_KEY is not set; add it to your .env file or environment.")

In [4]:
# Nomic embedding client, authenticated with the key read from the environment.
embed_model = NomicEmbedding(
    model_name="nomic-embed-text-v1.5",
    dimensionality=128,
    api_key=nomic_api_key,
)

# One embedding request as a quick check that the client works; the resulting
# vector is not used by any other cell visible in this notebook.
embedding = embed_model.get_text_embedding("Nomic Embeddings")

# Keep the BioBERT model reachable under the name `llm`. It is NOT registered
# with LlamaIndex: the Settings.llm assignment below is intentionally disabled.
llm = model

# Settings.llm = llm
Settings.embed_model = embed_model

## Fetch data for a specific trial from clinicaltrials.gov
<span style="color: darkred; font-size: 18px;"> source: https://drive.google.com/file/d/1HOsN3v8DLzwoMOXOr_Mfb1Hn6XNwlZ72/view?usp=sharing </span>

In [5]:
# Select the clinical trial to analyse; uncomment one of the alternatives to switch.
# nct_id = "NCT00094887"
# nct_id = "NCT00108953"
nct_id = "NCT00177671"
# nct_id = "NCT00281918"
# nct_id = "NCT00404079"
# nct_id = "NCT00426751"
# nct_id = "NCT01865747" #<== good one

# Download the study record and name the parsed JSON "clinical_study".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of parsing and
# saving an error body as if it were study data.
response = req.get(f"https://clinicaltrials.gov/api/v2/studies/{nct_id}", timeout=30)
response.raise_for_status()
clinical_study = response.json()

# save the full JSON locally for review (e.g., jsonhero.io)
with open(f"{nct_id}.json", "w") as f:
    json.dump(clinical_study, f, indent=4)

## Extract a subset of the data

In [6]:
# Reduce the downloaded study record to a subset via the project helper
# utils_3.extract_from_json (which fields it keeps is not visible in this notebook).
extracted_json = extract_from_json(clinical_study)

In [7]:
# Persist the extracted subset as pretty-printed JSON so it can be reviewed by hand.
save_path = f"{nct_id}_extracted.json"
with open(save_path, "w") as f:
    f.write(json.dumps(extracted_json, indent=4))

In [8]:
# Prepare for indexing: flatten the extracted JSON with the project helper
# utils_3.flatten_data. The result is iterated as individual text strings by
# the cells that write trial_info.txt and build the Document list.
trial_info =  flatten_data(extracted_json, path="")

In [9]:
# Save the flattened text items to disk for review, one item per line.
# UTF-8 is requested explicitly: the platform default encoding (e.g. cp1252 on
# Windows) can fail on any non-ASCII characters that appear in the trial text.
with open("trial_info.txt", "w", encoding="utf-8") as file:
    for item in trial_info:
        file.write(item + "\n")

## LlamaIndex RAG 

In [10]:
# Wrap every flattened text item in its own LlamaIndex Document.
documents = [Document(text=line) for line in trial_info]

In [11]:
# Create the vector store index that is used to find relevant documents.
# Built from `documents`; Settings.embed_model was set to the Nomic embedder earlier.
index = VectorStoreIndex.from_documents(documents)

In [12]:
# Query engine over the index. similarity_top_k is set explicitly to 5 here
# (adjusted from calling index.as_query_engine() with no arguments).
query_engine = index.as_query_engine(similarity_top_k=5)# <== adjusted this

### quick test

In [13]:
def get_response(query, prompt_1, prompt_2, engine=None):
    """Answer `query`, then print two rephrasings of that answer.

    The question is sent to the query engine first. The text of that answer is
    appended to `prompt_1` and to `prompt_2`, and each combined prompt is sent
    to the same engine as a new query. All three results are printed; nothing
    is returned.

    Args:
        query: Question to ask the query engine.
        prompt_1: Instruction prefix for the "Plain Language Summary" output.
        prompt_2: Instruction prefix for the "Expert Analysis" output.
        engine: Optional object exposing ``query(str)``. Defaults to the
            notebook-level ``query_engine``, so existing three-argument calls
            behave as before.
    """
    if engine is None:
        # Fall back to the notebook's global engine for backward compatibility.
        engine = query_engine

    response = engine.query(query)
    print(f"Original response:\n{response}")

    answer_text = response.response
    if not answer_text:
        # Nothing to rephrase; concatenating None onto a prompt would raise TypeError.
        print("\nNo answer text was returned; skipping the rephrasing steps.")
        return

    plain_summary = engine.query(prompt_1 + answer_text)
    print(f"\nPlain Language Summary:\n{plain_summary}")

    expert_analysis = engine.query(prompt_2 + answer_text)
    print(f"\nExpert Analysis:\n{expert_analysis}")


#### enhanced prompts

In [14]:
# Instruction prefix used to request a lay-audience rewording of an answer.
PLS_prompt = (
    "Using everyday language to make the clinical results of a study "
    "meaningful and understandable to a lay person, rephrase this: "
)

# Instruction prefix used to request an expert-level elaboration of an answer.
expert_prompt = (
    "Emulate a PhD scientist and expert statistician "
    "to elaborate on the following: "
)

#### example 1

In [15]:
# Example 1: ask about the condition and treatment studied; get_response prints
# the original answer followed by the plain-language and expert rephrasings.
query_1 = "For which medical condition and treatment was this study done?"
get_response(query_1, PLS_prompt, expert_prompt)

Original response:
The study was conducted to investigate the effectiveness of combining antidepressant medication (escitalopram, venlafaxine, or duloxetine) with donepezil, a medication used in Alzheimer's Disease, in improving memory, concentration, attention, and problem-solving abilities, as well as reducing the risk of depressive relapse in older individuals with depression.

Plain Language Summary:
The study aimed to see if combining certain antidepressant medications with a drug used for Alzheimer's Disease could help older people with depression by improving memory, focus, attention, problem-solving skills, and lowering the chances of depressive symptoms returning.

Expert Analysis:
The study aimed to assess the efficacy of combining antidepressant medication (escitalopram, venlafaxine, or duloxetine) with donepezil in enhancing cognitive functions such as memory, concentration, attention, and problem-solving skills in older individuals with depression. Additionally, it sought 

#### example 2

In [16]:
# Example 2: ask whether a p-value was reported and, if so, what it is.
query_2 = "Yes or no, was p-value reported?. What is the p-value?"
get_response(query_2, PLS_prompt, expert_prompt)

Original response:
No, the p-value was not reported in the provided context information.

Plain Language Summary:
No statistical significance level was mentioned in the information provided.

Expert Analysis:
In the provided context information, the p-value was not explicitly mentioned or reported. The absence of the p-value makes it challenging to assess the statistical significance of the results or the likelihood that the observed differences between groups occurred by chance. The p-value is a crucial statistical measure that helps determine the strength of evidence against the null hypothesis. Its absence in the context limits a comprehensive interpretation of the statistical significance of the findings presented.


#### example 3 

In [17]:
# Example 3: ask whether a hazard ratio was reported and, if so, its value.
query_3 = (
    "Yes or no, was a hazard ratio reported? "
    "If a hazard ratio was reported, what was its value?"
)
get_response(query_3, PLS_prompt, expert_prompt)

Original response:
No, a hazard ratio was not reported in the context information provided.

Plain Language Summary:
No, they did not provide a hazard ratio in the information given.

Expert Analysis:
No, a hazard ratio was not reported in the context information provided.


#### example 4

In [18]:
# Example 4: ask which condition the trial mentions.
query_4 = "What condition is mentioned in this trial?"
get_response(query_4, PLS_prompt, expert_prompt)

Original response:
Major depression

Plain Language Summary:
Feeling very sad or down for a long period of time

Expert Analysis:
Major depression is a condition characterized by persistent feelings of sadness, hopelessness, and a loss of interest in activities that were once enjoyable. It is a common mental health disorder that can significantly impact an individual's daily functioning and quality of life. In the context of the research study described, major depression is the primary focus of investigation, particularly in elderly patients aged 65 and above. The study aims to explore the effectiveness of combining antidepressant medication with a medication used in Alzheimer's Disease to improve cognitive functioning and reduce the risk of depressive symptoms returning in this population. The study emphasizes the importance of addressing cognitive impairment in late-life depression, as it is a key aspect of the illness that contributes to disability and impaired quality of life. The 