In [32]:
import json
import os
import re
import requests as req
import urllib.request
from dotenv import load_dotenv
from IPython.display import display, Markdown

import chromadb
import nest_asyncio

nest_asyncio.apply()

from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.core.schema import MetadataMode
#from llama_index.embeddings.nomic import NomicEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

import openai

from utils_17 import extract_from_json, flatten_dict, replace_double_newline, format_flattened_dict, pfizer_ncts
from loader_utils import *
load_dotenv() 
# nomic_api_key = os.getenv("NOMIC_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

In [6]:
%%bash
find ./ -type f -name "*.json" -delete

In [7]:
%%bash
rm -rf chroma_db

In [8]:
list_of_nct_id = pfizer_ncts

In [9]:
# fetch the JSON data for the trials
downloaded_json = get_downloaded_json(list_of_nct_id)
# downloaded_json[0] # check
len(downloaded_json) # check

247

In [10]:
documents_list = list_from_extracted_json(downloaded_json)
len(documents_list) # check

247

In [11]:
all_keys = max_keys(documents_list)
len(all_keys)

68

In [12]:
llm_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
]

# to exclude the keys not used by LLM
llm_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(llm_keys_to_exclude)

62

In [13]:
embedding_keys_to_incude = [
    "National Clinical Identification NCT ID",
    "Brief title",
    "Condition",
    "Conditions keywords",
    "Lead sponsor",
    "Arms group 0 intervention names",
]

# to exclude the keys not used by embedding
embedding_keys_to_exclude = adjust_metadata_keys(all_keys, llm_keys_to_incude)
len(embedding_keys_to_exclude)

62

In [14]:
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [15]:
llm3 = OpenAI(temperature=0.001, model="gpt-3.5-turbo", max_tokens=512)

In [16]:
# NOTE:  metata data must be one of (str, int, float, None)
def create_llama_docs(documents_list):
    """
    Converts a list of trial documents into LlamaIndex Document objects.
    """
    
    llama_documents = []  
    for trial in documents_list:
        # apply functions from utils to flatten JSON and create content similar to the example above
        content_text = format_flattened_dict(flatten_dict(trial))

        llama_document = Document(
            text=content_text, 
            metadata=trial, 
            excluded_llm_metadata_keys=llm_keys_to_exclude, #<== adjust?, TBD
            excluded_embed_metadata_keys=embedding_keys_to_exclude , #<== adjust?, TBD
            metadata_template="{key}=>{value}",
            text_template="Metadata:\n{metadata_str}\n===========================\nContent: \n{content}"
        )
        llama_documents.append(llama_document)  
    return llama_documents

llama_documents = create_llama_docs(documents_list)

In [17]:
def create_nodes(llama_documents):
    """
    Generates and embeds nodes from Llama documents.
    """
    parser = SentenceSplitter(chunk_size=1024,chunk_overlap=20) # <== adjust from default
    nodes = parser.get_nodes_from_documents(llama_documents)
    for node in nodes:
        node_embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode=MetadataMode.EMBED)
        )
        node.embedding = node_embedding
    return nodes

nodes = create_nodes(llama_documents)

In [18]:
# Chroma DB collection name
COLLECTION_NAME = "CLINICAL_RAG"

db = chromadb.PersistentClient(path="chroma_db")
print(f"Looking for the {COLLECTION_NAME} collection in the database..." )
if COLLECTION_NAME not in [col.name for col in db.list_collections()]:
    print(f"{COLLECTION_NAME} collection WAS NOT FOUND in Chroma DB, creating...")
    chroma_collection = db.create_collection(COLLECTION_NAME)
    print("Creating vector store...")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print("Creating vector store index")
    VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        store_nodes_override=True
    )
    record_count = chroma_collection.count()
    print(f"record count: {record_count}")   
    
else:
    print(f"{COLLECTION_NAME} collection WAS FOUND in Chroma DB")
    COLLECTION_NAME = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=COLLECTION_NAME)
    print("Restoring vector store index from the collection...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        store_nodes_override=True
    )
    record_count = COLLECTION_NAME.count()
    print(f"record count: {record_count}")

Looking for the CLINICAL_RAG collection in the database...
CLINICAL_RAG collection WAS NOT FOUND in Chroma DB, creating...
Creating vector store...
Creating vector store index
record count: 700


# Start of Evaluation

In [20]:
llm4 = OpenAI(model="gpt-4")

In [21]:
vector_index = VectorStoreIndex.from_vector_store(vector_store)  

In [22]:
query_engine = vector_index.as_query_engine(llm=llm3)

response_vector = query_engine.query("What were the primary inclusion criteria for neonates in the study evaluating the safety and efficacy of IV sildenafil in the treatment of persistent pulmonary hypertension of the newborn?")
response_vector_1 = query_engine.query("What are the inclusion criteria for patients participating in the study, specifically regarding their breast cancer diagnosis and treatment history?")

In [23]:
response_vector,response_vector.response

 'The primary inclusion criteria for neonates in the study were: they had to have persistent pulmonary hypertension of the newborn, be less than or equal to 96 hours old and more than or equal to 34 weeks gestational age, have an Oxygenation Index greater than 15 and less than 60, and be undergoing concurrent treatment with inhaled nitric oxide and at least 50% oxygen.')

In [24]:
response_vector.source_nodes[0].get_text()

'"National Clinical Identification NCT ID": "NCT01720524",\n"Organization study identification": "A1481316",\n"EudraCT number": "2012-002619-24",\n"Organization": "Pfizer",\n"Organization class": "INDUSTRY",\n"Brief title": "A Study To Evaluate Safety And Efficacy Of IV Sildenafil In The Treatment Of Neonates With Persistent Pulmonary Hypertension Of The Newborn",\n"Official title": "A MULTI-CENTRE, RANDOMIZED, PLACEBO-CONTROLLED, DOUBLE-BLIND, TWO-ARMED, PARALLEL GROUP STUDY TO EVALUATE EFFICACY AND SAFETY OF IV SILDENAFIL IN THE TREATMENT OF NEONATES WITH PERSISTENT PULMONARY HYPERTENSION OF THE NEWBORN (PPHN) OR HYPOXIC RESPIRATORY FAILURE AND AT RISK FOR PPHN, WITH A LONG TERM FOLLOW-UP INVESTIGATION OF DEVELOPMENTAL PROGRESS 12 AND 24 MONTHS AFTER COMPLETION OF STUDY TREATMENT",\n"Overall status": "COMPLETED",\n"Start date": "2013-08-05",\n"Primary completion date": "2018-10-17",\n"Completion date": "2020-09-28",\n"Verification date": "2021-08",\n"Study first submitted date": "201

## Creating a Question / Context Dataset for Evaluation

In [26]:
from llama_index.core.evaluation import generate_question_context_pairs
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm3,
    num_questions_per_chunk=2
)

100%|██████████| 700/700 [19:45<00:00,  1.69s/it]


In [27]:
qa_dataset.save_json("pg_eval_dataset.json")
#qa_dataset = EmbeddingQAFinetuneDataset.from_json("pg_eval_dataset.json")

In [30]:
queries = qa_dataset.queries.values()
print(list(queries)[19])

Describe the treatment regimens for the two recruitment groups in the study, including dosages, administration methods, and cycle durations.


In [87]:
queries

['What is the primary purpose of the study described in the context information?',
 'How many arms groups were involved in the study, and what were the interventions for each group?',
 'Explain the primary outcome of the study comparing adjuvant treatment with exemestane versus tamoxifen therapy in postmenopausal, receptor positive, node negative or node positive breast cancer patients. What is the time frame for assessing this outcome?',
 'Discuss the eligibility criteria for inclusion and exclusion of patients in the study comparing exemestane and tamoxifen therapy for adjuvant treatment of breast cancer. Why were patients with ER and PR negative primary tumors or unknown ER/PR status excluded from the study?',
 'What was the primary purpose of the randomized trial conducted by Pfizer in postmenopausal women with early breast cancer who had already received adjuvant tamoxifen for 2-3 years?',
 'Describe the primary outcome measured in the study, including how Disease-Free Survival (D

In [31]:
sample_id, sample_query = list(qa_dataset.queries.items())[0]

# get the relevant document IDs for the sample question
relevant_doc_ids = qa_dataset.relevant_docs[sample_id]

# get the text content of the relevant documents
relevant_contexts = [qa_dataset.corpus[doc_id] for doc_id in relevant_doc_ids]

# print the question and relevant contexts
print(f"Question: {sample_query}")
for context in relevant_contexts:
    print(f"Relevant Context: {context[:1000]}...")

Question: What is the primary purpose of the study described in the context information?
Relevant Context: "National Clinical Identification NCT ID": "NCT00036270",
"Organization study identification": "971-ONC-0028-081",
"EudraCT number": "A5991026",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen for 2.5 to 3 Years Followed By Exemestane",
"Official title": "Randomized Phase III Study Of Exemestane (Aromasin) For 5 Years Versus Tamoxifen For 2.5- 3 Years Followed By Exemestane (Aromasin) For A Total Of 5 Years As Adjuvant Therapy For Postmenopausal, Receptor Positive, Node Negative or Node Positive Breast Cancer Patients",
"Overall status": "COMPLETED",
"Start date": "2001-08",
"Primary completion date": "2008-10",
"Completion date": "2011-02",
"Verification date": "2015-10",
"Study first submitted date": "2002-05-08",
"Results first submitted date": "2009-10-30",
"Last update

In [33]:
retriever = vector_index.as_retriever(similarity_top_k=2)

In [34]:
retrieved_nodes = retriever.retrieve("What are the key inclusion criteria for patients to be eligible for this study on sunitinib treatment for malignant GIST?")

In [36]:
from llama_index.core.response.notebook_utils import display_source_node

for node in retrieved_nodes:
    display_source_node(node, source_length=10000)

**Node ID:** 9d50f0d2-9388-4879-b58d-13b60c18900f<br>**Similarity:** 0.7527507095974909<br>**Text:** ",
"Primary outcome time frame": "Day 28 of each 6-week cycle : duration of double-blind treatment phase",
"Eligibility criteria": "Key Inclusion Criteria:
* Histologically-proven diagnosis of malignant GIST not amenable to surgery, radiation or combined modality treatment with curative intent
* Failed Gleevec treatment or intolerant to Gleevec therapy
Key Exclusion Criteria:
* Treatment with any chemotherapy, chemoembolization therapy, immunotherapy, or investigational agent since the last dose of Gleevec",
"Eligibility of healthy volunteer": "False",
"Eligibility sex": "ALL",
"Eligibility minimum age": "18 Years",
"Eligibility standard age": "['ADULT', 'OLDER_ADULT']",
"Pre-assignment details": "361 subjects randomized to double-blind treatment in 2:1 ratio (sunitinib vs. Placebo).
255 subjects continued on or crossed over to Open-label treatment.",
"Recruitment details": "Enrollment began (medical clinic) in December 2003. Study was unblinded on 27 January 2005 (end of Double-blind treatment). Subjects experiencing disease progression could crossover to Open-label treatment. Open-label data collection ended May 2008.",
"Recruitment group 0 id": "FG000",
"Recruitment group 0 title": "Sunitinib Double-Blind Treatment",
"Recruitment group 0 description": "Starting dose: 50 mg orally once daily as a single agent for 4 consecutive weeks followed by a 2-week off-treatment period to form a complete cycle of 6 weeks. (Schedule 4/2). Subjects received best supportive care in addition to the study treatment.",
"Recruitment group 1 id": "FG001",
"Recruitment group 1 title": "Placebo Double-Blind Treatment",
"Recruitment group 1 description": "Starting daily dose of 1 capsule, size- and color-matched to the sunitinib 50-mg capsule for 4 consecutive weeks followed by a 2-week off-treatment period to form a complete cycle of 6 weeks (Schedule 4/2). Subjects received best supportive care in addition to the study treatment. Subjects were provided the opportunity to receive open-label sunitinib at the time of confirmed disease progression or study unblinding.",
"Group IDs": "[{'groupIds': ['OG000', 'OG001'], 'groupDescription': 'The study was designed to test the null hypothesis that the median TTP from placebo treatment is 4 months versus the alternative hypothesis that the median TTP from sunitinib treatment is at least 6 months with an overall 2-sided significance level of 0.05 and power of 90%.', 'testedNonInferiority': False, 'nonInferiorityType': 'SUPERIORITY_OR_OTHER', 'pValue': '<0.001', 'pValueComment': "The nominal levels of significance for the interim and final analyses were determined at the time of the analyses using the Lan-DeMets procedure with an O'Brien-Fleming stopping rule.", 'statisticalMethod': 'Log Rank', 'statisticalComment': 'two-sided unstratified log-rank test', 'paramType': 'Hazard Ratio (HR)', 'paramValue': '0.329', 'ciPctValue': '95', 'ciLowerLimit': '0.233', 'ciUpperLimit': '0.466'}]",
"p-value": "<0.001",
"Statistical Method": "Log Rank",
"Limitations and caveats": "Duration of Tumor Response could not be reliably estimated at the time of analysis.",
"Has results": "True"<br>

**Node ID:** 399f4cda-c7df-4fcd-90cb-532515e7b62f<br>**Similarity:** 0.744039125087418<br>**Text:** "National Clinical Identification NCT ID": "NCT00075218",
"Organization study identification": "A6181004",
"EudraCT number": "",
"Organization": "Pfizer",
"Organization class": "INDUSTRY",
"Brief title": "A Study To Assess The Safety And Efficacy Of SU11248 In Patients With Gastrointestinal Stromal Tumor(GIST)",
"Official title": "A Phase III, Randomized, Double-Blind, Placebo-Controlled Study Of SU011248 In The Treatment Of Patients With Imatinib Mesylate (Gleevec Tm, Glivec)-Resistant Or Intolerant Malignant Gastrointestinal Stromal Tumor",
"Overall status": "COMPLETED",
"Start date": "2003-12",
"Primary completion date": "2008-05",
"Completion date": "2008-05",
"Verification date": "2009-08",
"Study first submitted date": "2004-01-06",
"Results first submitted date": "2009-05-06",
"Last update submitted date": "2009-08-31",
"Last update posted date": "2009-09-28",
"Lead sponsor": "Pfizer",
"Lead sponsor class": "INDUSTRY",
"Brief summary": "A study to assess the safety and efficacy of SU11248 in patients with gastrointestinal stromal tumor (GIST) whose disease has failed imatinib therapy or who were intolerant to imatinib treatment.",
"Detailed description": "oof, this data not available",
"Condition": "['Gastrointestinal Stromal Tumor']",
"Conditions keywords": "oof, this data not available",
"Study type": "INTERVENTIONAL",
"Phases": "['PHASE3']",
"Allocation": "RANDOMIZED",
"Intervention model": "CROSSOVER",
"Primary purpose": "TREATMENT",
"Masking": "DOUBLE",
"Who is masked": "['PARTICIPANT', 'INVESTIGATOR']",
"Enrollment count": "361",
"Enrollment type": "ACTUAL",
"Arms group 0 label": "B",
"Arms group 0 type": "PLACEBO_COMPARATOR",
"Arms group 0 description": "",
"Arms group 0 intervention names": "['Drug: Placebo']",
"Arms group 1 label": "A",
"Arms group 1 type": "ACTIVE_COMPARATOR",
"Arms group 1 description": "",
"Arms group 1 intervention names": "['Drug: SU011248']",
"Arms group 0 intervention type": "DRUG",
"Arms group 0 intervention name": "Placebo",
"Arms group 0 intervention description": "50 mg taken orally once a day. 6 week treatment cycle (Schedule 4/2) 4 weeks on study drug/2 weeks off study drug.",
"Arms group 0 intervention labels": "['B']",
"Arms group 1 intervention type": "DRUG",
"Arms group 1 intervention name": "SU011248",
"Arms group 1 intervention description": "50 mg taken orally once a day. 6 week treatment cycle (Schedule 4/2) 4 weeks on study drug/2 weeks off study drug.",
"Arms group 1 intervention labels": "['A']",
"Primary outcome": "Time to Tumor Progression (TTP) as Assessed by Imaging Studies at End of Double-blind Treatment Phase",
"Primary outcome description": "Time from randomization to first documentation of objective tumor progression based on the assessment of an independent, third-party imaging laboratory using RECIST (Response Evaluation Criteria in Solid Tumors).<br>

# Doing Retreival Evalution

In [37]:
from llama_index.core.evaluation import RetrieverEvaluator
import pandas as pd
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

In [42]:
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [43]:
def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
    )

    return metric_df

In [44]:
display_results("OpenAI Embedding Retriever", eval_results)

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,OpenAI Embedding Retriever,0.385714,0.328929


## Retreieval Evaluation Results

* Hit Rate:
The hit rate measures the fraction of queries for which at least one relevant document is retrieved within the top-k results. A higher hit rate indicates that the retrieval system is able to surface relevant information for more queries. However, the hit rate alone doesn't tell you how highly ranked the relevant documents are within the top-k.

* Mean Reciprocal Rank (MRR):
MRR takes into account the ranking of the relevant documents. It calculates the reciprocal of the rank of the first relevant document for each query, and then takes the mean across all queries. A higher MRR score (closer to 1) indicates that the relevant documents are ranked higher, on average.
MRR is a useful metric because it not only considers whether relevant documents are retrieved but also how highly they are ranked. In many applications, users are more likely to pay attention to the top few results, so having the relevant documents highly ranked is important.

In [41]:
sample_id, sample_query = list(qa_dataset.queries.items())[3]
sample_expected = qa_dataset.relevant_docs[sample_id]

eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

Query: Discuss the eligibility criteria for inclusion and exclusion of patients in the study comparing exemestane and tamoxifen therapy for adjuvant treatment of breast cancer. Why were patients with ER and PR negative primary tumors or unknown ER/PR status excluded from the study?
Metrics: {'mrr': 0.0, 'hit_rate': 0.0}



# Doing a Response Evaluation

In [46]:
queries = list(qa_dataset.queries.values())
queries

['What is the primary purpose of the study described in the context information?',
 'How many arms groups were involved in the study, and what were the interventions for each group?',
 'Explain the primary outcome of the study comparing adjuvant treatment with exemestane versus tamoxifen therapy in postmenopausal, receptor positive, node negative or node positive breast cancer patients. What is the time frame for assessing this outcome?',
 'Discuss the eligibility criteria for inclusion and exclusion of patients in the study comparing exemestane and tamoxifen therapy for adjuvant treatment of breast cancer. Why were patients with ER and PR negative primary tumors or unknown ER/PR status excluded from the study?',
 'What was the primary purpose of the randomized trial conducted by Pfizer in postmenopausal women with early breast cancer who had already received adjuvant tamoxifen for 2-3 years?',
 'Describe the primary outcome measured in the study, including how Disease-Free Survival (D

## Faithfulness Evaluation Metric

In [82]:
from llama_index.core.evaluation import FaithfulnessEvaluator
faithfulness_gpt4 = FaithfulnessEvaluator(llm=llm3)

In [48]:
eval_query = queries[0]

eval_query

'What is the primary purpose of the study described in the context information?'

In [49]:
response_vector = query_engine.query(eval_query)

In [50]:
eval_result = faithfulness_gpt4.evaluate_response(response=response_vector)

In [51]:
# check passing parameter in eval_result if it passed the evaluation.
eval_result.passing

True

In [56]:
from llama_index.core.evaluation import EvaluationResult
from llama_index.core import Response
pd.set_option("display.max_colwidth", 0)

def display_eval_df(response: Response, eval_result: EvaluationResult) -> None:
    if response.source_nodes == []:
        print("no response!")
        return
    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [57]:
display_eval_df(response_vector, eval_result)

Unnamed: 0,Response,Source,Evaluation Result,Reasoning
0,"The primary purpose of the first study is to compare patient satisfaction with the prefilled syringe (PFS) and the auto-injector (AI), two different delivery devices for etanercept after 12 weeks of use. The secondary evaluation focuses on identifying patient and device attributes associated with patient satisfaction. The primary purpose of the second study is not explicitly stated in the context. However, it involves a Phase 3, randomized, double-masked, 12-week, parallel group study in pediatric subjects with Glaucoma, using the drug Timolol. The study seems to be comparing the effects of different treatments for glaucoma in children and adults.","""National Clinical Identification NCT ID"": ""NCT00482170"", ""Organization study identification"": ""0881A6-3326"", ""EudraCT number"": """", ""Organization"": ""Pfizer"", ""Organization class"": ""INDUSTRY"", ""Brief title"": ""Compare Perceptions and Satisfaction for Two Different Delivery Mechanisms for Etanercept"", ""Official title"": ""A 3 Month, Randomised, Open Label, Parallel Group, Descriptive Study to Explore and Compare Perceptions and Satisfaction for Two Different Delivery Mechanisms for Etanercept (Etanercept Auto-injector and the Etanercept Prefilled Syringe) in Patients With Psoriasis."", ""Overall status"": ""COMPLETED"", ""Start date"": ""2007-09"", ""Primary completion date"": ""2009-04"", ""Completion date"": ""2009-09"", ""Verification date"": ""2012-03"", ""Study first submitted date"": ""2007-06-01"", ""Results first submitted date"": ""2012-03-01"", ""Last update submitted date"": ""2012-03-01"", ""Last update posted date"": ""2012-03-30"", ""Lead sponsor"": ""Pfizer"", ""Lead sponsor class"": ""INDUSTRY"", ""Brief summary"": ""Prim...",Pass,YES


In [59]:
from llama_index.core.evaluation import DatasetGenerator

question_generator = DatasetGenerator.from_documents(llama_documents)
eval_questions = question_generator.generate_questions_from_nodes(5)

eval_questions

  return cls(


ValueError: Metadata length (1605) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.

## Relevancy Evaluation Metric

In [83]:
from llama_index.core.evaluation import RelevancyEvaluator
relevancy_gpt4 = RelevancyEvaluator(llm=llm3)

In [63]:
query = queries[3]

query

'Discuss the eligibility criteria for inclusion and exclusion of patients in the study comparing exemestane and tamoxifen therapy for adjuvant treatment of breast cancer. Why were patients with ER and PR negative primary tumors or unknown ER/PR status excluded from the study?'

In [64]:
response_vector = query_engine.query(query)

# Relevancy evaluation
eval_result = relevancy_gpt4.evaluate_response(
    query=query, response=response_vector
)

In [65]:
eval_result.passing

True

In [67]:
def display_eval_df(
    query: str, response: Response, eval_result: EvaluationResult
) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [69]:
display_eval_df(query,response_vector, eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Result,Reasoning
0,Discuss the eligibility criteria for inclusion and exclusion of patients in the study comparing exemestane and tamoxifen therapy for adjuvant treatment of breast cancer. Why were patients with ER and PR negative primary tumors or unknown ER/PR status excluded from the study?,"The eligibility criteria for inclusion in the study required postmenopausal women with histologically or cytologically confirmed primary breast adenocarcinoma, who have been treated with tamoxifen continuously for between 2 and 3 years and one month, and are still free of disease. The exclusion criteria included patients with unresectable breast cancer and those with ER negative primary tumors. The study did not provide a specific reason for excluding patients with ER negative primary tumors or unknown ER/PR status. However, it's generally known that the effectiveness of both tamoxifen and exemestane is often dependent on the presence of hormone receptors such as estrogen receptors (ER) and progesterone receptors (PR). These drugs work by blocking these receptors or reducing their numbers, thereby slowing or stopping the growth of cancer cells. Therefore, patients with ER negative tumors, which do not have these receptors, or those with unknown ER/PR status, may not respond to these treatments, which could affect the results of the study.",""", ""Primary outcome time frame"": ""Baseline up to Month 36"", ""Eligibility criteria"": ""Inclusion Criteria: * postmenopausal women with histologically or cytologically confirmed primary breast adenocarcinoma, receiving tamoxifen and have been treated with tamoxifen continuously for between 2 and 3 years and one month, and still free of disease Exclusion Criteria: * unresectable breast cancer * ER negative primary tumor"", ""Eligibility of healthy volunteer"": ""False"", ""Eligibility sex"": ""FEMALE"", ""Eligibility minimum age"": ""30 Years"", ""Eligibility standard age"": ""['ADULT', 'OLDER_ADULT']"", ""Pre-assignment details"": ""Main study also included 3 sub-studies only for the purpose of tolerability assessment: endometrial status, bone metabolism and quality of life (QoL). Out of 4740 enrolled participants, data for 16 participants from a center were excluded since it was considered unreliable. Results are reported for remaining 4724 participants."", ""Recruitment details"": ""The publication describing ...",Pass,YES


In [84]:
import random
from llama_index.core.evaluation import BatchEvalRunner

llm35 = OpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
query_engine = vector_index.as_query_engine(llm=llm35)
#lmm35 = OpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

num_samples = 10
batch_eval_queries = random.sample(queries, num_samples)

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=8,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

In [85]:
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])

faithfulness_score

0.8

In [86]:
relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])

relevancy_score

0.5