# Importing Libraries

In [None]:
from trulens.core import TruSession
from trulens.connectors.snowflake import SnowflakeConnector
from snowflake.snowpark.context import get_active_session

# Reuse the Snowpark session that the hosting notebook already holds.
snowpark_session = get_active_session()

# Wrap that session in a TruLens connector.
conn = SnowflakeConnector(snowpark_session=snowpark_session)

# Hand the connector to the TruLens session. A bare `TruSession()` never sees
# `conn`, so the connector built above would go unused.
session = TruSession(connector=conn)

# Create your custom ground-truth dataset and add it to TruLens


In [None]:

import pandas as pd

# Ground-truth chunks for the clinical-trial query.
# Each row is (text, title, expected_score); scores run from 0.95 (very high
# accuracy) down to 0.7 (moderate accuracy). The rows are expanded into
# dictionaries with the keys "text", "title" and "expected_score" below.
_chunk_rows = [
    (
        "The D3L-001 clinical trial is a Phase 2 study evaluating the safety and efficacy of a novel HER2-targeted monoclonal antibody in patients with HER2-positive advanced solid tumors, including metastatic breast cancer.",
        "D3L-001 Trial Overview",
        0.95,
    ),
    (
        "The primary outcome measure is progression-free survival (PFS) as assessed by radiographic imaging, with secondary outcomes including overall survival (OS) and quality of life (QoL) assessments.",
        "Outcome Measures",
        0.9,
    ),
    (
        "Inclusion criteria: Patients must have confirmed HER2-positive advanced solid tumors, including metastatic breast cancer, and must have experienced progression after at least one prior HER2-directed therapy. Exclusion criteria include history of hypersensitivity reactions to monoclonal antibodies.",
        "Eligibility Criteria",
        0.9,
    ),
    (
        "This clinical trial is being conducted across several sites in the United States, including major cancer centers in New York, California, and Texas. International locations include clinical research sites in Canada and Europe.",
        "Trial Locations",
        0.85,
    ),
    (
        "Patients participating in this trial are required to have normal organ function and an Eastern Cooperative Oncology Group (ECOG) performance status of 0 to 2.",
        "Additional Criteria",
        0.8,
    ),
    (
        "While some breast cancer clinical trials focus on earlier-stage disease or non-HER2-positive tumors, the D3L-001 trial specifically targets advanced, HER2-positive breast cancer, a subgroup of patients with a high unmet medical need.",
        "General Clinical Trial Information",
        0.7,
    ),
]

expected_chunks = [
    {"text": chunk_text, "title": chunk_title, "expected_score": chunk_score}
    for chunk_text, chunk_title, chunk_score in _chunk_rows
]

# One mock ground-truth record for a clinical-trial query. Every column is a
# one-element list, so the whole chunk list lands in a single cell.
data = dict(
    query=["What is the D3L-001 trial, and what outcomes does it aim to achieve?"],
    query_id=["1"],
    expected_response=["D3L-001 Trial Overview and Outcome Measures"],
    expected_chunks=[expected_chunks],
)

# Turn the record into a DataFrame
df = pd.DataFrame(data)

# Echo the frame for a quick visual check
print(df)


In [None]:
# Store the ground-truth frame under a named dataset, together with
# descriptive metadata (domain, source, trial type, language, version).
session.add_ground_truth_to_dataset(
    ground_truth_df=df,
    dataset_name="breast_cancer_clinical_trials_dataset",
    dataset_metadata=dict(
        domain="Breast Cancer Clinical Trials",
        data_source="Clinical Trial Data (D3L-001)",
        trial_type="HER2-positive Advanced Solid Tumors",
        language="English",
        version="1.0",
    ),
)


In [None]:
# Read the stored dataset back as a DataFrame, looked up by its name.
ground_truth_df = session.get_ground_truth(
    dataset_name="breast_cancer_clinical_trials_dataset",
)

### Inspect the dataframe below to see its columns and their values

In [None]:
# Bare expression: as the last statement of the cell it is shown as the cell output.
ground_truth_df

In [None]:
from snowflake.snowpark.context import get_active_session
from trulens.connectors.snowflake import SnowflakeConnector
from trulens.feedback import GroundTruthAgreement
from trulens.providers.cortex import Cortex

# Fetch the Snowpark session currently active in the notebook
snowpark_session = get_active_session()

# Build a TruLens connector on top of that session
conn = SnowflakeConnector(snowpark_session=snowpark_session)

# Cortex feedback provider bound to the same session and the mistral-large2 model
provider = Cortex(model_engine="mistral-large2", snowpark_session=snowpark_session)

print("Provider initialized successfully!")


In [None]:
# Bare expression: shows the ground-truth chunk list as the cell output.
expected_chunks

In [None]:
# Question being evaluated
query = "What is the D3L-001 trial, and what outcomes does it aim to achieve?"

# Hand-written retrieval result: three chunks whose text also appears in the
# ground-truth list. Long literals are wrapped with implicit concatenation;
# the resulting strings are unchanged.
retrieved_chunks = [
    (
        "The D3L-001 clinical trial is a Phase 2 study evaluating the safety and "
        "efficacy of a novel HER2-targeted monoclonal antibody in patients with "
        "HER2-positive advanced solid tumors, including metastatic breast cancer."
    ),
    (
        "This clinical trial is being conducted across several sites in the "
        "United States, including major cancer centers in New York, California, "
        "and Texas. International locations include clinical research sites in "
        "Canada and Europe."
    ),
    (
        "Inclusion criteria: Patients must have confirmed HER2-positive advanced "
        "solid tumors, including metastatic breast cancer, and must have "
        "experienced progression after at least one prior HER2-directed therapy. "
        "Exclusion criteria include history of hypersensitivity reactions to "
        "monoclonal antibodies."
    ),
]

retrieved_chunks


# Precision

In [None]:
# Precision of the retrieved chunks against the stored ground truth (no k given).
(
    GroundTruthAgreement(ground_truth_df, provider=provider)
    .precision_at_k(query, retrieved_chunks)
)

# Recall

In [None]:
# Recall of the retrieved chunks against the stored ground truth (no k given).
(
    GroundTruthAgreement(ground_truth_df, provider=provider)
    .recall_at_k(query, retrieved_chunks)
)

## What if we get one expected chunk and one unexpected chunk?

In [None]:
# Same question as before
query = "What is the D3L-001 trial, and what outcomes does it aim to achieve?"

# One chunk taken from the ground truth plus one unrelated chunk. Long literals
# are wrapped with implicit concatenation; the resulting strings are unchanged.
retrieved_chunks = [
    (
        "The D3L-001 clinical trial is a Phase 2 study evaluating the safety and "
        "efficacy of a novel HER2-targeted monoclonal antibody in patients with "
        "HER2-positive advanced solid tumors, including metastatic breast cancer."
    ),
    (
        "The beluga is adapted to life in the Arctic, with anatomical and "
        "physiological characteristics that differentiate it from other cetaceans."
    ),
]


## Precision

In [None]:
# Precision for the mixed (one expected, one unexpected) retrieval result.
(
    GroundTruthAgreement(ground_truth_df, provider=provider)
    .precision_at_k(query, retrieved_chunks)
)

## Precision@1

In [None]:
# Precision for the mixed retrieval result with k fixed to 1.
(
    GroundTruthAgreement(ground_truth_df, provider=provider)
    .precision_at_k(query, retrieved_chunks, k=1)
)

## Recall

In [None]:
# Recall for the mixed (one expected, one unexpected) retrieval result.
(
    GroundTruthAgreement(ground_truth_df, provider=provider)
    .recall_at_k(query, retrieved_chunks)
)

# Retrieval system

In [None]:
-- SQL cell: describe the Cortex Search service named CC_SEARCH_SERVICE_CS.
DESCRIBE CORTEX SEARCH SERVICE CC_SEARCH_SERVICE_CS

In [None]:
import os
from snowflake.core import Root
from typing import List
from trulens.apps.custom import instrument

from snowflake.snowpark.session import Session

class CortexSearchRetriever:
    """Fetch text chunks from a Snowflake Cortex Search service.

    Args:
        snowpark_session: Snowpark session used to reach the service.
        limit_to_retrieve: Maximum number of results requested per query.
        database: Name of the database that holds the search service.
        schema: Name of the schema that holds the search service.
        service: Name of the Cortex Search service to query.
        column: Result column whose value is returned for each hit.

    The keyword-only arguments default to the values that were previously
    hard-coded, so existing two-argument construction behaves as before.
    """

    def __init__(
        self,
        snowpark_session: Session,
        limit_to_retrieve: int = 3,
        *,
        database: str = "breastcareTrials_db",
        schema: str = "breastcare_schema",
        service: str = "CC_SEARCH_SERVICE_CS",
        column: str = "CHUNK",
    ):
        self._snowpark_session = snowpark_session
        self._limit_to_retrieve = limit_to_retrieve
        self._database = database
        self._schema = schema
        self._service = service
        self._column = column

    @instrument
    def retrieve(self, query: str) -> List[str]:
        """Return the configured column of each search hit for ``query``.

        Args:
            query: Free-text search query.

        Returns:
            One value per hit, in the order the service returned them; an
            empty list when the response carries no results.
        """
        # The Root handle is built per call, exactly as before, so the object
        # keeps only the plain session and string settings as attributes.
        root = Root(self._snowpark_session)
        cortex_search_service = (
            root.databases[self._database]
            .schemas[self._schema]
            .cortex_search_services[self._service]
        )
        resp = cortex_search_service.search(
            query=query,
            columns=[self._column],
            limit=self._limit_to_retrieve,
        )

        # A falsy `results` (None or empty) yields an empty list.
        return [hit[self._column] for hit in (resp.results or [])]

In [None]:
# Build a retriever that asks for at most three results per query
retriever = CortexSearchRetriever(
    snowpark_session=snowpark_session,
    limit_to_retrieve=3,
)

# Run the sample question through it and keep the returned chunks
retrieved_context = retriever.retrieve(
    query="What is the D3L-001 trial, and what outcomes does it aim to achieve?",
)

retrieved_context

# Add in TruLens Evals

In [None]:
from trulens.providers.cortex.provider import Cortex
from trulens.core import Feedback
from trulens.core import Select
from snowflake.snowpark.context import get_active_session
from trulens.connectors.snowflake import SnowflakeConnector
import numpy as np

# Fetch the Snowpark session currently active in the notebook
snowpark_session = get_active_session()

# Build a TruLens connector on top of that session
conn = SnowflakeConnector(snowpark_session=snowpark_session)

# Cortex provider bound to the session and the mistral-large2 model
provider = Cortex(model_engine="mistral-large2", snowpark_session=snowpark_session)

# Context-relevance feedback: wired to the app's input and output, with the
# resulting scores averaged by np.mean.
_context_relevance = Feedback(provider.context_relevance, name="Context Relevance")
f_context_relevance = _context_relevance.on_input_output().aggregate(np.mean)

print("Cortex provider and feedback initialized successfully!")


In [None]:
# Call the feedback function directly with the sample question and the
# chunks held in `retrieved_context`.
f_context_relevance(
    "What is the D3L-001 trial, and what outcomes does it aim to achieve?",
    retrieved_context,
)

# Register an app

In [None]:
from trulens.apps.custom import TruCustomApp

# Wrap the retriever in a TruLens app (name and version label below) and
# attach the context-relevance feedback to it.
tru_retriever = TruCustomApp(
    retriever,
    feedbacks=[f_context_relevance],
    app_name="BreastCareTrial Retriever",
    app_version="base",
)

In [None]:
# Run one retrieval inside the TruLens recording context
with tru_retriever as recording:
    retrieved_documents = retriever.retrieve(
        "What is the D3L-001 trial, and what outcomes does it aim to achieve?"
    )

retrieved_documents

In [None]:
# Ask the TruLens session for its leaderboard; as the last expression of the
# cell, the result is shown as the cell output.
session.get_leaderboard()

In [None]:
from trulens.apps.custom import instrument
from snowflake.cortex import Complete


class RAG:
    """Minimal retrieve-then-generate pipeline.

    Args:
        retriever: Object exposing ``retrieve(query)``. When omitted, a
            ``CortexSearchRetriever`` is built from the module-level
            ``snowpark_session`` with a limit of 3, as before.
        model: Model name passed to ``Complete``; defaults to the previously
            hard-coded ``"mistral-large2"``.
    """

    def __init__(self, retriever=None, model: str = "mistral-large2"):
        if retriever is None:
            retriever = CortexSearchRetriever(
                snowpark_session=snowpark_session, limit_to_retrieve=3
            )
        self.retriever = retriever
        self.model = model

    @instrument
    def retrieve_context(self, query: str) -> list:
        """
        Retrieve relevant text for ``query`` by delegating to the retriever.
        """
        return self.retriever.retrieve(query)

    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Generate an answer to ``query`` from the retrieved context.

        ``context_str`` is the list returned by ``retrieve_context``; it is
        interpolated into the prompt as-is.
        """
        prompt = f"""
        You are an intelligent assistant specialized in breast cancer clinical trials.
        Your responses should focus on trial information, eligibility requirements, and next steps.
        Context: {context_str}
          Question:
          {query}
          Answer:
        """
        return Complete(self.model, prompt)

    @instrument
    def query(self, query: str) -> str:
        """Retrieve context for ``query`` and return the generated answer."""
        context_str = self.retrieve_context(query)
        return self.generate_completion(query, context_str)


rag = RAG()

In [None]:
from trulens.providers.cortex.provider import Cortex
from snowflake.snowpark.context import get_active_session
from trulens.connectors.snowflake import SnowflakeConnector
from trulens.core import Feedback
from trulens.core import Select
from functools import partial
import numpy as np

# Fetch the active Snowpark session (a Session object, which Cortex expects)
snowpark_session = get_active_session()

# Build a TruLens connector on top of that session
conn = SnowflakeConnector(snowpark_session=snowpark_session)

# Pass the session itself, not the connector, to Cortex. Keyword arguments
# match the earlier provider cells and do not depend on parameter order.
provider = Cortex(snowpark_session=snowpark_session, model_engine="mistral-large2")

# Feedbacks

# Selector for the values returned by the instrumented `retrieve_context` call
_context_chunks = Select.RecordCalls.retrieve_context.rets[:]

# Groundedness: app output checked against the collected retrieved chunks
_groundedness_fn = partial(
    provider.groundedness_measure_with_cot_reasons,
    use_sent_tokenize=False,
)
f_groundedness = (
    Feedback(_groundedness_fn, name="Groundedness")
    .on(_context_chunks.collect())
    .on_output()
)

# Context relevance: app input against each retrieved chunk, averaged
_context_relevance = Feedback(provider.context_relevance, name="Context Relevance")
f_context_relevance = (
    _context_relevance
    .on_input()
    .on(_context_chunks)
    .aggregate(np.mean)
)

# Answer relevance: app input against app output, averaged
_answer_relevance = Feedback(provider.relevance, name="Answer Relevance")
f_answer_relevance = (
    _answer_relevance
    .on_input()
    .on_output()
    .aggregate(np.mean)
)


In [None]:
from trulens.apps.custom import TruCustomApp

# Wrap the RAG pipeline in a TruLens app (name and version label below) and
# attach the three feedback functions to it.
tru_rag = TruCustomApp(
    rag,
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
    app_name="BreastCareTrial",
    app_version="simple",
)

In [None]:
# Run one end-to-end query inside the TruLens recording context
with tru_rag as recording:
    response = rag.query(
        "What is the D3L-001 trial, and what outcomes does it aim to achieve?"
    )

response

In [None]:
# Ask the TruLens session for its leaderboard again; as the last expression
# of the cell, the result is shown as the cell output.
session.get_leaderboard()