In [1]:
import os
import openai
openai.api_key = os.environ['OPENAI_API_KEY_']

In [2]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=['data/Mind2Web.pdf']
).load_data()

In [3]:
# Merge into a single document for better retrieval
from llama_index.core import Document

document = Document(
    text='\n\n'.join([doc.text for doc in documents])
)

### Auto-Merge Retriever needs Hierarchical Parser

In [4]:
from llama_index.core.node_parser import HierarchicalNodeParser

# create the hierarchical node parser w/ default settings
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128]
)

In [5]:
#contains all nodes parent intermediate and leaf nodes
nodes = node_parser.get_nodes_from_documents([document])

### Need only leaf nodes for making index

In [6]:
from llama_index.core.node_parser import get_leaf_nodes

leaf_nodes = get_leaf_nodes(nodes)
print(leaf_nodes[30].text)

Each instance in our dataset contains three components:
Task description, which outlines the high-level goal of the task. We intentionally avoid low-level,
step-by-step instructions, aiming to foster the development of agents that can comprehend and carry
out tasks in a more autonomous fashion, rather than merely following prescriptive directives.
Action sequence , which is the sequence of actions required to accomplish the task on the website.
Each action in the sequence comprises a (Target Element, Operation) pair. The Target
Element is an interactable element on the current webpage, and the Operation refers to the action
to be executed on that element.


### Printing Parent Node

In [7]:
nodes_by_id = {node.node_id: node for node in nodes}

parent_node = nodes_by_id[leaf_nodes[30].parent_node.node_id]
print(parent_node.text)

To achieve this, our approach for data collection adheres
to the following principles. Firstly, instead of recreating websites in simulation, which often leads to
oversimplified environments, we engage directly with real-world websites and capture snapshots of
these environments. Secondly, we collate a diverse set of websites from varied domains and crowd-
source realistic tasks that cover a wide range of functionalities provided by these websites. Finally,
acknowledging the challenge of perfectly replicating the complexity of real-world environments, we
strive to capture a comprehensive snapshot of each website and the full interaction trace, to the extent
that all the tasks can be seamlessly replayed offline. This supports rich modeling and evaluation
approaches, ensuring a robust and practical dataset for research.
2.1 Task Definition
The primary objective of MIND2W EBis for the agent to complete a specific task on the target website
through a series of actions. Each instance in our

### Building an Index

In [8]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import ServiceContext
from llama_index.core.settings import Settings


In [9]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = node_parser


# auto_merging_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embed_model,
#     node_parser=node_parser,
# )

storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

automerging_index = VectorStoreIndex(
    leaf_nodes, storage_context=storage_context
)

automerging_index.storage_context.persist(persist_dir="./merging_index")


In [None]:
# This block of code is optional to check
# if an index file exist, then it will load it
# if not, it will rebuild it

import os
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core import load_index_from_storage

if not os.path.exists("./merging_index"):
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
            leaf_nodes,
            storage_context=storage_context,
            service_context=auto_merging_context
        )

    automerging_index.storage_context.persist(persist_dir="./merging_index")
else:
    automerging_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./merging_index"),
        service_context=auto_merging_context
    )

In [10]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

automerging_retriever = automerging_index.as_retriever(
    similarity_top_k=12
)

retriever = AutoMergingRetriever(
    automerging_retriever, 
    automerging_index.storage_context, 
    verbose=True
)

rerank = SentenceTransformerRerank(top_n=6, model="cross-encoder/ms-marco-MiniLM-L-2-v2")

auto_merging_engine = RetrieverQueryEngine.from_args(
    automerging_retriever, node_postprocessors=[rerank]
)

In [11]:
auto_merging_response = auto_merging_engine.query(
    'What is the accuracy of the mini2web methodology on unseen websites? And if 38.9% / 39.6% then why is it out of 39.6 and not out of 100'
)

In [12]:
from llama_index.core.response.notebook_utils import display_response

In [13]:
display_response(auto_merging_response)

**`Final Response:`** The accuracy of the mini2web methodology on unseen websites is 38.9% / 39.6%. It is expressed as a percentage out of 39.6 and not out of 100 because the evaluation metric used in this context is the step success rate, which is specifically calculated based on the number of successful steps taken by the model in completing a task.

## Putting it all together

In [20]:
import os

from llama_index.core import (
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.settings import Settings

def build_automerging_index(
    documents,
    llm,
    embed_model,
    save_dir="merging_index",
    chunk_sizes=None,
):
    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    Settings.llm = llm
    Settings.embed_model = embed_model
    Settings.node_parser = node_parser

    # merging_context = ServiceContext.from_defaults(
    #     llm=llm,
    #     embed_model=embed_model,
    # )


    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    if not os.path.exists(save_dir):
        automerging_index = VectorStoreIndex(
            leaf_nodes, storage_context=storage_context
            )
        automerging_index.storage_context.persist(persist_dir=save_dir)
    else:
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
        )
    return automerging_index


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
):
    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    retriever = AutoMergingRetriever(
        base_retriever, automerging_index.storage_context, verbose=True
    )
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="cross-encoder/ms-marco-MiniLM-L-2-v2"
    )
    auto_merging_engine = RetrieverQueryEngine.from_args(
        retriever, node_postprocessors=[rerank]
    )
    return auto_merging_engine

In [21]:
from llama_index.llms.openai import OpenAI

index = build_automerging_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model=OpenAIEmbedding(model="text-embedding-ada-002"),
    save_dir="merging_index",
)

In [22]:
query_engine = get_automerging_query_engine(index, similarity_top_k=6)

## Trulens Evaluation

In [23]:
from trulens_eval import Tru

Tru().reset_database()

ðŸ¦‘ Tru initialized with db url sqlite:///default.sqlite .
ðŸ›‘ Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [24]:
auto_merging_index_0 = build_automerging_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model=OpenAIEmbedding(model="text-embedding-ada-002"),
    save_dir="merging_index_0",
    chunk_sizes=[2048,512],
)

In [25]:
auto_merging_engine_0 = get_automerging_query_engine(
    auto_merging_index_0,
    similarity_top_k=12,
    rerank_top_n=6,
)

In [30]:
from trulens_eval  import OpenAI as fOpenAI

provider = fOpenAI(api_key=os.environ['OPENAI_API_KEY_'])

In [32]:
import numpy as np

from trulens_eval import TruLlama
from trulens_eval import Feedback

context_selection = TruLlama.select_source_nodes().node.text

f_qs_relevance = (
    Feedback(provider.qs_relevance_with_cot_reasons, name='Contextt Relevance').on_input().on(context_selection).aggregate(np.mean)
)

âœ… In Contextt Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
âœ… In Contextt Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [31]:
from trulens_eval import Feedback

f_qa_relevence = Feedback(
    provider.relevance_with_cot_reasons, name='Answer Relevencee'

).on_input().on_output()

âœ… In Answer Relevencee, input prompt will be set to __record__.main_input or `Select.RecordInput` .
âœ… In Answer Relevencee, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [33]:
from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=provider)

f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name='Groundedness')
    .on(context_selection).on_output().aggregate(grounded.grounded_statements_aggregator)
)

âœ… In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
âœ… In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ibnabeeali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
from trulens_eval import TruLlama
from trulens_eval import Feedback

In [34]:

qa_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

qs_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

#grounded = Groundedness(groundedness_provider=openai, summarize_provider=openai)
grounded = Groundedness(groundedness_provider=openai)

groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
)

feedbacks = [qa_relevance, qs_relevance, groundedness]

def get_prebuilt_trulens_recorder(query_engine, app_id):
    tru_recorder = TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=feedbacks
        )
    return tru_recorder

AttributeError: module 'openai' has no attribute 'relevance_with_cot_reasons'

In [26]:
from utils import get_prebuilt_trulens_recorder

tru_recorder = get_prebuilt_trulens_recorder(
    auto_merging_engine_0,
    app_id ='app_0'
)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
eval_questions = []
with open('generated_questions.text', 'r') as file:
    for line in file:
        # Remove newline character and convert to integer
        item = line.strip()
        eval_questions.append(item)

In [None]:
def run_evals(eval_questions, tru_recorder, query_engine):
    for question in eval_questions:
        with tru_recorder as recording:
            response = query_engine.query(question)

In [None]:
run_evals(eval_questions, tru_recorder, auto_merging_engine_0)

In [None]:
from trulens_eval import Tru

Tru().get_leaderboard(app_ids=[])

In [None]:
Tru().run_dashboard()