In [1]:
import os
import openai
import sys


In [2]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key = os.environ['OPENAI_API_KEY_']


In [3]:
from llama_index.core import SimpleDirectoryReader

### Loading the Data from a pdf file from simple directory reader

In [4]:
documents = SimpleDirectoryReader(
    input_files=["data/Mind2Web.pdf"]
).load_data()

Seeing inside the structure of document

In [5]:
print(type(documents))
print(len(documents))
print(type(documents[0]))
print(len(documents[1].text))
print(documents[1])


<class 'list'>
24
<class 'llama_index.core.schema.Document'>
3463
Doc ID: 471ce9c8-43c5-4c4d-84fe-aef4ac21516c
Text: (a) Find one-way flights from New York to  Toronto. (b) Book a
roundtrip on July 1 from Mumbai to  London and vice versa on July 5
for two adults.  (c) Find a flight from Chicago to London on  20 April
and return on 23 April.  (d) Find Elon Musk's profile and follow,
start  notifications and like the latest tweet.  (e) Browse comedy
films stream...


Now we'll merge it into a single document as it helps with overall accuracy 

In [6]:
from llama_index.core import Document

In [7]:
text = [doc.text for doc in documents]

In [8]:
text = '\n\n'.join(text)

In [9]:
document = Document(text='\n\n'.join([doc.text for doc in documents]))

In [10]:
print(document)

Doc ID: d8141fb3-ab68-474a-9a84-2b5e334b1978
Text: MIND2W EB: Towards a Generalist Agent for the Web Xiang Deng∗Yu
Gu Boyuan Zheng Shijie Chen Samuel Stevens Boshi Wang Huan Sun∗Yu Su∗
The Ohio State University https://osu-nlp-group.github.io/Mind2Web
Abstract We introduce MIND2W EB, the first dataset for developing and
evaluating generalist agents for the web that can follow language
instructio...


## Indexing the Documents

Service Context contains both the LLM and Embedding Model that we are going to use

In [11]:
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.settings import Settings


In [12]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')
Settings.llm = llm
Settings.embed_model = embed_model

index = VectorStoreIndex.from_documents([document])

## Simple Case of Showing How to generate embeddings from embedding model

In [13]:
embedding1 = embed_model.get_text_embedding("Hello How are you")
embedding2 = embed_model.get_text_embedding("Hey What's up")
embedding3 = embed_model.get_text_embedding("Why are you here")

In [14]:
import numpy as np

In [15]:
print(np.dot(embedding1, embedding2))
print(np.dot(embedding1, embedding3))
print(np.dot(embedding3, embedding2))

0.9046455973572882
0.8097210593940989
0.805423429745634


## Query and Response

Next we create a query engine from this index that does allows us to send queries that do reterival and synthesis against this data

In [16]:
query_engine = index.as_query_engine()


In [17]:
response = query_engine.query(
    "What is the figure of their accuracy on unseen websites?"
)

print(str(response))

The accuracy on unseen websites is 38.9% / 39.6%.


## Sentence-Window Retrieval

SentenceWindowNodeParser object that split doc into chunks and augment each chunk with surrounding context

In [18]:
from llama_index.core.node_parser import SentenceWindowNodeParser

node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key='window',
    original_text_metadata_key='original_text'
)

#### How it works

In [19]:
text = 'hello. How are you? I am fine! Foo. Bar. Baz. What? Nyan! Ni ji chi san. Nyan. Arigato'

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [20]:
print([x.text for x in nodes])

['hello. ', 'How are you? ', 'I am fine! ', 'Foo. ', 'Bar. ', 'Baz. ', 'What? ', 'Nyan! ', 'Ni ji chi san. ', 'Nyan. ', 'Arigato']


In [21]:
print(nodes[5].metadata["window"])

I am fine!  Foo.  Bar.  Baz.  What?  Nyan!  Ni ji chi san. 


### Building the Index

In [22]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key='window',
    original_text_metadata_key='original_text'
)

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = node_parser

In [23]:
from llama_index.core import VectorStoreIndex

In [24]:
sentence_index = VectorStoreIndex.from_documents([document])

In [25]:
sentence_index.storage_context.persist(persist_dir='sentence_index')

In [None]:
# This block of code is optional to check
# if an index file exist, then it will load it
# if not, it will rebuild it

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index import load_index_from_storage

if not os.path.exists("sentence_index"):
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )

    sentence_index.storage_context.persist(persist_dir="sentence_index")
else:
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="sentence_index"),
        service_context=sentence_context
    )

## Building Preprocessor
#### MetaDataReplacementPreprocessor 
##### Takes data stored in the metadata window and replace that data with the node 

In [27]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

post_proc = MetadataReplacementPostProcessor(target_metadata_key='window')

### Working

In [28]:
from llama_index.core.schema import NodeWithScore
from copy import deepcopy

scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
nodes_old = [deepcopy(n) for n in nodes]

In [29]:
nodes_old[1].text

'How are you? '

In [30]:
replaced_nodes = post_proc.postprocess_nodes(scored_nodes)

In [31]:
print(replaced_nodes[1].text)

hello.  How are you?  I am fine!  Foo.  Bar. 


## Reranker
{It takes the query and the retreived chunks and reorders them on the basis of the relevance}

In [32]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank


In [33]:
rerank = SentenceTransformerRerank(top_n=2, model="cross-encoder/ms-marco-MiniLM-L-2-v2")

In [34]:
from llama_index.core import QueryBundle
from llama_index.core.schema import TextNode, NodeWithScore

In [35]:
query = QueryBundle('I want a dog')

scored_nodes = [
    NodeWithScore(node=TextNode(text='I want a cat'), score=0.8),
    NodeWithScore(node=TextNode(text='I want a dog'), score=0.4)
]

In [36]:
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)

In [37]:
print([(x.text, x.score) for x in reranked_nodes])

[('I want a dog', 7.7261), ('I want a cat', -3.164504)]


In [38]:
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[post_proc, rerank]
)

In [39]:
from llama_index.core.response.notebook_utils import display_response

In [40]:
query = "Describe the processing of the small language part in detail that is used in the first stage of pipeline in their methodology"

sentence_response = sentence_window_engine.query(query)
basic_response = query_engine.query(query)

display_response(basic_response)
display_response(sentence_response)

**`Final Response:`** In the first stage of the pipeline in their methodology, the small language model is utilized to rank the elements present on a webpage. This ranking task involves selecting the top-k candidate elements based on the task description, the snapshot of the webpage at a specific step, and the actions performed in the preceding steps. The small language model acts as a classifier, assigning scores to the candidate elements to determine the most promising ones for further processing.

**`Final Response:`** In the first stage of the pipeline, a fine-tuned small language model (LM) is utilized to rank the elements present on a webpage. This process involves treating candidate generation as a ranking task, where the goal is to select the top-k HTML elements based on the task description, the snapshot of the webpage at a specific step, and the actions performed in the preceding steps. The small LM is responsible for scoring and ranking these candidate elements, ultimately producing a small pool of promising candidates for further processing in the subsequent stage of the methodology.

### Putting all the code together

In [41]:
import os
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core import load_index_from_storage

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key='window',
    original_text_metadata_key='original_text'
)


def build_sentence_window_index(
    documents,
    llm,
    embed_model=embed_model,
    sentence_window_size=3,
    save_dir="sentence_index",
):
    # create the sentence window node parser w/ default settings
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents(
            documents, service_context=sentence_context
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    return sentence_index


def get_sentence_window_query_engine(
    sentence_index, similarity_top_k=6, rerank_top_n=2
):
    # define postprocessors
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="cross-encoder/ms-marco-MiniLM-L-2-v2"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine

In [42]:
from trulens_eval import Tru

tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [43]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=["data/Mind2Web.pdf"]
).load_data()

In [44]:
from llama_index.core import Document

document = Document(text='\n\n'.join([doc.text for doc in documents]))

In [45]:
print(document)

Doc ID: 4ef8e74b-4d60-4a54-9e36-865caded8ea1
Text: MIND2W EB: Towards a Generalist Agent for the Web Xiang Deng∗Yu
Gu Boyuan Zheng Shijie Chen Samuel Stevens Boshi Wang Huan Sun∗Yu Su∗
The Ohio State University https://osu-nlp-group.github.io/Mind2Web
Abstract We introduce MIND2W EB, the first dataset for developing and
evaluating generalist agents for the web that can follow language
instructio...


In [46]:
sentence_index = build_sentence_window_index(
    document,
    llm=llm,
    embed_model=embed_model,
    save_dir='sentence_index'
)

  sentence_context = ServiceContext.from_defaults(


In [47]:
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [48]:
output = sentence_window_engine.query(
    "How do you create AI portfolio?"
)
display_response(output)

**`Final Response:`** You can create an AI portfolio by developing and evaluating generalist agents for the web using datasets like MIND2W EB. Leveraging the power of large language models, such as in the case of MINDACT, can be effective in tackling tasks related to web understanding and action taking. Additionally, integrating multi-modal information, exploring reinforcement learning with feedback from real websites, and utilizing specialized language models for web-related tasks are promising directions to consider for building your AI portfolio.

### Evaluation using Truelens

In [49]:
import nest_asyncio
nest_asyncio.apply()

### Quesation Answer Relevence 

In [50]:
from trulens_eval  import OpenAI as fOpenAI

provider = fOpenAI(api_key=os.environ['OPENAI_API_KEY_'])

In [51]:
from trulens_eval import Feedback

f_qa_relevence = Feedback(
    provider.relevance_with_cot_reasons, name='Answer Relevencee'

).on_input().on_output()

✅ In Answer Relevencee, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevencee, input response will be set to __record__.main_output or `Select.RecordOutput` .


### Context-Relevance

In [52]:
import numpy as np

from trulens_eval import TruLlama

context_selection = TruLlama.select_source_nodes().node.text


In [53]:
f_qs_relevance = (
    Feedback(provider.qs_relevance_with_cot_reasons, name='Contextt Relevance').on_input().on(context_selection).aggregate(np.mean)
)

✅ In Contextt Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Contextt Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


### Groundedness

In [54]:
from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=provider)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ibnabeeali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name='Groundedness')
    .on(context_selection).on_output().aggregate(grounded.grounded_statements_aggregator)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


## Starting Evaluation

In [56]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

In [57]:
tru_recorder = TruLlama(
    sentence_window_engine,
    app_id='App_1',
    feedbacks=[
        f_qa_relevence, 
        f_qs_relevance,
        f_groundedness
    ]
)

In [58]:
eval_questions = []
with open('data/eval_questions.txt', 'r') as file:
    for line in file:
        # Remove newline character and convert to integer
        item = line.strip()
        eval_questions.append(item)

In [59]:
eval_questions

['What is the primary objective of MIND2WEB?',
 'How many tasks were collected in MIND2WEB, and from how many websites?',
 'What are the three necessary ingredients provided by MIND2WEB for building generalist web agents?',
 'How does MIND2WEB differ from existing datasets for web agents?',
 'What are the desiderata for a generalist agent for the web according to the paper?',
 'What challenges are associated with building an agent for the web, as discussed in the paper?',
 'What is MINDACT, and how does it address the challenges of using large language models (LLMs) for web agents?',
 'What are the key features of the MIND2WEB dataset?',
 'How does the data collection process for MIND2WEB differ from existing datasets?',
 'What are the unique research challenges presented by MIND2WEB for developing generalist agents for the web?',
 'What is the approach used for task demonstration in MIND2WEB?',
 'How many tasks were verified and retained in MIND2WEB after the data collection process?'

In [None]:
for question in eval_questions:
    with tru_recorder as recording:
        sentence_window_engine.query(question)

In [None]:
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()

In [62]:
import pandas as pd

pd.set_option("display.max_colwidth", None)
records[["input", "output"] + feedback]

Unnamed: 0,input,output,Answer Relevencee,Groundedness,Contextt Relevance
0,"""What is the primary objective of MIND2WEB?""","""The primary objective of MIND2WEB is for the agent to complete specific tasks on target websites through a series of actions, with each instance in the dataset containing task descriptions, action sequences, and webpage snapshots to facilitate the development of agents that can comprehend and carry out tasks in a more autonomous fashion.""",1.0,0.5,0.9
1,"""How many tasks were collected in MIND2WEB, and from how many websites?""","""In MIND2WEB, a total of 2,411 tasks were collected from 137 websites.""",1.0,0.8,0.85
2,"""What are the three necessary ingredients provided by MIND2WEB for building generalist web agents?""","""The three necessary ingredients provided by MIND2WEB for building generalist web agents are diverse domains, websites, and tasks; use of real-world websites instead of simulated and simplified ones; and a broad spectrum of user interaction patterns.""",1.0,1.0,0.85
3,"""How does MIND2WEB differ from existing datasets for web agents?""","""MIND2W EB differs from existing datasets for web agents by providing a diverse range of domains, websites, and tasks, the use of real-world websites instead of simulated ones, and a broad spectrum of user interaction patterns.""",0.9,1.0,0.9
4,"""What are the desiderata for a generalist agent for the web according to the paper?""","""The desiderata for a generalist agent for the web according to the paper are that it should be able to work on any website on the Internet and handle real-world websites that are dynamic, complex, and noisy.""",0.8,1.0,0.85
5,"""What challenges are associated with building an agent for the web, as discussed in the paper?""","""Challenges associated with building an agent for the web include the need to process long and highly structured documents effectively, the difficulty of planning and grounding when only high-level goals are available, the potential bias in data collection towards English-language websites primarily used in the U.S., the limitation of modeling the web environment using only textual context without considering visual information, the need to effectively model interaction dynamics and dynamic environment transformations, the lack of flexibility in human-agent interaction where users may want to adjust task requirements, the limitations of evaluating systems with cached offline environments leading to potential false negatives, and the importance of addressing safety considerations for real-world deployment such as managing sensitive actions and preventing security breaches.""",1.0,0.8,0.8
6,"""What is MINDACT, and how does it address the challenges of using large language models (LLMs) for web agents?""","""MINDACT is a two-stage model introduced to address the challenges of using large language models (LLMs) for web agents. The first stage of MINDACT involves using a fine-tuned small LM to filter the web elements, selecting a small pool of promising candidates. In the second stage, these candidate elements are consolidated to form a representative snippet of the webpage, which is then processed by an LLM to predict the final action, including predicting both the element for interaction and the corresponding operation. This approach allows for the efficient handling of the vast amount of elements present in HTML documents of real-world webpages, which would otherwise be infeasible or too costly to directly feed into an LLM's context.""",0.9,1.0,0.8
7,"""What are the key features of the MIND2WEB dataset?""","""The key features of the MIND2WEB dataset include diverse domains, websites, and tasks, the use of real-world websites instead of simulated ones, and a broad spectrum of user interaction patterns.""",0.8,1.0,0.85
8,"""How does the data collection process for MIND2WEB differ from existing datasets?""","""The data collection process for MIND2WEB differs from existing datasets in several ways. It covers a wide range of websites from various domains, totaling 137 websites from 31 domains, allowing for comprehensive testing of an agent's ability to generalize across diverse environments. Unlike previous studies, MIND2WEB utilizes real-world websites without manual simplification, resulting in environments with complexity that better reflects the intricacy of the modern web. Additionally, the dataset includes over 1,000 elements per page embedded within complex DOM structures, posing a significant challenge for modeling how to effectively process such long and highly structured documents. Lastly, MIND2WEB tasks are open-ended, exploring different functionalities of websites, unlike prior studies that primarily focus on testing the agent's ability to translate low-level instructions into actions.""",1.0,,0.8
9,"""What are the unique research challenges presented by MIND2WEB for developing generalist agents for the web?""","""The unique research challenges presented by MIND2WEB for developing generalist agents for the web include testing an agent's ability to generalize across varied environments spanning multiple websites and domains without manual simplification, dealing with the complexity of real-world websites with over 1,000 elements per page, and addressing the challenge of planning and grounding when only high-level goals are available instead of step-by-step directives.""",0.9,0.0,0.85


In [64]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevencee,Groundedness,Contextt Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
App_1,0.905,0.81,0.7925,3.1,0.0


In [None]:
tru.run_dashboard()