In [11]:
import os
import openai
import sys


In [12]:
from dotenv import load_dotenv, find_dotenv
# Presumably loads the variables of the discovered .env file into the process
# environment (python-dotenv); the return value is deliberately discarded.
env_file = find_dotenv()
_ = load_dotenv(env_file)

# Indexing os.environ raises KeyError right here when the variable is absent.
# NOTE(review): the variable name ends with an underscore — confirm it matches
# the entry in the .env file.
openai.api_key = os.environ['OPENAI_API_KEY_']


In [13]:
from llama_index.core import SimpleDirectoryReader

### Loading the data from a PDF file with SimpleDirectoryReader

In [14]:
# Read the Mind2Web paper from disk; the result is a list of Document objects
# (see the inspection cell that follows).
documents = SimpleDirectoryReader(input_files=["data/Mind2Web.pdf"]).load_data()

Inspecting the structure of the loaded documents

In [15]:
# One value per line: container type, number of documents, element type,
# character count of the second document, and that document's preview.
print(
    type(documents),
    len(documents),
    type(documents[0]),
    len(documents[1].text),
    documents[1],
    sep='\n',
)


<class 'list'>
24
<class 'llama_index.core.schema.Document'>
3463
Doc ID: 377819e9-66f0-4b68-8fb0-42e8561eec81
Text: (a) Find one-way flights from New York to  Toronto. (b) Book a
roundtrip on July 1 from Mumbai to  London and vice versa on July 5
for two adults.  (c) Find a flight from Chicago to London on  20 April
and return on 23 April.  (d) Find Elon Musk's profile and follow,
start  notifications and like the latest tweet.  (e) Browse comedy
films stream...


Now we'll merge them into a single document, as it helps with overall accuracy.

In [16]:
from llama_index.core import Document

In [17]:
# Collect the raw text of every loaded Document into a list.
text = [page.text for page in documents]

In [18]:
# Join the document texts straight from `documents` rather than from the
# previous value of `text`: re-running `text = '\n\n'.join(text)` on an
# already-joined string would splice '\n\n' between every character.
text = '\n\n'.join(doc.text for doc in documents)

In [19]:
# Wrap the concatenated text of all loaded documents (blank line between
# consecutive ones) in a single Document.
document = Document(
    text='\n\n'.join(doc.text for doc in documents)
)

In [20]:
# Show the merged document's ID and a truncated preview of its text.
print(document)

Doc ID: 520cc211-bb45-4403-beb4-f3ac891e4f54
Text: MIND2W EB: Towards a Generalist Agent for the Web Xiang Deng∗Yu
Gu Boyuan Zheng Shijie Chen Samuel Stevens Boshi Wang Huan Sun∗Yu Su∗
The Ohio State University https://osu-nlp-group.github.io/Mind2Web
Abstract We introduce MIND2W EB, the first dataset for developing and
evaluating generalist agents for the web that can follow language
instructio...


## Indexing the Documents

The global `Settings` object holds both the LLM and the embedding model that we are going to use

In [21]:
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.settings import Settings


In [22]:
# Chat model, registered as the global default.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
Settings.llm = llm

# Embedding model, registered as the global default.
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')
Settings.embed_model = embed_model

# Baseline vector index over the single merged document.
index = VectorStoreIndex.from_documents([document])

## Simple example of generating embeddings with the embedding model

In [36]:
# Embed three short sentences, in this order, for the similarity check below.
embedding1, embedding2, embedding3 = (
    embed_model.get_text_embedding(sentence)
    for sentence in ("Hello How are you", "Hey What's up", "Why are you here")
)

In [37]:
import numpy as np

In [38]:
# Dot products of the sentence pairs (1, 2), (1, 3) and (3, 2), one per line.
print(
    np.dot(embedding1, embedding2),
    np.dot(embedding1, embedding3),
    np.dot(embedding3, embedding2),
    sep='\n',
)

0.9046455973572882
0.8097291094090971
0.8054277595815253


## Query and Response

Next we create a query engine from this index, which allows us to send queries that perform retrieval and synthesis against this data

In [23]:
# Query engine over the baseline index, created with no extra arguments.
query_engine = index.as_query_engine()


In [24]:
# Ask the baseline engine a factual question about the paper.
response = query_engine.query("What is the figure of their accuracy on unseen websites?")

# print() applies str() to the response object itself.
print(response)

The accuracy on unseen websites is 38.9% / 39.6%.


## Sentence-Window Retrieval

The SentenceWindowNodeParser splits a document into sentence-level chunks and augments each chunk with its surrounding context

In [40]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Sentence-level parser: each node's neighbouring sentences are stored under
# the 'window' metadata key (the demo below shows three on each side for
# window_size=3). The 'original_text' key presumably holds the bare sentence —
# confirm against the parser documentation.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key='original_text',
    window_metadata_key='window',
    window_size=3,
)

#### How it works

In [41]:
# Toy input made of short sentences so the splitting is easy to follow.
text = 'hello. How are you? I am fine! Foo. Bar. Baz. What? Nyan! Ni ji chi san. Nyan. Arigato'

# Wrap the string in a Document and run it through the sentence-window parser.
toy_document = Document(text=text)
nodes = node_parser.get_nodes_from_documents([toy_document])

In [42]:
# List the text of every parsed node, in order.
print([node.text for node in nodes])

['hello. ', 'How are you? ', 'I am fine! ', 'Foo. ', 'Bar. ', 'Baz. ', 'What? ', 'Nyan! ', 'Ni ji chi san. ', 'Nyan. ', 'Arigato']


In [43]:
# Print the context window stored in the metadata of node 5 ('Baz. ').
print(nodes[5].metadata["window"])

I am fine!  Foo.  Bar.  Baz.  What?  Nyan!  Ni ji chi san. 


### Building the Index

In [44]:
# Sentence-window parser with the same configuration as in the demo above,
# registered as the global node parser.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key='original_text',
    window_metadata_key='window',
    window_size=3,
)
Settings.node_parser = node_parser

# Chat model, registered as the global default.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
Settings.llm = llm

# Embedding model, registered as the global default.
embed_model = OpenAIEmbedding(model='text-embedding-ada-002')
Settings.embed_model = embed_model

In [45]:
from llama_index.core import VectorStoreIndex

In [46]:
# Build a second vector index from the merged document.
# NOTE(review): this presumably picks up the sentence-window node parser set on
# Settings in the previous cell — confirm.
sentence_index = VectorStoreIndex.from_documents([document])

In [47]:
# Persist the index's storage context under the local 'sentence_index' path.
sentence_index.storage_context.persist(persist_dir='sentence_index')

In [None]:
# Optional: reuse a previously persisted index instead of rebuilding it.
# If the "sentence_index" path exists, the index is loaded from it;
# otherwise the index is built from `document` and persisted there.
# No service context is passed: the LLM, embedding model and node parser are
# the ones registered on the global `Settings` earlier in this notebook.

import os
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage

if not os.path.exists("sentence_index"):
    sentence_index = VectorStoreIndex.from_documents([document])
    sentence_index.storage_context.persist(persist_dir="sentence_index")
else:
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="sentence_index")
    )

## Building the Postprocessor
#### MetadataReplacementPostProcessor
##### Replaces each node's text with the data stored under its `window` metadata key

In [48]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor keyed on the 'window' metadata entry; the demo below shows it
# swapping a node's text for that entry.
post_proc = MetadataReplacementPostProcessor(target_metadata_key='window')

### Working

In [49]:
from llama_index.core.schema import NodeWithScore
from copy import deepcopy

# Give every toy node the same score so it can go through the post-processor.
scored_nodes = [NodeWithScore(node=node, score=1.0) for node in nodes]

# Independent deep copies of the nodes, kept for before/after comparison.
nodes_old = list(map(deepcopy, nodes))

In [50]:
# Text of node 1 in the deep-copied list, shown for comparison with the
# post-processed version below.
nodes_old[1].text

'How are you? '

In [52]:
# Run the metadata-replacement post-processor over the scored toy nodes.
# NOTE(review): the deep copy taken earlier suggests the nodes are modified in
# place — confirm against the post-processor implementation.
replaced_nodes = post_proc.postprocess_nodes(scored_nodes)

In [53]:
# Node 1 after post-processing: its text is now the surrounding window.
print(replaced_nodes[1].text)

hello.  How are you?  I am fine!  Foo.  Bar. 


## Reranker
It takes the query and the retrieved chunks and reorders them on the basis of their relevance.

In [54]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank


In [57]:
# Reranker built on a cross-encoder model, configured with top_n=2.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
    top_n=2,
)

In [58]:
from llama_index.core import QueryBundle
from llama_index.core.schema import TextNode, NodeWithScore

In [60]:
# Toy query for the reranker demo.
query = QueryBundle('I want a dog')

# Two candidates whose initial scores deliberately favour the wrong sentence.
scored_nodes = [
    NodeWithScore(node=TextNode(text=sentence), score=initial_score)
    for sentence, initial_score in (('I want a cat', 0.8), ('I want a dog', 0.4))
]

In [61]:
# Rescore the two toy candidates against the query.
reranked_nodes = rerank.postprocess_nodes(scored_nodes, query_bundle=query)

In [64]:
# (text, score) for every reranked candidate, in the order returned.
print([(candidate.text, candidate.score) for candidate in reranked_nodes])

[('I want a dog', 7.7261), ('I want a cat', -3.164504)]


In [65]:
# Query engine over the sentence-window index, with similarity_top_k=6 and the
# window-replacement and rerank post-processors listed in that order.
sentence_window_engine = sentence_index.as_query_engine(
    node_postprocessors=[post_proc, rerank],
    similarity_top_k=6,
)

In [74]:
from llama_index.core.response.notebook_utils import display_response

In [75]:
# One question, answered by both engines for a side-by-side comparison.
query = "Describe the processing of the small language part in detail that is used in the first stage of pipeline in their methodology"

basic_response = query_engine.query(query)
sentence_response = sentence_window_engine.query(query)

# Baseline answer first, sentence-window answer second.
display_response(basic_response)
display_response(sentence_response)

**`Final Response:`** The small language model is utilized in the first stage of the pipeline for candidate generation. It is employed to rank the elements present on a webpage based on the task description, the snapshot of the webpage at a specific step, and the actions performed in the preceding steps. This ranking task aims to select the top-k candidate elements that are most relevant to the task at hand. The small language model acts as a classifier, assigning scores to the candidate elements to determine their relevance and potential for further processing in the pipeline.

**`Final Response:`** In the first stage of the methodology's pipeline, a fine-tuned small language model (LM) is utilized to rank the elements found on a webpage. This process involves selecting the top-k HTML elements based on the task description, the snapshot of the webpage at a specific step, and the actions performed in the preceding steps. The small LM acts as a classifier to score and rank the candidate elements, ultimately generating a small pool of promising candidates for further processing in the subsequent stage of the methodology.