In [1]:
import os
import openai
import sys


In [2]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
dotenv_file = find_dotenv()
_ = load_dotenv(dotenv_file)

# Hand the key to the openai client. A missing variable raises KeyError here.
# NOTE(review): the variable name ends with an underscore (OPENAI_API_KEY_) —
# confirm this matches the name used in the .env file.
openai.api_key = os.environ['OPENAI_API_KEY_']


In [4]:
from llama_index.core import SimpleDirectoryReader

### Loading the data from a PDF file with `SimpleDirectoryReader`

In [53]:
# Point the reader at the single PDF, then parse it into a list of Document objects.
pdf_reader = SimpleDirectoryReader(input_files=["data/Mind2Web.pdf"])
documents = pdf_reader.load_data()

Inspecting the structure of the loaded documents

In [57]:
# Container type and number of loaded documents.
print(type(documents), len(documents), sep="\n")

# Type of a single entry.
print(type(documents[0]))

# Character count of the second entry's text, followed by its string summary.
print(len(documents[1].text), documents[1], sep="\n")


<class 'list'>
24
<class 'llama_index.core.schema.Document'>
3463
Doc ID: 0bc95cf3-05be-40ac-8624-6b05fcfeb0a2
Text: (a) Find one-way flights from New York to  Toronto. (b) Book a
roundtrip on July 1 from Mumbai to  London and vice versa on July 5
for two adults.  (c) Find a flight from Chicago to London on  20 April
and return on 23 April.  (d) Find Elon Musk's profile and follow,
start  notifications and like the latest tweet.  (e) Browse comedy
films stream...


Now we'll merge all the loaded documents into a single document, as this helps with overall accuracy.

In [60]:
from llama_index.core import Document

In [61]:
# Collect the raw text of every loaded document, in order.
text = [loaded.text for loaded in documents]

In [62]:
# Merge every document's text into one string, separated by blank lines.
# The string is built directly from `documents` rather than from the previous
# value of `text`, so the cell is idempotent: re-running it no longer joins
# the individual characters of an already-merged string.
text = '\n\n'.join(doc.text for doc in documents)

In [63]:
# Wrap the text of all loaded documents, separated by blank lines, in one Document.
document = Document(
    text='\n\n'.join(part.text for part in documents)
)

In [64]:
# Print the merged document's string form (its ID plus a truncated text preview).
print(document)

Doc ID: 89dad68f-2411-4a03-a405-7371c50e9434
Text: MIND2W EB: Towards a Generalist Agent for the Web Xiang Deng∗Yu
Gu Boyuan Zheng Shijie Chen Samuel Stevens Boshi Wang Huan Sun∗Yu Su∗
The Ohio State University https://osu-nlp-group.github.io/Mind2Web
Abstract We introduce MIND2W EB, the first dataset for developing and
evaluating generalist agents for the web that can follow language
instructio...


## Indexing the Documents

The global `Settings` object (the replacement for the older `ServiceContext`) holds both the LLM and the embedding model that we are going to use.

In [41]:
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.settings import Settings


In [65]:
# Create the LLM and the embedding model, registering each on the global
# Settings object while keeping a notebook-level handle to it.
llm = Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embed_model = Settings.embed_model = OpenAIEmbedding(model='text-embedding-ada-002')

# Build a vector index from the single merged document.
index = VectorStoreIndex.from_documents([document])

## Simple example: generating embeddings with the embedding model

In [36]:
# Embed three short sentences, one call per sentence, in the order listed.
embedding1, embedding2, embedding3 = (
    embed_model.get_text_embedding(sentence)
    for sentence in ("Hello How are you", "Hey What's up", "Why are you here")
)

In [37]:
import numpy as np

In [38]:
# Pairwise dot products between the three sample embeddings, one per line.
print(
    np.dot(embedding1, embedding2),
    np.dot(embedding1, embedding3),
    np.dot(embedding3, embedding2),
    sep="\n",
)

0.9046455973572882
0.8097291094090971
0.8054277595815253


## Query and Response

Next, we create a query engine from this index, which allows us to send queries that perform retrieval and synthesis against this data.

In [66]:
# Create a query engine from the index; no arguments are passed, so the
# library's defaults apply.
query_engine = index.as_query_engine()


In [70]:
# Ask a question about the indexed paper and show the answer as text.
response = query_engine.query("What is the figure of their accuracy on unseen websites?")
print(response)

The accuracy on unseen websites is 38.9% / 39.6%.
