In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.text_splitter import SentenceSplitter


In [4]:
def load_documents(file_path, num_pages=None):
  if num_pages:
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()[:num_pages]
  else:
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
  return documents

def create_nodes(documents, chunk_size=2000, chunk_overlap=450):
  node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  nodes = node_parser.get_nodes_from_documents(documents)
  return nodes


# load documents
documents1 = load_documents(r"C:\Users\ASUS\Desktop\Jiten\RIL-70_compressed-1-50-1-20.pdf")
# documents2 = load_documents("dense_x_retrieval.pdf", 9)
# documents3 = load_documents("llama_beyond_english.pdf", 7)

# create nodes
nodes1 = create_nodes(documents1)
# nodes2 = create_nodes(documents2)
# nodes3 = create_nodes(documents3)



In [5]:
print(nodes1)



In [8]:
# from llama_index.core.llms import OpenAI
from llama_index.llms.openai import OpenAI

from llama_index.core.evaluation import generate_question_context_pairs

# web_search_queries qa template
web_search_queries_qa_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge. \
generate queries based on the below task. \

Task:
Your task is to create {num_questions_per_chunk} Web search-like queries. \
Restrict the queries to the context information provided. \

Following is the explaination for Web search-like queries: \
Shortened queries similar to those commonly entered into a search engine
An example query: Best retrieval concept
"
"""
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0.3)

web_search_queries_single_document = generate_question_context_pairs(
    nodes1, llm=llm, num_questions_per_chunk=5, qa_generate_prompt_tmpl = web_search_queries_qa_tmpl
)


  0%|          | 0/21 [00:00<?, ?it/s]

100%|██████████| 21/21 [06:14<00:00, 17.84s/it]


In [9]:
# Web search queries
queries = list(web_search_queries_single_document.queries.values())
queries

['"What are the benefits of 5G platforms for digital transformation?"',
 '"How does circularity contribute to a prosperous and connected future?"',
 '"What types of content are included in the integrated annual report for 2022-23?"',
 '"How does the inclusion of culture and performing arts impact community development and innovation?"',
 '"What are the available choices for accessing green and secure mobility options?"',
 'What is the current market capitalisation of Reliance Industries Limited in India?',
 "How has Reliance Industries Limited contributed to India's economic growth?",
 'What is the stakeholder value creation approach of Reliance Industries Limited?',
 'What are the key performance indicators of Reliance Industries Limited?',
 'How has Reliance Industries Limited integrated sustainability and ESG into its business strategy?',
 'What is the revenue and EBITDA of Reliance Industries Limited for the year ended March 31, 2023?',
 'What are the main products and capabilities