In [1]:
#Retrieval augmented generation

import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [3]:
"""Loading using SimpleDirectoryReader#
The easiest reader to use is our SimpleDirectoryReader, which creates documents out of every file 
in a given directory.It is built in to LlamaIndex and can read a variety of formats including Markdown, 
PDFs, Word documents, PowerPoint decks, images, audio and video."""


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents= SimpleDirectoryReader("data").load_data()
print(documents)
print(documents[0].text)

[Document(id_='93edea83-ae12-448d-ae42-bf7584c67979', embedding=None, metadata={'page_label': '1', 'file_name': 'medical news today intermittent fasting type 2 diabetes.pdf', 'file_path': 'c:\\Users\\HP\\Desktop\\RAG DEMO\\data\\medical news today intermittent fasting type 2 diabetes.pdf', 'file_type': 'application/pdf', 'file_size': 1389503, 'creation_date': '2024-11-22', 'last_modified_date': '2024-11-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='d139775b-a391-4dc2-8d20-844c48d30b72', embedding=None, metadata={'page_label': '2', 'file_name': 'medical news today

In [4]:
documents

[Document(id_='93edea83-ae12-448d-ae42-bf7584c67979', embedding=None, metadata={'page_label': '1', 'file_name': 'medical news today intermittent fasting type 2 diabetes.pdf', 'file_path': 'c:\\Users\\HP\\Desktop\\RAG DEMO\\data\\medical news today intermittent fasting type 2 diabetes.pdf', 'file_type': 'application/pdf', 'file_size': 1389503, 'creation_date': '2024-11-22', 'last_modified_date': '2024-11-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='d139775b-a391-4dc2-8d20-844c48d30b72', embedding=None, metadata={'page_label': '2', 'file_name': 'medical news toda

In [5]:
"""After the data is loaded, you then need to process and transform your data before putting it into a storage system. 
These transformations include chunking, extracting metadata, and embedding each chunk. This is necessary to make sure 
that the data can be retrieved, and used optimally by the LLM."""

from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# global
from llama_index.core import Settings

Settings.text_splitter = text_splitter

# per-index
index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter])

In [6]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x144238f9450>

In [7]:
query_engine=index.as_query_engine()

In [8]:
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x144264e4410>

In [14]:
# from llama_index.retrievers import VectorIndexRetriever
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
# from llama_index.query_engine import RetrieverQueryEngine
from llama_index.core.query_engine import RetrieverQueryEngine
#from llama_index.indices.postprocessor import SimilarityPostprocessor
from llama_index.core.postprocessor import SimilarityPostprocessor


retriever = VectorIndexRetriever(index=index, similarity_top_k=4)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.80)

query_engine = RetrieverQueryEngine(
    retriever=retriever, node_postprocessors=[postprocessor]
)

In [15]:
response=query_engine.query("what is insulin resistance and risk factors")
response

Response(response="Insulin resistance is a condition where the body's cells become resistant to the effects of insulin, leading to elevated blood sugar levels. Risk factors for insulin resistance include obesity, physical inactivity, unhealthy diet, family history of diabetes, and certain medical conditions like polycystic ovary syndrome.", source_nodes=[NodeWithScore(node=TextNode(id_='fdf2ec54-84a5-4eff-a078-ff3743e429fc', embedding=None, metadata={'page_label': '7', 'file_name': 'NIH Insulin Resistance & Prediabetes.pdf', 'file_path': 'c:\\Users\\HP\\Desktop\\RAG DEMO\\data\\NIH Insulin Resistance & Prediabetes.pdf', 'file_type': 'application/pdf', 'file_size': 1762584, 'creation_date': '2024-11-22', 'last_modified_date': '2024-11-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accesse

In [16]:
# from llama_index.response.pprint_utils import pprint_response
# from llama_index.utils import pprint_response
# from llama_index.response.utils import pprint_response
# from llama_index.response.pprint_utils import pprint_response
from llama_index.core.response.pprint_utils import pprint_response

pprint_response(response, show_source=True)
print(response)

Final Response: Insulin resistance is a condition where the body's
cells become resistant to the effects of insulin, leading to elevated
blood sugar levels. Risk factors for insulin resistance include
obesity, physical inactivity, unhealthy diet, family history of
diabetes, and certain medical conditions like polycystic ovary
syndrome.
______________________________________________________________________
Source Node 1/4
Node ID: fdf2ec54-84a5-4eff-a078-ff3743e429fc
Similarity: 0.810038181906677
Text:
______________________________________________________________________
Source Node 2/4
Node ID: 97281fa3-aff9-462d-87b3-8fe441f40c8e
Similarity: 0.8090878685576732
Text:
______________________________________________________________________
Source Node 3/4
Node ID: cb975b37-3a46-48fd-a44a-8f130bfd81e1
Similarity: 0.8054640094491757
Text:
______________________________________________________________________
Source Node 4/4
Node ID: 7bc6dad8-9828-4cb9-b5c0-f5704d5ff52a
Similarity: 0.802610

In [19]:
import os.path
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,StorageContext,load_index_from_storage

# check if storage already exists
PERSIST_DIR = "./storage"
if not os.path.exists(PERSIST_DIR):
    # load the documents and create the index
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# either way we can now query the index
query_engine = index.as_query_engine()
response = query_engine.query("what is the connection between insulin resistance and diabetes")
print(response)

Insulin resistance is a condition where the body's cells become less responsive to insulin, leading to higher levels of glucose in the blood. This can eventually result in the development of type 2 diabetes, as the pancreas struggles to produce enough insulin to regulate blood sugar levels effectively. Over time, if insulin resistance is not managed, it can progress to diabetes.
