# RAG Application

In [1]:
!pip install -qqq llama-index llama-hub langchain==0.1.2 openai accelerate==0.21.0 bitsandbytes==0.40.2 transformers sentence_transformers==2.2.2 InstructorEmbedding chromadb

In [2]:
!pip install llama-index-llms-huggingface



In [3]:
!pip install llama-index-vector-stores-chroma



In [4]:
!pip install llama-index-embeddings-langchain



## Setup

1. In this section we will work with the QLoRA paper and create an initial set of nodes (chunk size 1024).
2. We will use Open Source LLM [`zephyr-7b-alpha`](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and embedding [`hkunlp/instructor-large`](https://huggingface.co/hkunlp/instructor-large)

In [5]:
import json
import torch
from pathlib import Path

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from langchain.embeddings import HuggingFaceInstructEmbeddings
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.vector_stores.chroma.base import ChromaVectorStore
from llama_index.core.storage.storage_context import StorageContext

# Metadata Extraction
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

# db
import chromadb

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"




In [6]:
print(DEVICE)

cuda:0


# Load Data

In [7]:
import os

# Set the locale to UTF-8
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'

# Create the 'data' directory
os.system('mkdir data')

# Download the PDF file
#os.system('wget --user-agent "Mozilla" "ENTER URL HERE" -O "data/brochure.pdf"')

256

In [8]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
#docs = loader.load_data(file=Path("./folder-name/file-name.pdf"))
docs = loader.load_data(file=Path("/content/Dexcom.pdf"))

  PDFReader = download_loader("PDFReader")


In [9]:
docs[1]

Document(id_='aac8668c-b83d-43c0-9a39-0051d4a51a7b', embedding=None, metadata={'page_label': '2', 'file_name': 'Dexcom.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='TOOURSTAKEHOLDERS\nOverthepastyear,Dexcomhasmadesigniﬁcantprogressadvancingourmission–\nempoweringpeop letotakecontrolofhealth. Wehavecommit tedourselvestosimplif ying\nthelivesofourcustomersthrough oursensorandsoftwaretechnology,andtoadvocatingfor\nbroader accesstoDexcom’sleadingContinuous Gluc oseMonitoring(CGM)systems around\ntheglobe .In2023,weexpanded ourglobalrolloutofDexcomG7,comple tedthelargest\nexpans ionofcoverageinourcompan y’shistory,anddeliv erednewoperational andﬁnancial\nefficiencieswhilegrowingourbusiness.WithCGMnowestablishedasthestandardofcare\nforallpeople usinginsulin,wehavefurthersolidiﬁed ourvaluepropositionwithindiabetes\nmanagement. However,aswelookforward,wealsoseealargeroleforDexcomwithinthe\nbroader metabolic health lands cape,andwebelievewecandomo

In [10]:
docs[2].get_content()

'EXPANDI NGOURGLOBALIMPACT\nWearecommittedtoimprovingaccesstoDexcomCGMtechnology forthehundredsofmillionsofpeoplewith diabetes\naround theworld.Advancing thiseffortandnavigatingmulti-facetedglob alhealthsystems requiresstrategicthinking\nacrossourbusiness.Towardthisgoal, wehaveprovenourleade rshipbyadvancingtheindustrywithhigh-p erforming\nCGM systems andkeyclinic altrials that establishthehealthandeconomicbeneﬁtsassociatedwith DexcomCGM.\nIn2023 ,wesigniﬁcantlyincreasedaccesstoDexcomCGMthroughourongoingclinicalandadvocacyeffortsandby\nleveraging ourCGM portfolio,whichallowsustomeettheuniqueneedsofdifferenthealthcaresystems.Mostnotably,in\nrecognition oftheoutcomesproveninDexcom’sMOBILEclinicaltrial,Medicareestablishedcoverageforpeoplewith\ntype2diabe teswhousebasal,orback ground,insulinonly,aswellascertainnon-insulinusingindivid ualsthat\nexperienc ehypoglycemia.Thisdecis ionrepresentedthelargestsingleexpansionofaccessforCGMinourcompany’s\nhistoryandwillbeakeymilestoneasweseektobroade

In [11]:
# combine all the text
doc_text = "\n\n".join([d.get_content() for d in docs])
documents = [Document(text=doc_text)]

# Chunking

In [12]:
node_parser = SentenceSplitter(chunk_size=1024)

In [13]:
base_nodes = node_parser.get_nodes_from_documents(documents)
# set node ids to be a constant
for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"

In [14]:
# print all the node ids corrosponding to all the chunks
for node in base_nodes:
  print(node.id_)

node-0
node-1
node-2
node-3
node-4
node-5
node-6
node-7
node-8
node-9
node-10
node-11
node-12
node-13
node-14
node-15
node-16
node-17
node-18
node-19
node-20
node-21
node-22
node-23
node-24
node-25
node-26
node-27
node-28
node-29
node-30
node-31
node-32
node-33
node-34
node-35
node-36
node-37
node-38
node-39
node-40
node-41
node-42
node-43
node-44
node-45
node-46
node-47
node-48
node-49
node-50
node-51
node-52
node-53
node-54
node-55
node-56
node-57
node-58
node-59
node-60
node-61
node-62
node-63
node-64
node-65
node-66
node-67
node-68
node-69
node-70
node-71
node-72
node-73
node-74
node-75
node-76
node-77
node-78
node-79
node-80
node-81
node-82
node-83
node-84
node-85
node-86
node-87
node-88
node-89
node-90
node-91
node-92
node-93
node-94
node-95
node-96
node-97
node-98
node-99
node-100
node-101
node-102
node-103
node-104
node-105
node-106
node-107
node-108
node-109
node-110
node-111
node-112
node-113
node-114
node-115
node-116
node-117
node-118
node-119
node-120
node-121
node-122
nod

# LLM (`zephyr-7b-alpha`)

In [15]:
from google.colab import userdata

# huggingface api token
hf_token = userdata.get('hf_token')

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
  prompt = ""
  for message in messages:
    if message.role == 'system':
      prompt += f"<|system|>\n{message.content}</s>\n"
    elif message.role == 'user':
      prompt += f"<|user|>\n{message.content}</s>\n"
    elif message.role == 'assistant':
      prompt += f"<|assistant|>\n{message.content}</s>\n"

  # ensure we start with a system prompt, insert blank if needed
  if not prompt.startswith("<|system|>\n"):
    prompt = "<|system|>\n</s>\n" + prompt

  # add final assistant prompt
  prompt = prompt + "<|assistant|>\n"

  return prompt


llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


# Embedding (`hkunlp/instructor-large`)

In [16]:
# If TypeError for while using Instruct Embeddings on HuggingFace // Update libraries to - !pip install langchain==0.1.2 sentence_transformers==2.2.2

In [17]:
# embed_model = HuggingFaceInstructEmbeddings(
#     model_name="hkunlp/instructor-large", model_kwargs={"device": DEVICE}
# )

In [18]:
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

embed_model = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en-v1.5")

load INSTRUCTOR_Transformer
max_seq_length  512


In [19]:
# set your ServiceContext for all the next steps
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

  service_context = ServiceContext.from_defaults(


## Baseline Retriever

Define a baseline retriever that simply fetches the top-k raw text nodes by embedding similarity.

In [20]:
base_index = VectorStoreIndex(base_nodes, service_context=service_context)

In [21]:
base_retriever = base_index.as_retriever(similarity_top_k=3)

In [22]:
#give me the MRPs for on kitchen skirting hardware 80mm series BL
retrievals = base_retriever.retrieve(
    "What locations does dexcom have a presence in?"
)

In [23]:
for n in retrievals:
    display_source_node(n, source_length=1500)

**Node ID:** node-169<br>**Similarity:** 0.7228175975845952<br>**Text:** Attn:SeanChristensen,VicePresident,
Financ eandInvestorRelations
6340Sequenc eDrive
SanDiego,CA92121
(858)200-0200
www .dexcom.com
investor-relations@d excom.com
ShareholderMeetingDate
May 22,2024
Meetingtobeheld virtuallyBoardofDirectors
KevinSayer
Chairman, President andChie fExecutiv eOfficer,Dexcom
Mark Foletta
Former SeniorVicePresident, Financeand
Chie fFinancialOfficer,AmylinPharm aceutic als
StevenR.Altman
Former President, Qualcomm
NicholasAugustinos
Former President &ChiefExecutiv eOfficerofAver,Inc.
dbaEnlaceHealth
RichardCollins
Form erChie fExecutiv eOfficerforUnitedHealthc are’s
NortheastRegion
KarenDahut
Chie fExecutiv eOfficerofGoogle Public Sector
Rimma Driscoll
Executiv eVicePresident andHead ofGlobalStrategy,
Commer cialandBusinessDevelopment, and
GlobalBioDevices,Zoetis
Bridge tteHeller
Former Executiv eVicePresident &PresidentofNutricia,
SpecializedNutrition,Danone
BarbaraE.Kahn
PattyandJayJ.BakerProfessorofMarketing,TheWharton
School attheUniversityofPennsylvania
KyleMalady
Executiv eVicePresident ofGlobal Networksand
Technology &ChiefTechnology Office r,Verizon
EricJ.Topol,M.D.
Director,Scripp sTranslational ScienceInstitute2023Annual Report

www .dex com.com|6 340 Sequence Drive |SanDiego, CA92121
Dexcom, DexcomClarit y,DexcomOne,andanyrelated logosanddesign marksareeither registered trademarks or
trademark sofDexcom,Inc.intheUnited Statesand/o rothercountries.<br>

**Node ID:** node-6<br>**Similarity:** 0.7035752295033949<br>**Text:** CorporateInformation
Themailingaddressofourheadquartersis6340Sequenc eDrive,SanDiego,California,92121, andourtelephone
numberatthatlocationis(858)200-0200 .Ourwebsiteaddressislocatedatdexcom.comandourinvestorrelations
websiteislocatedatinvestors.dexcom.com.WefileelectronicallywiththeSEC ourannual reportsonForm10-K,
quarterlyreportsonForm10-Q,currentreportsonForm8-K,andamendm entstothosereportsfiledorfurnished
pursuant toSection13(a)or15(d)oftheExchange Act.Wemakeavailableonourwebsite,freeofcharge,copiesof
thesereportsandotherinformationassoonasreasonabl ypracticableafterweelectronicallyfilesuchmaterialwith,
orfurnishitto,theSEC.Thereportsarealsoavailableatwww.sec.gov.
Weannou ncematerialinformationtothepublicabout us,ourproduc ts,andothermattersthroughavarietyof
means,includingfilingswiththeSEC ,pressreleases,publicconferencecalls,presentations,webcasts,andour
investorrelationswebsiteinordertoachievebroad,non-exclusionarydistributionofinformationtothepublicandto
complywithourdisclosureobligationsunderRegulationFD.Wealsoroutinelypostimportantinformationfor
investorsonourwebsitenotedabov e,andwemayusethiswebsiteasameansofdisclosingmaterial,non-public
informationandforcomplyingwithourdisclosureobligationsunder RegulationFD.Accordingly,investorsshould
monitortheInvestorRelationsportionofourwebsitenotedabov e.Alsoavailableonourwebsiteareprintable
versionsofourAuditCommitteecharter,CompensationCommitteecharter,NominatingandGovernance
Committeecharter,TechnologyCommitteecharter,C...<br>

**Node ID:** node-124<br>**Similarity:** 0.670940936360954<br>**Text:** 93

[REMAINDER OFPAGEINTENTIONALLYLEFTBLANK]

DexCom,Inc.
IndextoConsolidatedFinancialStatements
Page
ReportofIndep endent RegisteredPublicAccountingFirm.................................................................................... F-2
ConsolidatedBalanceSheets..................................................................................................................................... F-4
ConsolidatedStatementsofOperations.................................................................................................................... F-5
ConsolidatedStatementsofComprehens iveIncome............................................................................................. F-6
ConsolidatedStatementsofStockholders’Equity.................................................................................................. F-7
ConsolidatedStatementsofCashFlows................................................................................................................... F-8
NotestoConsolidatedFinanc ialStatements............................................................................................................ F-10
F-1

REPORTOFINDEPENDENTREGISTEREDPUBLICACCO UNTINGFIRM
TotheStockholdersandtheBoardofDirectorsofDexCom,Inc.
OpinionontheFinancialStatements
WehaveauditedtheaccompanyingconsolidatedbalancesheetsofDexCom,Inc.(theCompany)asof
December31,2023and2022, therelatedconsolidatedstatementsofoperations,comprehens iveincome,
stockholders’equityandcashfl...<br>

In [24]:
query_engine_base = RetrieverQueryEngine.from_args(
    base_retriever, service_context=service_context
)

In [25]:
response = query_engine_base.query(
    "What locations does dexcom have a presence in?"
)
print(str(response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The provided context information does not explicitly mention the locations where Dexcom has a presence. However, it does provide contact information for the company's headquarters in San Diego, California, and notes that the company's annual report is available on their website, which is also located in San Diego. It is possible that Dexcom has additional locations, but this information is not provided in the given context.


The baseline retriever gives a very generalistic answer. Let's use some advanced RAG techniques to generate a better response.

## 1. Chunk References: Smaller Child Chunks Referring to Bigger Parent Chunk
## Automerging Retrieval

Now, we will build smaller chunks that will point to their bigger parent chunks.

During query-time, we retrieve smaller chunks, but we follow references to bigger chunks. This allows us to have more context for synthesis.

In [26]:
sub_chunk_sizes = [256, 512]
sub_node_parsers = [SentenceSplitter(chunk_size=c) for c in sub_chunk_sizes]

all_nodes = []
for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # also add original node to node
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

In [27]:
all_nodes_dict = {n.node_id: n for n in all_nodes}

In [28]:
all_nodes_dict.keys()

dict_keys(['c6ccd534-e4d6-4f75-8c6b-829f7224ee5d', 'b155db7b-ab82-411e-829f-fbf01783465e', '61ce0abb-d02a-440f-b72c-285101fb8de7', '7c4adf15-acd9-406e-96ca-f763e9e167c5', 'bc66e24c-0ec4-4807-af02-caec78658a94', 'c5ccbb4d-a1e8-4d29-abc8-519fc5008305', '9e9d4060-9953-4b29-be59-0b0fd36c7e7d', 'a23f1a35-bd5b-49ed-9783-26ca81ac43f8', '045fbaee-447e-4c84-aaa9-f34909bce90b', 'node-0', 'd46bef00-54df-401a-b593-9ee4bf7bfb8b', '299113d8-2dad-4a93-90fc-9855d1147ffd', '75c929ab-4da2-4ab9-a6e2-4a179b6b1531', '34467be0-45f7-4dab-9c16-33f147356a35', '7768c327-04c3-42f7-8d3a-cd0fa2987585', 'e5a60134-ac54-43f6-9d09-87c601b1d419', '3fb65881-8770-4196-9cf5-9eb12f55fcd3', 'f728ac94-1f78-43e5-b114-96da3bf46b11', '22ce40ef-3b1e-49c3-8afc-0cabd4512757', '5111d448-345e-4dc6-a54a-9282170be8db', 'b9b3dc06-1702-4091-a802-b1d5cc258f7e', 'node-1', '9ad1fbb5-ec81-47df-abb6-4e032bdf420f', '74549f40-3421-4d0e-8b0f-6874a2ca6fbc', '044b687e-a254-4882-ac33-f0b01a47826a', 'bc448ee7-d009-4911-9cc6-ceecdca7ed80', 'df43884d

In [29]:
#all_nodes_dict['5a10ca10-8282-4225-8882-523d4310a406']

In [30]:
# all_nodes_list = list(all_nodes_dict.keys())
# index_id = [x for x in all_nodes_list if "node-" not in x]
# for id in index_id:
#   print(f"{id} ---> {all_nodes_dict[id].index_id}")
#   print("-"*40, end="\n")

See that these many smaller chunks (`IndexNode`) are associated with each of the original text chunks(`TextNode`) for example `node-0`. In fact, all of the smaller chunks reference to the large chunk in the metadata with `index_id` pointing to the index ID of the larger chunk.

## Create Index from these smaller chunks (IndexNode)

In [31]:
vector_index_chunk = VectorStoreIndex(
    all_nodes, service_context=service_context
)

In [32]:
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)

When we perform retrieval, we want to retrieve the reference as opposed to the raw text. You can have multiple references point to the same node.

In [33]:
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)

In [34]:
nodes = retriever_chunk.retrieve(
    "What locations does dexcom have a presence in?"
)
for node in nodes:
    display_source_node(node, source_length=2000)

[1;3;34mRetrieving with query id None: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-4
[0m[1;3;34mRetrieving with query id node-4: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-168
[0m[1;3;34mRetrieving with query id node-168: What locations does dexcom have a presence in?
[0m

**Node ID:** node-4<br>**Similarity:** 0.7445489718479577<br>**Text:** 7262(b )) by the 
registered public accounting firm that prepared or issued its audit report. È 
If securities are registered pursuant to Section 12(b) of the Act, indicate by check mark whether the financial statements of t he registrant 
included in the filing reflect the correction of an error to previously issued financial statements. ‘ 
Indicate by check mark whether any of those error corrections are restatements that required a recovery analysis of incentive-
based compensation received by any of the registrant’s executive officers during the relevant recovery period pursuant to §240.10D-1(b). ‘ 
Indicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). Yes ‘NoÈ 
As of June 30, 2023, the last business day of the registrant’s most recently completed second fiscal quarter, the aggregate mar ket value 
of the registrant’s common stock held by non-affiliates of the registrant was approximately $49.4 billion based on the closing sales price of 
$128.51 per share as reported on the Nasdaq Global Select Market on that date. Shares held by persons who may be deemed affilia tes 
have been excluded. This determination of affiliate status with respect to the foregoing calculation is not a determination for  other purposes. 
Indicate the number of shares outstanding of each of the registrant’s classes of common stock, as of the latest practicable dat e. 
Class Outstanding at February 1, 2024 
Common stock, $0.001 par value per share 385,515,421 
DOCUMENTS INCORPORATED BY REFERENCE 
Portions of the registrant’s definitive proxy statement relating to its 2024 Annual Meeting of Stockholders (the “Proxy Stateme nt”) 
are incorporated by reference in Part III, Items 10 through 14 of this Annual Report on Form 10-K, as specified in the response s to 
those item numbers, which proxy statement will be filed with the Securities and Exchange Commission within 120 days after the e nd of 
the fiscal year covered by this Form 10-K. 

D...<br>

**Node ID:** node-168<br>**Similarity:** 0.7294333914871163<br>**Text:** SCHEDULEII–VALUATIONAND QUALIFYINGACCO UNTS
(Inmillions)
TwelveMonthsEnded
December31,
Allowancefordoubtfulacco unts 2023 2022 2021
BeginningBalance............................................................................................ $7 .3$5 .4$7 .2
Provisionfordoubt fulaccounts................................................................. 2.02 .4 (1.4)
Write-offsandadjustments......................................................................... —( 0.5) (0.5)
Recoveries.................................................................................................... —— 0.1
EndingBalance................................................................................................. $9 .3$7 .3$5 .4
F-44

Forw ard-Lookin gStatement s
Thissumm aryrepor tandDexcom’s
Annual ReportonForm10-K
(Annual Report)include statement s
relating toDexcom’sbusinessplans ,
objectiv es,andexpect edoper ating
resultsthatare“forward-looking
statement s”within themeaning of
thePrivateSecuritie sLitigation
ReformActof1995.Forward-looking
statement sarebasedoncurrent
expect ations andassumptions that
aresubjecttorisksanduncertainties
that may causeactualresultsto
differmaterially .SeeDexcom’sﬁlings
with theSecuritie sandExchange
Commis sion,including itsmost
recentAnnual Report,fora
discussionofimpor tantriskfactors
that couldcauseactual eventsor
resultstodiffermaterially .Office rs
KevinSayer
Chair man,PresidentandChie fExecutiveOfficer
Donald M.Abbe y
Executiv eVicePresident, Glob alBusinessServices,IT,Quality
andRegulatoryAffairs
Michael Brown
Executiv eVicePresident, Chie fLegal Office r
MatthewDolan
Executiv eVicePresident, Strategy,CorporateDevelopment
andDexcomLabs
PaulFlynn
Executiv eVicePresident, Glob alRevenue
TeriLawver
Executiv eVicePresident, Chie fComm ercialOfficer
JakeLeach
Executiv eVicePresident, Chie fOperatingOfficer
LeverneMarsh
Executiv eVicePresident, Marketing
GirishNagan athan
Executiv eVicePresident, Chie fTechn olog yOfficer
StevenR.Pacelli
Exe...<br>

In [35]:
query_engine_chunk = RetrieverQueryEngine.from_args(
    retriever_chunk, service_context=service_context
)

In [36]:
response = query_engine_chunk.query(
    "What locations does dexcom have a presence in?"
)
print(str(response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mRetrieving with query id None: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-4
[0m[1;3;34mRetrieving with query id node-4: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-168
[0m[1;3;34mRetrieving with query id node-168: What locations does dexcom have a presence in?
[0mThe provided context information does not explicitly state the locations where Dexcom has a presence. However, it does mention that Dexcom's Annual Report on Form 10-K includes a discussion of important risk factors that could cause actual events or results to differ materially, and that officeholders Kevin Sayer, Donald M. Abbe, Michael Brown, Matthew Dolan, Leverne Marsh, Sadie M. Stern, and Jerome M. Sylvain are located in various locations. Additionally, the company's legal counsel, Fenwick & West LLP, is based in San Francisco, California. The company's independent auditors, Ernst & Youn

## 2. Metadata References: Summaries + Generated Questions referring to a bigger chunk

## Multirepresentation Indexing
Now, we will add some additional context that references the source node.

This additional context includes summaries as well as generated questions. 'Due to the limited compute I am only extracting questions, but you can uncomment the summarizer to extract summaries.'

During query-time, we retrieve smaller chunks, but we follow references to bigger chunks. This allows us to have more context for synthesis.

In [37]:
import nest_asyncio

nest_asyncio.apply()

In [38]:
extractors = [
    # SummaryExtractor(summaries=["self"], llm=llm, show_progress=True),
    QuestionsAnsweredExtractor(questions=1, llm=llm, show_progress=True),
]

In [39]:
# run metadata extractor across base nodes, get back dictionaries
metadata_dicts = []
for extractor in extractors:
    metadata_dicts.extend(extractor.extract(base_nodes))

  0%|          | 0/170 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting 

In [40]:
# all nodes consists of source nodes, along with metadata
import copy

all_nodes = copy.deepcopy(base_nodes)
for idx, d in enumerate(metadata_dicts):
    inode_q = IndexNode(
        text=d["questions_this_excerpt_can_answer"],
        index_id=base_nodes[idx].node_id,
    )
    # inode_s = IndexNode(
    #     text=d["section_summary"], index_id=base_nodes[idx].node_id)
    all_nodes.extend([inode_q]) #, inode_s

In [41]:
all_nodes_dict = {n.node_id: n for n in all_nodes}

In [42]:
vector_index_metadata = VectorStoreIndex(all_nodes, service_context=service_context)
vector_retriever_metadata = vector_index_metadata.as_retriever(similarity_top_k=2)

In [43]:
retriever_metadata = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_metadata},
    node_dict=all_nodes_dict,
    verbose=True,
)

In [44]:
nodes = retriever_metadata.retrieve(
    "What locations does dexcom have a presence in?"
)
for node in nodes:
    display_source_node(node, source_length=2000)

[1;3;34mRetrieving with query id None: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-106
[0m[1;3;34mRetrieving with query id node-106: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-126
[0m[1;3;34mRetrieving with query id node-126: What locations does dexcom have a presence in?
[0m

**Node ID:** node-106<br>**Similarity:** 0.751852563005777<br>**Text:** Overview
WhoWeAre
Weaream edicaldevicecompany primarilyfocusedonthedesign,developmentandcomme rcializationof
continuous glucosemonitoring,orCGM,systemsforthemanagem entofdiabetesbypatients,caregivers,and
cliniciansaroundtheworld.
WereceivedapprovalfromtheFoodandDrugAdministration,orFDA,andcomme rcializedourfirstproductin
2006.Welaunchedourlatestgene rationsystems,theDexcomG6®integratedContinuous GlucoseMonitoring
System,orG6,in2018andmorerecentlyreceivedmarketingclearancefromtheFDAontheDexcomG7®,or
G7,inDecember2022.
Unlessthecontextrequiresotherwise,theterms“we,”“us,”“our,”the“company,”or“Dexcom”refertoDexCom,
Inc.anditssubsidiaries.
GlobalPresence
WehavebuiltadirectsalesorganizationinNorthAmericaandcertaininternationalmarketstocallonhealth
careprofessionals,suchasendocrinologists,physiciansanddiabeteseduc ators,whocaneduc ateand
influenc epatientadoptionofcontinuous glucosemonitoring.Tocomplementourdirectsalesefforts,wehave
enteredintodistributionarrangementsinNorthAmericaandseveralinternationalmarketsthatallowdistributors
tosellourproduc ts.
FutureDevelopments
Produc tDevelopment:Weplantodevelopfuturegener ationsoftechnologiesthatarefocusedonimproved
performanceandconvenienceandthatwillenabl eintelligentinsulinadministration.Overthelonger term,we
plantocontinuetodevelopandimprovenetworkedplatformswithopenarchitecture,connec tivityand
transmitterscapableofcommunicatingwithotherdevices.Wealsointendtoexpand oureffortstoaccumulate
CGMpatientdataandmetricsandapplypredictivemodelingandmachinelearningtogener ateinteractiveCGM
insightsthatcaninformpatientbehavior.
Partnerships:Wealsocontinuetopursueandsupportdevelopmentpartnershipswithinsulinpumpcompanies
andcompaniesorinstitutionsdevelopinginsulindeliverysystems,includingautomatedinsulindeliverysystems.
NewOppor tunities:Wearealsoexploringhow toextendourofferingstootheroppor tunities,includingfor
peoplewithType2diabetesthatarenon-insulinusing,peopl ewithpre-diabetes,peoplewhoareobese,peopl e
whoarepregnant,andpeopl e...<br>

**Node ID:** node-126<br>**Similarity:** 0.7492809112610931<br>**Text:** Ourauditproceduresalsoincluded, amongothers,evaluatingthesignificantassumptionsand
theaccuracyandcompletenessoftheunder lyingdatausedinmanagem ent’scalculations.
Thisincluded testingcontractualrates,managem ent’sestimatesofproductssoldsubjectto
rebate,andinventoryheldbythirdpartiesattheendoftheperiod,throughacombinationof
under lyingdatavalidationbyinspectionofsourcedocuments,agreementtounder lying
contracts,andreviewforconsistencyagainsthistoricaldata.Inaddition,weinspectedthe
resultsoftheCompany’sanalysisofpharmacyrebatesclaimedandevaluatedtheestimates
madebasedonhistoricalexperience.
/s/Ernst&Young LLP
WehaveservedastheCompany’sauditorsince2000.
SanDiego,California
February8,2024
F-3

DexCom,Inc.
ConsolidatedBalanceSheets
December31,
2023 2022
(Inmillions,exceptparvaluedata)
Assets
Currentassets:.......................................................................................................................
Cashandcashequivalents............................................................................................. $5 66.3$ 642.3
Short-termmarketablesecurities................................................................................... 2,157.81 ,813.9
Accountsreceivable,net................................................................................................. 973.97 13.3
Inventory............................................................................................................................ 559.63 06.7
Prepaidandothercurrentassets................................................................................... 168.31 92.6
Totalcurrentassets..................................................................................................... 4,425.93 ,668.8
Propertyandequipment,net............................................................................................... 1,113.11 ,055.6
Operatingleaseright-of-useassets................................................................................... 71.4 80.0
Goodwi...<br>

In [45]:
query_engine_metadata = RetrieverQueryEngine.from_args(
    retriever_metadata, service_context=service_context
)

In [46]:
response = query_engine_metadata.query(
    "What locations does dexcom have a presence in?"
)
print(str(response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mRetrieving with query id None: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-106
[0m[1;3;34mRetrieving with query id node-106: What locations does dexcom have a presence in?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-126
[0m[1;3;34mRetrieving with query id node-126: What locations does dexcom have a presence in?
[0mThe text mentions that dexcom has built a direct sales organization in North America and certain international markets to call on healthcare professionals, such as endocrinologists, physicians, and diabetes educators, who can educate and influence patient adoption of continuous glucose monitoring. However, the specific international markets are not explicitly listed.


In [47]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install cohere

Collecting cohere
  Downloading cohere-5.5.0-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.5/158.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.34.109-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpx-sse<0.5.0,>=0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20240521-py3-none-any.whl (15 kB)
Collecting botocore<1.35.0,>=1.34.109 (from boto3<2.0.0,>=1.34.0->cohere)
  Dow

In [48]:
 !pip install -qqq llama-index llama-hub cohere langchain openai accelerate==0.21.0 bitsandbytes==0.40.2 transformers sentence_transformers InstructorEmbedding

In [49]:
 import nest_asyncio
# nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
pd.set_option("display.max_colwidth", 0)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from langchain.embeddings import HuggingFaceInstructEmbeddings

from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine.transform_query_engine import TransformQueryEngine

from IPython.display import Markdown, display
from llama_index.core.response.notebook_utils import display_source_node

from llama_index.core.query_engine import RetrieverQueryEngine
from IPython.display import Markdown, display, HTML
from llama_index.core.retrievers import VectorIndexRetriever

from sentence_transformers import SentenceTransformer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [50]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
docs = loader.load_data(file=Path("/content/Dexcom.pdf"))

  PDFReader = download_loader("PDFReader")


In [51]:
node_parser = SentenceSplitter(chunk_size=256)
nodes = node_parser.get_nodes_from_documents(docs)

In [52]:
len(nodes)

1479

In [53]:
# from google.colab import userdata

# # huggingface api token
# hf_token = userdata.get('hf_token')

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )


# def messages_to_prompt(messages):
#   prompt = ""
#   for message in messages:
#     if message.role == 'system':
#       prompt += f"<|system|>\n{message.content}</s>\n"
#     elif message.role == 'user':
#       prompt += f"<|user|>\n{message.content}</s>\n"
#     elif message.role == 'assistant':
#       prompt += f"<|assistant|>\n{message.content}</s>\n"

#   # ensure we start with a system prompt, insert blank if needed
#   if not prompt.startswith("<|system|>\n"):
#     prompt = "<|system|>\n</s>\n" + prompt

#   # add final assistant prompt
#   prompt = prompt + "<|assistant|>\n"

#   return prompt


# llm = HuggingFaceLLM(
#     model_name="HuggingFaceH4/zephyr-7b-alpha",
#     tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
#     query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
#     context_window=3900,
#     max_new_tokens=256,
#     model_kwargs={"quantization_config": quantization_config},
#     # tokenizer_kwargs={},
#     generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "do_sample":True},
#     messages_to_prompt=messages_to_prompt,
#     device_map="auto",
# )

In [54]:
# from langchain_community.embeddings import HuggingFaceInstructEmbeddings

# embed_model = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en-v1.5")

In [55]:
# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model
                                               )

# index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

  service_context = ServiceContext.from_defaults(llm=llm,


## 3. HyDE Query Transformation

In [56]:
#What locations does dexcom have a presence in?
query_str = "What locations does dexcom have a presence in?"

In [57]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>Dexcom has a presence in San Diego, California, Mesa, Arizona, and Penang, Malaysia.</b>

In [58]:
#What locations does dexcom have a presence in?
query_str = "What is the address of Dexcom's headquarters and the locations where they deliver their components?"
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>The address of Dexcom's headquarters is 634 Sequence Drive, San Diego, CA 92121. The locations where they deliver their components are not explicitly mentioned in the given context information. However, it is mentioned that they have three manufacturing facilities located in San Diego, California, Mesa, Arizona, and Penang, Malaysia.</b>

In [59]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>The provided context information does not include the address of Dexcom's headquarters or the locations where they deliver their components.</b>

In [60]:
# To look at the hyde doc
query_bundle = hyde(query_str)
hyde_doc = query_bundle.embedding_strs[0]
hyde_doc

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"Dexcom, a leading manufacturer of continuous glucose monitoring (CGM) systems, has its headquarters located at 100 Dexcom Drive, San Diego, California, USA. The company's components are delivered to various locations worldwide, including the United States, Canada, Europe, Asia, and Australia. Dexcom's products are distributed through a network of authorized distributors and directly to healthcare providers and patients. The company's CGM systems are designed to provide real-time glucose monitoring, enabling individuals with diabetes to better manage their condition and improve their overall health outcomes. Dexcom's headquarters is a state-of-the-art facility that houses research and development, manufacturing, and administrative functions. The company's commitment to innovation and excellence has earned it numerous awards and accolades, including being named one of the fastest-growing companies in the world by Fortune magazine."