# This is an End-to-End LLM test project
<hr>

- Data Ingestion
- Data Indexing
- Query Interface

### Load the dotenv environment

In [6]:
import os

from dotenv import load_dotenv
# Load variables from the .env file; the call's return value is the cell output.
load_dotenv()

True

### Get OpenAI API Key

In [7]:
# Fail fast with a readable message when the key is missing. Without this
# guard, os.getenv returns None and assigning None into os.environ raises an
# opaque "TypeError: str expected, not NoneType".
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Re-export the key explicitly so downstream libraries can read it.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

### Get the data

In [8]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
# Read the files found in the local `data` directory into `documents`.
documents = SimpleDirectoryReader('data').load_data()

In [8]:
# Display the loaded documents.
# NOTE(review): this dumps every document's full text and metadata (including
# absolute local file paths) into the output; consider inspecting
# len(documents) or a single item before sharing the notebook.
documents

[Document(id_='4b1a07b7-80fa-4b01-bc26-46e90bbdcfab', embedding=None, metadata={'page_label': '1', 'file_name': 'Haida bracelet.pdf', 'file_path': '/Users/chris/Desktop/7980/CS7980_Capstone_RBCMuseum/llamaindex/data/Haida bracelet.pdf', 'file_type': 'application/pdf', 'file_size': 227621, 'creation_date': '2024-09-29', 'last_modified_date': '2024-09-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="Visit\nPlan Your Visit (http://royalbcmuseum.bc.ca/visit)\nAbout (http://royalbcmuseum.bc.ca/about)\nSupport Us (http://royalbcmuseum.bc.ca/support)\nContact Us (http://royalbcmuseum.bc.ca/contact)\nTickets (http://royalbcmuseum.bc.ca/tickets)\nWhat's On\nExhibitions (http://royalbcmuseum.bc.ca/exhibitions)\nImax® (http://royalbcmuseum.bc.ca/imax)\nCalenda

### Convert to vector and index them from the documents

- `show_progress=True` displays progress bars while the documents are parsed and embedded

In [9]:
# Build the vector index from the loaded documents. With show_progress=True
# the call prints progress bars for node parsing and embedding generation.
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 972.82it/s]
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  6.09it/s]


Vector store index

In [63]:
# Show the index object's repr to confirm what was built.
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1642b45d0>

<hr>

### Modify the retrieval parameters

- This modifies the query engine by setting different retrieval parameters
<hr>

[Vector doc](https://docs.llamaindex.ai/en/stable/api_reference/retrievers/vector/)

[Custom Retrievers](https://docs.llamaindex.ai/en/stable/examples/query_engine/CustomRetrievers/)

[Node Processor](https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/)

In [10]:
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever

# Retriever over the vector index, asking for the top 3 matches per query.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)

# Post-processing step configured with a similarity cutoff of 0.77.
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.77)

# Combine retrieval and post-processing into a single query engine.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
)

<hr>

In [50]:
# Show the customised query engine's repr to confirm its type.
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x1668b2a50>

### The index can also be queried directly

Create a query engine from the index using its default settings

In [11]:
# Build a query engine straight from the index via as_query_engine().
# NOTE(review): the name `query_engin` (sic) keeps this object separate from
# the customised `query_engine`; the query cells below use `query_engine`.
query_engin = index.as_query_engine()

It is a retriever query engine that retrieves from the data index

In [12]:
# Show the default query engine's repr to confirm its type.
query_engin

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x1244871d0>

### Generate a query

In [11]:
# Natural-language question reused by every query cell in this notebook.
query = "What is the bracelet made of, who made it, which year it was made"

In [13]:
# Send the question through the customised `query_engine` (not `query_engin`).
response = query_engine.query(query)

### Print the output

- `show_source=True` also prints the source nodes the answer was drawn from

In [14]:
from llama_index.core.response.pprint_utils import pprint_response
# Pretty-print the answer; show_source=True also lists each source node with
# its node ID, similarity score, and a text excerpt.
pprint_response(response, show_source=True)

Final Response: The bracelet is made of silver, it was made by Robert
Davidson in the year 1980.
______________________________________________________________________
Source Node 1/1
Node ID: ba9e9264-7304-4c40-bbcb-e0cccaf467ac
Similarity: 0.7925291996483909
Text: Visit Plan Your Visit (http://royalbcmuseum.bc.ca/visit) About
(http://royalbcmuseum.bc.ca/about) Support Us
(http://royalbcmuseum.bc.ca/support) Contact Us
(http://royalbcmuseum.bc.ca/contact) Tickets
(http://royalbcmuseum.bc.ca/tickets) What's On Exhibitions
(http://royalbcmuseum.bc.ca/exhibitions) Imax®
(http://royalbcmuseum.bc.ca/imax) Calen...


## Storing
<hr>

### The default way: persisting the index to a storage directory

- `PERSIST_DIR` is `./storage`
- an existing index is loaded back with `load_index_from_storage`

In [25]:
import os.path
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

# Directory where the index is persisted between runs.
PERSIST_DIR = "./storage"

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: reload it instead of rebuilding.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # Nothing persisted yet: read the documents, build the index, and save
    # it so later runs take the branch above.
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

## Using Chromadb

[ChromaDB](https://docs.llamaindex.ai/en/stable/understanding/storing/storing/)

To use Chroma to store the embeddings from a VectorStoreIndex, you need to:

- initialize the Chroma client
- create a Collection to store your data in Chroma
- assign Chroma as the `vector_store` in a `StorageContext`
- initialize your `VectorStoreIndex` using that `StorageContext`
- `chroma_collection` is the handle to the named collection (here `bracelet`) that holds the embeddings

In [None]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# load some documents
documents = SimpleDirectoryReader("./data").load_data()

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db")

# create collection (or reopen it when it already exists on disk)
chroma_collection = db.get_or_create_collection("bracelet")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# create your index
# The collection is persistent, so calling from_documents on every run would
# embed and insert the same documents again, leaving duplicate vectors in
# ./chroma_db. Only index when the collection is empty; otherwise reuse the
# vectors that are already stored.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, storage_context=storage_context
    )

Test index performance

In [26]:
# Rebuild the query pipeline against the current `index`: request the top 3
# matches and apply a loose similarity cutoff of 0.2.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.2)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
)

# Run the shared question and print the answer together with its sources.
response = query_engine.query(query)
pprint_response(response, show_source=True)

Final Response: The bracelet is made of silver, it was created by
Robert Davidson, and it was made in the year 1980.
______________________________________________________________________
Source Node 1/2
Node ID: 747acbbc-aae4-4185-88a9-e494065c0634
Similarity: 0.7925291996483909
Text: Visit Plan Your Visit (http://royalbcmuseum.bc.ca/visit) About
(http://royalbcmuseum.bc.ca/about) Support Us
(http://royalbcmuseum.bc.ca/support) Contact Us
(http://royalbcmuseum.bc.ca/contact) Tickets
(http://royalbcmuseum.bc.ca/tickets) What's On Exhibitions
(http://royalbcmuseum.bc.ca/exhibitions) Imax®
(http://royalbcmuseum.bc.ca/imax) Calen...
______________________________________________________________________
Source Node 2/2
Node ID: 7e2775dc-061f-4235-b4b0-636eab5fb48b
Similarity: 0.7611975805372408
Text: (/)PLAN YOUR VISIT Buy Tickets (http://royalbcmuseum.bc.‐
ca/visit/tickets-admission) Hours
(http://royalbcmuseum.bc.ca/visit/plan- your-visit/hours) & Location
(http://roy‐ albcmuseum.bc.ca/v

test the index performance

In [24]:
# Same check as before, run against whichever `index` is currently bound:
# top 3 matches with a similarity cutoff of 0.2.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.2)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
)

# Ask the shared question and show the answer plus the source nodes.
response = query_engine.query(query)
pprint_response(response, show_source=True)

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


Final Response: The bracelet is made of silver, it was created by the
artist Robert Davidson, and it was made in the year 1980.
______________________________________________________________________
Source Node 1/2
Node ID: 4591dc01-23a7-46aa-ae5e-b4b660694d28
Similarity: 0.6603788506952825
Text: Visit Plan Your Visit (http://royalbcmuseum.bc.ca/visit) About
(http://royalbcmuseum.bc.ca/about) Support Us
(http://royalbcmuseum.bc.ca/support) Contact Us
(http://royalbcmuseum.bc.ca/contact) Tickets
(http://royalbcmuseum.bc.ca/tickets) What's On Exhibitions
(http://royalbcmuseum.bc.ca/exhibitions) Imax®
(http://royalbcmuseum.bc.ca/imax) Calen...
______________________________________________________________________
Source Node 2/2
Node ID: 12ce35fc-1a0c-42fb-8a99-ef959f11cc39
Similarity: 0.620267265854705
Text: (/)PLAN YOUR VISIT Buy Tickets (http://royalbcmuseum.bc.‐
ca/visit/tickets-admission) Hours
(http://royalbcmuseum.bc.ca/visit/plan- your-visit/hours) & Location
(http://roy‐ albcmuse

### If you've already created and stored your embeddings, 

### you'll want to load them directly without loading your documents or creating a new VectorStoreIndex:


In [21]:
import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Reopen the on-disk Chroma database and its "bracelet" collection.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("bracelet")

# Wrap the collection as a vector store and attach it to a storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the stored vectors; no documents are read here.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
)

Test a `query_engine` built on the reloaded index, reusing the `query` defined earlier

In [22]:
# Query pipeline over the current `index`: top 3 matches, similarity cutoff
# of 0.2.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.2)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
)

# Re-run the shared question and print the answer with its source nodes.
response = query_engine.query(query)
pprint_response(response, show_source=True)

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


Final Response: The bracelet is made of silver and it was created by
the artist Robert Davidson in the year 1980.
______________________________________________________________________
Source Node 1/2
Node ID: 4591dc01-23a7-46aa-ae5e-b4b660694d28
Similarity: 0.6603788506952825
Text: Visit Plan Your Visit (http://royalbcmuseum.bc.ca/visit) About
(http://royalbcmuseum.bc.ca/about) Support Us
(http://royalbcmuseum.bc.ca/support) Contact Us
(http://royalbcmuseum.bc.ca/contact) Tickets
(http://royalbcmuseum.bc.ca/tickets) What's On Exhibitions
(http://royalbcmuseum.bc.ca/exhibitions) Imax®
(http://royalbcmuseum.bc.ca/imax) Calen...
______________________________________________________________________
Source Node 2/2
Node ID: 12ce35fc-1a0c-42fb-8a99-ef959f11cc39
Similarity: 0.620267265854705
Text: (/)PLAN YOUR VISIT Buy Tickets (http://royalbcmuseum.bc.‐
ca/visit/tickets-admission) Hours
(http://royalbcmuseum.bc.ca/visit/plan- your-visit/hours) & Location
(http://roy‐ albcmuseum.bc.ca/visit

<hr>

### Conclusion on Storing

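- Both the default storage and ChromaDB work, and both return the same source nodes in the same order.
- The reported **similarity** score is different, most likely because the two stores use different scoring: the ChromaDB values match exp(-d) with d = 2 × (1 - cosine similarity), e.g. exp(-2 × (1 - 0.7925)) ≈ 0.66.
- Default storage: 0.79
- ChromaDB: 0.66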