## LLamaindex - solution 3 

In [2]:
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine

In [3]:
loader = SimpleDirectoryReader(input_files=[".\data\llama2.pdf"])
docs = loader.load_data()
doc_text = "\n\n".join(d.get_content() for d in docs)
docs = [Document(text=doc_text)]

In [4]:
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(docs)
for idx, node in enumerate(nodes):
    node.id_ = f"node-{idx}"

In [5]:
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [None]:
index = VectorStoreIndex(nodes)

## ChromaDB

In [11]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# load some documents
# I have nodes already 
# documents = SimpleDirectoryReader(r".\data\llama2.pdf").load_data()

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db")

# create collection
chroma_collection = db.get_or_create_collection("quickstart")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# create your index
index = VectorStoreIndex(
    nodes, storage_context=storage_context, embed_model=HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    # documents, storage_context=storage_context
)

Insert of existing embedding ID: node-0
Insert of existing embedding ID: node-1
Insert of existing embedding ID: node-2
Insert of existing embedding ID: node-3
Insert of existing embedding ID: node-4
Insert of existing embedding ID: node-5
Insert of existing embedding ID: node-6
Insert of existing embedding ID: node-7
Insert of existing embedding ID: node-8
Insert of existing embedding ID: node-9
Insert of existing embedding ID: node-10
Insert of existing embedding ID: node-11
Insert of existing embedding ID: node-12
Insert of existing embedding ID: node-13
Insert of existing embedding ID: node-14
Insert of existing embedding ID: node-15
Insert of existing embedding ID: node-16
Insert of existing embedding ID: node-17
Insert of existing embedding ID: node-18
Insert of existing embedding ID: node-19
Insert of existing embedding ID: node-20
Insert of existing embedding ID: node-21
Insert of existing embedding ID: node-22
Insert of existing embedding ID: node-23
Insert of existing embeddi

### Querying

In [13]:
from llama_index.core.retrievers import VectorIndexRetriever

In [19]:
retriever = VectorIndexRetriever(index=index, similarity_top_k=1)
nodes = retriever.retrieve("Can you tell me about the key concepts for safety finetuning")
nodes[0].get_content()

'Further\ntesting and mitigation should be done to understand bias and other social issues for the specific context\nin which a system may be deployed. For this, it may be necessary to test beyond the groups available in\nthe BOLD dataset (race, religion, and gender). As LLMs are integrated and deployed, we look forward to\ncontinuing research that will amplify their potential for positive impact on these important social issues.\n4.2 Safety Fine-Tuning\nIn this section, we describe our approach to safety fine-tuning, including safety categories, annotation\nguidelines, and the techniques we use to mitigate safety risks. We employ a process similar to the general\nfine-tuning methods as described in Section 3, with some notable differences related to safety concerns.\nSpecifically, we use the following techniques in safety fine-tuning:\n1. Supervised Safety Fine-Tuning: We initialize by gathering adversarial prompts and safe demonstra-\ntions that are then included in the general super