# Background
* Objective: Understand how data is ingested into a RAG (retrieval-augmented generation) pipeline

# Prepare documents/nodes

In [1]:
# Status line marking the start of the walkthrough.
# NOTE(review): "inject" is probably meant to be "ingest" (the notebook objective is data ingestion) — confirm before changing the message.
print("Going to inject the data")

Going to inject the data


In [2]:
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.readers.file.unstructured import UnstructuredReader
from llama_index.core.settings import Settings


In [3]:
# Reader handed to SimpleDirectoryReader for every ".html" file it finds.
parser = UnstructuredReader()

# Forward slashes resolve on Windows as well as on POSIX systems; the former
# backslash-separated literal only resolved on Windows.
dir_reader = SimpleDirectoryReader(
    input_dir="../data/llamaindex-docs/",
    file_extractor={".html": parser},
)

documents = dir_reader.load_data()

# Chunking configuration used when documents are split into nodes.
node_parser = SimpleNodeParser.from_defaults(
    chunk_size=500,
    chunk_overlap=50,
)

# No explicit node_parser.get_nodes_from_documents(documents=documents) call is
# made here: the index construction step parses the documents into nodes itself.

# Register the parser through the public Settings property rather than the
# private `_node_parser` attribute, which is an implementation detail.
Settings.node_parser = node_parser

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


# Model settings

In [4]:

from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [5]:
# Chat model served by Ollama; temperature=0 is passed through to the model.
llm = Ollama(model='mistral', temperature=0)

# Embedding model, also served by Ollama.
embedding_model = OllamaEmbedding(
    model_name='mxbai-embed-large'
)

# Assign through the public Settings properties instead of the private
# `_embed_model` / `_llm` attributes, so the library's own setters run and the
# notebook does not depend on internal attribute names.
Settings.embed_model = embedding_model
Settings.llm = llm

# Create vector store

In [6]:
import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

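# Uncomment the two lines below if you don't want to send telemetry data
# NOTE(review): Chroma's documented opt-out variable appears to be ANONYMIZED_TELEMETRY=False — confirm that CHROMA_TELEMETRY_ENABLED is honoured
# import os
# os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"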

In [7]:
# Persistent Chroma client rooted at ./data/6_chroma.
# NOTE(review): the source documents are read from a path under "..", while this path is under "." — confirm the intended location.
chroma_client = chromadb.PersistentClient(path="./data/6_chroma")
# Fetch the collection named "chromadb", creating it when it does not exist yet.
Chroma_collections = chroma_client.get_or_create_collection(name="chromadb")
# Wrap the collection in the llama-index vector store adapter.
chroma_store = ChromaVectorStore(chroma_collection=Chroma_collections)

# Create index

In [8]:
# Storage context whose vector store is `chroma_store`; no other storage
# component is passed, so from_defaults decides the rest.
storage_context = StorageContext.from_defaults(vector_store=chroma_store)

In [9]:
# The Chroma collection is persisted on disk, so calling from_documents on
# every run would embed the corpus again and append it to the same collection.
# Only ingest when the collection is still empty; otherwise attach the index
# to the vectors that are already stored.
# NOTE: a non-empty collection is treated as fully ingested. After an
# interrupted run, delete the persisted data before re-running this cell.
if Chroma_collections.count() == 0:
    storage_index = VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        show_progress=True,
    )
else:
    storage_index = VectorStoreIndex.from_vector_store(
        vector_store=chroma_store,
    )

Parsing nodes:   0%|          | 0/324 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1993 [00:00<?, ?it/s]

# Create query engine

In [10]:
# Build a query engine from the index; no options are passed, so the library defaults apply.
engine = storage_index.as_query_engine()

In [11]:
# Send one sample question through the query engine and print the response.
question = "What is llama index?"
res = engine.query(question)
print(res)

INFO:backoff:Backing off send_request(...) for 0.1s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Read timed out. (read timeout=15))


 LlamaIndex appears to be a toolkit that assists in preparing a knowledge base using various data connectors and indexes. It helps in ingesting data from different sources, parsing them into simple Document representations, and indexing the data for easy retrieval. In the querying stage, it retrieves the most relevant context given a user query to provide up-to-date knowledge that is not in its original training data, reducing hallucination. It also offers composable modules for building RAG pipelines for Q&A, chatbots, or agents.
