In [1]:
# Pinned package versions; uncomment the lines below to install them.
# !pip install llama_index==0.11.4
# !pip install docx2txt==0.8
# !pip install chromadb==0.5.5
# !pip install llama-index-vector-stores-chroma==0.2.0

In [2]:
import os

from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
import openai

# Take the OpenAI API key from the environment rather than hardcoding a secret
# in the notebook, where it would be shared or committed along with the code.
# Set it before starting Jupyter, e.g.:  export OPENAI_API_KEY="sk-..."
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

# Default LLM for the rest of the notebook.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

# Indexing

## VectorStoreIndex

In [3]:
from llama_index.core import Document, VectorStoreIndex

# Sample passage: one unrelated sentence followed by statements about hydrochloric acid.
text = """Cats are cute.
          The most concentrated hydrochloric acid has a maximum concentration of 40%.
          In its concentrated form, this acid can produce acid mists, all of which are corrosive to human tissue, 
          causing damage to the respiratory system, eyes, skin, and itching. """

# Wrap the raw string in a Document, then build the index directly from that single document.
doc = Document(text=text)
index = VectorStoreIndex.from_documents(documents=[doc])

In [4]:
# Display the vector store behind the index; its repr includes an embedding_dict
# mapping IDs to the raw embedding vectors.
index.vector_store

SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={'b87dac78-6ee8-4c31-a793-9e6f07aba8af': [0.017988823354244232, 0.026263946667313576, 0.013541110791265965, -0.04445074126124382, 0.014346186071634293, 0.005866494961082935, -0.007437052205204964, -0.019282225519418716, 0.007357864640653133, -0.004282739013433456, 0.008948219008743763, 0.036004044115543365, -0.011158877983689308, 0.004147460218518972, -0.022410141304135323, 0.029906585812568665, 0.029325874522328377, 0.005955581087619066, -0.00415405910462141, -0.01233349647372961, -0.008149742148816586, 0.0004932738956995308, -0.008116747252643108, -0.020905574783682823, -0.018490346148610115, 0.016259891912341118, 0.02176344208419323, -0.015652785077691078, -0.0007263004081323743, -0.02375633455812931, 0.014900500886142254, -0.015085272490978241, -0.010888319462537766, 0.004579693544656038, -0.014187810942530632, 0.021209128201007843, -0.0017107860185205936, -0.008836036548018456, 

## Create index from nodes

In [5]:
# Patch asyncio so event loops can be nested (the notebook kernel already runs one).
# NOTE(review): presumably required by async work inside the ingestion pipeline in the next cell — confirm.
import nest_asyncio
nest_asyncio.apply()

In [6]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Same sample passage as before, used here as input to an explicit ingestion pipeline.
text = """Cats are cute.
          The most concentrated hydrochloric acid has a maximum concentration of 40%.
          In its concentrated form, this acid can produce acid mists, all of which are corrosive to human tissue, causing damage to the respiratory system, eyes, skin, and itching. """

doc = Document(text=text)

# Pipeline stages: sentence-based chunking (50-token chunks, no overlap),
# title extraction, then OpenAI embedding.
pipeline_steps = [
    SentenceSplitter(chunk_size=50, chunk_overlap=0),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline = IngestionPipeline(transformations=pipeline_steps)

# Turn the document into nodes by running it through the pipeline.
nodes = pipeline.run(documents=[doc])

# Build the index from the prepared nodes instead of from raw documents.
index = VectorStoreIndex(nodes)

# Show each resulting node (ID and text).
for node in nodes:
    print(node)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:01<00:00,  1.59it/s]


Node ID: 8a660b33-5407-4618-bdb1-a40751908e73
Text: Cats are cute.           The most concentrated hydrochloric acid
has a maximum concentration of 40%.
Node ID: d3f11923-6f17-4c31-b0f9-b16c9379b7d2
Text: In its concentrated form, this acid can produce acid mists, all
of which are corrosive to human tissue, causing damage to the
respiratory system, eyes, skin, and itching.


In [7]:
# Display the vector store behind the node-based index; the embedding_dict keys
# are the node IDs printed by the previous cell.
index.vector_store

SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={'8a660b33-5407-4618-bdb1-a40751908e73': [0.02688053622841835, 0.03097958117723465, 0.0194264966994524, -0.04637467488646507, 0.008921848610043526, 0.029437366873025894, -0.001681724563241005, -0.019494138658046722, -0.013440268114209175, -0.02107693813741207, 0.015543903224170208, 0.030925467610359192, -0.01074815634638071, 0.013758180662989616, -0.0284633357077837, 0.030573735013604164, 0.031980667263269424, 0.00116004329174757, -0.002634617267176509, -0.006425895728170872, -0.027381079271435738, 0.003960380796343088, -0.012926196679472923, -0.017803113907575607, -0.01651793345808983, 0.01619325764477253, 0.0338205024600029, -0.028652731329202652, 0.0014931752812117338, -0.025365376845002174, 0.009375043213367462, 0.005864475388079882, -0.011776299215853214, -0.01261504739522934, 0.006300759967416525, 0.012466237880289555, -0.011715422384440899, -0.00881362333893776, 0.005624349694

## Persist the vector store index

- Persist the index to files on disk
- Store the index in a vector database

### Vector Store

In [8]:
from llama_index.core import StorageContext, load_index_from_storage

# Single place for the directory used by both the save and the load step.
PERSIST_DIR = "index_cache"

# Write the index's storage context out to that directory.
index.storage_context.persist(persist_dir=PERSIST_DIR)

# Build a storage context from the persisted files and load the index back from it.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
reload_index = load_index_from_storage(storage_context)


### Vector Database

In [9]:
import chromadb
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Open a persistent Chroma client at ./database and fetch (or create) the collection.
db = chromadb.PersistentClient(path="database")
chroma_collection = db.get_or_create_collection("my_chroma_store")

# Wrap the collection as a LlamaIndex vector store and build a storage context around it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Index a one-sentence document using the Chroma-backed storage context.
text = "I love my cat!"
doc = Document(text=text)
index = VectorStoreIndex.from_documents(documents=[doc], storage_context=storage_context)

In [10]:
# Build an index on top of the existing Chroma-backed vector store; no documents are passed in.
reload_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)