In [None]:
# Pinned third-party dependencies used by this notebook.
# Uncomment and run once if they are not installed in the kernel's environment
# (inside Jupyter, `%pip install ...` targets the running kernel's environment).
# !pip install llama_index==0.11.4
# !pip install docx2txt==0.8
# !pip install chromadb==0.5.5
# !pip install llama-index-vector-stores-chroma==0.2.0

In [1]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
import openai

import os

# Read the OpenAI key from the environment instead of hard-coding it here:
# a key pasted into a cell is saved with the notebook and leaks as soon as the
# file is shared or committed. Set it before starting Jupyter, e.g.
#   export OPENAI_API_KEY=sk-...
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

# Register gpt-4o-mini (temperature 0.2) as the LLM on the global Settings object.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

# Indexing

## VectorStoreIndex


In [40]:
from llama_index.core import Document
from llama_index.core import VectorStoreIndex

# Sample Vietnamese passage: one sentence about cats followed by a few facts
# about hydrochloric acid. The literal keeps its original line breaks and
# leading spaces.
text = """Mèo rất đáng yêu.
        Acid hydrochloric đậm đặc nhất có nồng độ tối đa là 40%.
        Ở dạng đậm đặc, acid này có thể tạo thành các sương mù acid,
        chúng đều có khả năng ăn mòn các mô con người, gây tổn thương cơ quan hô hấp,
        mắt, da và ruột."""
# Wrap the raw string in a LlamaIndex Document.
doc = Document(text=text)
# Build a vector index directly from the single document.
# NOTE(review): presumably this splits and embeds the text with the default
# embedding model, which would require a valid OpenAI key — confirm.
index = VectorStoreIndex.from_documents([doc])

In [41]:
# Display the vector store backing the index (a SimpleVectorStore in the
# recorded output). Its repr prints every stored embedding vector in full,
# so the cell output is very long.
index.vector_store

SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={'66128371-b22f-42d4-b531-e08037b7a147': [0.0034373453818261623, -0.017817014828324318, 0.01122822891920805, -0.051358748227357864, -0.02619919925928116, 0.006205415818840265, -0.014386167749762535, -0.003049100749194622, -0.040442414581775665, -0.00899947714060545, 0.01313208881765604, 0.043899256736040115, -0.0016260788543149829, -0.02050711214542389, -0.037167515605688095, 0.029682030901312828, 0.028902292251586914, 0.004135861061513424, 0.01676436886191368, -0.014139250852167606, 0.010766883380711079, 0.025536423549056053, 0.00029443236417137086, -0.0012004717718809843, -0.0040871272794902325, 0.037609368562698364, 0.00844066496938467, -0.009558289311826229, -0.010162586346268654, -0.01107877865433693, 0.03877897560596466, -0.008154761046171188, -0.023054257035255432, 0.006380856968462467, -0.014100263826549053, 0.00784936361014843, -0.015165906399488449, 0.0026787251699715853, 0

## Tạo index từ node

In [3]:
import nest_asyncio
# NOTE(review): presumably needed so that async code used by the ingestion
# pipeline in the next cell can run inside the notebook's already-running
# event loop — confirm.
nest_asyncio.apply()

In [5]:
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core import VectorStoreIndex


# Sample Vietnamese passage: one sentence about cats followed by a few facts
# about hydrochloric acid. The literal keeps its original line breaks and
# leading spaces.
text = """Mèo rất đáng yêu.
        Acid hydrochloric đậm đặc nhất có nồng độ tối đa là 40%.
        Ở dạng đậm đặc, acid này có thể tạo thành các sương mù acid,
        chúng đều có khả năng ăn mòn các mô con người, gây tổn thương cơ quan hô hấp,
        mắt, da và ruột."""

# Wrap the raw string in a LlamaIndex Document.
doc = Document(text=text)

# Create the pipeline; transformations are listed in the order split -> title -> embed.
pipeline = IngestionPipeline(
    transformations=[
        # Small chunks with no overlap so this short passage yields several nodes
        # (chunk_size is presumably measured in tokens — TODO confirm).
        SentenceSplitter(chunk_size=50, chunk_overlap=0),
        # NOTE(review): presumably asks the configured LLM for a title and stores
        # it in each node's metadata — confirm.
        TitleExtractor(),
        # NOTE(review): presumably attaches an OpenAI embedding to every node so
        # the index below does not have to embed them again — confirm.
        OpenAIEmbedding(),
    ]
)

# Run the pipeline on the single document; the result is the list of processed nodes.
nodes = pipeline.run(documents=[doc])

# Create the index from the prepared nodes instead of from raw documents.
index = VectorStoreIndex(nodes)

100%|██████████| 4/4 [00:01<00:00,  2.58it/s]


In [7]:
# Display the vector store backing the node-built index (a SimpleVectorStore in
# the recorded output). Its repr prints every stored embedding vector in full,
# so the cell output is very long.
index.vector_store

SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={'c5ad74e4-346e-49da-ab4a-4a87ef015dd7': [0.005797962658107281, 0.004194855224341154, 0.02869802713394165, -0.03391585126519203, -0.007799272425472736, 0.02238171547651291, -0.006961674429476261, 0.0010495720198377967, -0.013930214568972588, -0.011444883421063423, 0.019882652908563614, 0.0279840100556612, -0.004030081909149885, 0.0069513763301074505, -0.025430021807551384, 0.02203843742609024, 0.041770048439502716, 0.012845456600189209, 0.012708146125078201, -0.016806883737444878, -0.012172631919384003, 0.007044061552733183, 0.013065154664218426, 0.005001557990908623, -0.0157221257686615, 0.029137423262000084, 0.03652476146817207, -0.029521893709897995, 0.0068140653893351555, -0.03388838842511177, 0.031142165884375572, -0.0001878373441286385, -0.021722622215747833, -0.0009946475038304925, 0.011877412907779217, 0.012914112769067287, -0.029549356549978256, -0.014060660265386105, 0.0012

## Lưu trữ vector store index

- Lưu trữ index vào file
- Lưu trữ index vào database

### Vector Store

In [43]:
from llama_index.core import StorageContext, load_index_from_storage

# Persist the index's storage context to the local "index_cache" directory.
index.storage_context.persist(persist_dir="index_cache")

# Rebuild a storage context from that directory and load the saved index from it.
storage_context = StorageContext.from_defaults(persist_dir="index_cache")
reload_index = load_index_from_storage(storage_context=storage_context)

### Vector Database

In [45]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex, StorageContext, Document


# Open an on-disk Chroma database under "database" and get the
# "my_chroma_store" collection, creating it if it does not exist yet.
db = chromadb.PersistentClient(path="database")
chroma_collection = db.get_or_create_collection(name="my_chroma_store")

# Expose the collection as a LlamaIndex vector store and route index storage to it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Index a one-sentence document into the Chroma-backed store.
# NOTE(review): the collection is persistent, so re-running this cell
# presumably inserts the document again — confirm whether duplicates matter.
text = "I love my cat!"
doc = Document(text=text)
index = VectorStoreIndex.from_documents([doc], storage_context=storage_context)

In [46]:
# Build an index object on top of the existing Chroma-backed vector store
# instead of re-indexing documents.
# NOTE(review): storage_context was itself created from this same vector_store,
# so passing both looks redundant — confirm whether from_vector_store uses it.
reload_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context
)