In [22]:
# !pip install llama_index==0.11.4
# !pip install PyYAML
# !pip install plotly==5.24.0
# !pip install docx2txt==0.8
# !pip install chromadb==0.5.5
# !pip install llama-index-vector-stores-chroma==0.2.0
# !pip install llama-index-extractors-entity==0.2.0
# !pip install llama-index-readers-web==0.2.2
# !pip install transformers==4.40.2
# !pip install nest_asyncio
# !pip install chromadb


In [23]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
import openai

openai.api_key = "sk-proj-your-openai-api-key"
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

In [25]:
import chromadb
import uuid
import nest_asyncio
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

# Ensure nest_asyncio is applied to handle async operations in certain environments like Jupyter
nest_asyncio.apply()

# Initialize Chroma client and create a collection (use get_or_create to avoid duplication errors)
chroma_client = chromadb.Client()
collection = chroma_client.create_collection("documents", get_or_create=True)

# Define the document text
text = """Cats are cute.
          The most concentrated hydrochloric acid has a maximum concentration of 40%.
          In its concentrated form, this acid can produce acid mists, all of which are corrosive to human tissue, 
          causing damage to the respiratory system, eyes, skin, and itching."""
doc = Document(text=text)

# Create the pipeline with transformations for text processing and embedding
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=50, chunk_overlap=0),
        TitleExtractor(),
        OpenAIEmbedding(),
    ]
)

# Run the pipeline to generate nodes with embeddings from the document
nodes = pipeline.run(documents=[doc])

# Add embeddings and document text to the Chroma collection
for node in nodes:
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each chunk
    collection.add(ids=[unique_id], embeddings=[node.embedding], documents=[node.text])

# Function to generate embedding for the query text using the pipeline
def generate_query_embedding(query_text):
    # Create a document for the query text
    query_doc = Document(text=query_text)
    # Run it through the pipeline
    query_nodes = pipeline.run(documents=[query_doc])
    # Return the embedding from the first node
    return query_nodes[0].embedding

# Example retrieval: query for similar documents
query_text = "Effects of hydrochloric acid on human tissue"
query_embedding = generate_query_embedding(query_text)
results = collection.query(embedding=query_embedding, top_k=2)

# Display retrieved results
for result in results['documents']:
    print(f"Retrieved Document: {result}")


100%|██████████| 2/2 [00:00<00:00,  2.09it/s]
100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


TypeError: Collection.query() got an unexpected keyword argument 'embedding'

In [None]:
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
)

response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    verbose=True
)

pp = SimilarityPostprocessor(similarity_cutoff=0.5)

In [None]:
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[pp]
)

In [None]:
response = query_engine.query(
    "Cats are cute, aren't they?"
)
print(response)