In [1]:
import asyncio
import os
import re

import nest_asyncio
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
    Settings,
)
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.extractors.entity import EntityExtractor
from llama_index.llms.ollama import Ollama
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.vector_stores.redis import RedisVectorStore

# Patch asyncio so event loops can be nested; the notebook kernel already runs
# one, and async llama_index calls would otherwise fail to start their own.
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


resource module not available on Windows


In [2]:
# LLM served by a local Ollama instance. The Ollama server must be running and
# reachable before any cell that calls the LLM, otherwise those calls raise
# ConnectionError (see the recorded output of the ingestion cell).
llm = Ollama(model="gemma:2b", request_timeout=30.0)
# Hugging Face sentence-transformers model used to embed text chunks.
embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Register the models on llama_index's global Settings object so components
# that are not given an explicit llm / embed_model fall back to these.
Settings.llm = llm
Settings.embed_model = embed_model

In [4]:
%%time

# 2. Load documents from directory
input_dir = "./knowledge/"  # replace with your directory path
reader = SimpleDirectoryReader(input_dir=input_dir)
documents = reader.load_data()  # returns a list of Document objects

# 3. Build an ingestion pipeline with metadata extractors
# NOTE(review): the recorded run of this cell stopped at 0/157095 with an Ollama
# ConnectionError. The extractors below presumably call the LLM for every chunk,
# so confirm Ollama is running, and consider a small document sample first.
pipeline = IngestionPipeline(transformations=[
    SentenceSplitter(chunk_size=325, chunk_overlap=50),                       # split documents into overlapping chunks
    QuestionsAnsweredExtractor(questions=3),  # 3 questions each chunk can answer (questions only, not QA pairs)
    SummaryExtractor(summaries=["self"]),     # summary of the chunk itself
    KeywordExtractor(keywords=10),            # up to 10 keywords
])
# Run every transformation in order; the result is the list of enriched nodes.
nodes = pipeline.run(documents=documents)

# 4. Set up Redis vector store
#    Install and configure Redis server with RediSearch module
redis_host = "localhost"
redis_port = 6379
redis_password = None  # set if your Redis requires auth
index_name = "product_catalog_vdb"

# NOTE(review): verify these keyword arguments against the installed
# llama-index-vector-stores-redis version. The documented constructors take a
# connection via `redis_url` / `redis_client` (and, in newer releases, an index
# `schema` whose vector dims must match the embedding model) rather than
# host / port / password / embedding_function — TODO confirm and adapt.
vector_store = RedisVectorStore(
    host=redis_host,
    port=redis_port,
    password=redis_password,
    index_name=index_name,
    embedding_function=embed_model
)

# 5. Assemble the index on top of the external vector store
# The storage context tells the index to write embeddings into `vector_store`
# instead of the default in-memory store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# ServiceContext was never imported here and no longer exists in current
# llama_index; the embedding model is passed to the index directly instead.
# VectorStoreIndex has no `from_nodes` constructor — already-parsed nodes are
# handed straight to __init__ (from_documents is only for raw Documents).
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

# 6. Configure a BM25 (keyword-based) retriever on the ingested nodes
# BM25Retriever is built through from_defaults, and the number of passages is
# controlled by `similarity_top_k` (there is no `k` argument).
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,  # number of passages to retrieve
)

# 7. Wrap the BM25 retriever in a query engine
#    This engine is BM25-only; dense retrieval is served separately through
#    `index.as_query_engine(...)`. RetrieverQueryEngine.from_args is used
#    because the plain constructor does not accept an `llm` argument.
query_engine_bm25 = RetrieverQueryEngine.from_args(
    retriever=bm25_retriever,
    llm=llm,
)

# 8. Example query functions
def query_vector(query_text: str, top_k: int = 5):
    """Answer a question using dense (embedding) retrieval over the vector index.

    Args:
        query_text: Natural-language question to ask.
        top_k: Number of most similar chunks to retrieve as context.

    Returns:
        The response object produced by the index's query engine.
    """
    # The retriever's keyword is `similarity_top_k`; an unknown `k` keyword is
    # silently dropped, which left `top_k` without any effect.
    q_engine = index.as_query_engine(similarity_top_k=top_k)
    return q_engine.query(query_text)


def query_bm25(query_text: str):
    """Answer a question through the module-level BM25 query engine.

    Args:
        query_text: Natural-language question to ask.

    Returns:
        Whatever ``query_engine_bm25.query`` returns for the question.
    """
    response = query_engine_bm25.query(query_text)
    return response

# Usage examples:
# dense_resp = query_vector("Key feature of document X?")
# bm25_resp = query_bm25("Key feature of document X?")
# print(dense_resp)
# print(bm25_resp)


  0%|          | 0/157095 [00:09<?, ?it/s]


ConnectionError: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download

In [None]:
# Simple semantic query against the vector index with the default query engine.
# NOTE(review): relies on `index` from the index-building cell, whose recorded
# run failed — that cell must complete successfully before this one can run.
query_engine = index.as_query_engine()
result = query_engine.query("What is the main topic across these documents?")
print(result)

In [34]:
# Attribute names shared by both products of a pair; the frame stores each one
# twice, suffixed with "_left" and "_right".
_PRODUCT_FIELDS = (
    'id',
    'category',
    'cluster_id',
    'brand',
    'title',
    'description',
    'price',
    'specTableContent',
)

col_left = [f"{field}_left" for field in _PRODUCT_FIELDS]
col_right = [f"{field}_right" for field in _PRODUCT_FIELDS]

# Split the paired-product frame into its left-hand and right-hand products.
# NOTE(review): `df` is not created in any visible cell — it must be loaded
# before this cell runs (hidden notebook state).
df_left = df[col_left]
df_right = df[col_right]


def _format_product_line(row, side):
    """Render one product as a single line of text.

    Args:
        row: Row holding ``brand_<side>``, ``description_<side>``,
            ``price_<side>`` and ``specTableContent_<side>``.
        side: Column suffix selecting the product, ``'left'`` or ``'right'``.

    Returns:
        The combined fields with every run of whitespace (including embedded
        newlines) collapsed to a single space.
    """
    combined_data = (
        f"Brand: {row['brand_' + side]}, "
        f"Description: {row['description_' + side]}, "
        f"Price: {row['price_' + side]}, "
        f"Specs: {row['specTableContent_' + side]}"
    )
    # re.sub returns a new string; its result was previously discarded, so
    # multi-line descriptions were written verbatim and broke the
    # one-record-per-line layout of the output file.
    return re.sub(r'\s+', ' ', combined_data)


# Write every left-hand product first, then every right-hand one, one per line.
with open('combined_data.txt', 'w', encoding='utf-8') as file:
    for frame, side in ((df_left, 'left'), (df_right, 'right')):
        for _, row in frame.iterrows():
            file.write(_format_product_line(row, side) + '\n')

In [None]:
# Bundle the extracted metadata under one dict, keyed by metadata type.
# NOTE(review): extracted_titles, questions, summaries, keywords and entities
# are not defined in any visible cell — this cell fails on a fresh kernel
# unless they are produced earlier. TODO confirm where they come from.
metadata = {
    "titles": extracted_titles,
    "questions": questions,
    "summaries": summaries,
    "keywords": keywords,
    "entities": entities
}


In [None]:
# Alternative transformation list: default sentence splitting plus question,
# summary, keyword and entity extractors.
# NOTE(review): this list is not passed to any pipeline in the visible cells.
transformations = [
    SentenceSplitter(),
    QuestionsAnsweredExtractor(questions=3),
    # presumably summarises the previous chunk and the chunk itself — confirm
    SummaryExtractor(summaries=["prev", "self"]),
    KeywordExtractor(keywords=10),
    # presumably 0.5 is the minimum confidence for keeping an entity — confirm
    EntityExtractor(prediction_threshold=0.5),
]