In [12]:
import os
import sys
import json
import logging
from typing import Dict

In [13]:
sys.path.append(os.path.dirname(os.getcwd()))
from configs import BASE_MAPPING_PATH, QANDA_FILE_READER_PATH, PIPELINE_FIELD_MAP
from client import (
    OsMlClientWrapper,
    get_client,
)
from data_process import QAndAFileReader
from mapping import get_base_mapping, mapping_update
from ml_models import get_remote_connector_configs, MlModel
from main import get_ml_model, load_category

In [14]:
def load_dataset_lexical(
    client: OsMlClientWrapper,
    pqa_reader: QAndAFileReader,
    config: Dict,
    delete_existing: bool,
    index_name: str,
):
    """Prepare a non-kNN index and load the configured categories into it.

    All index operations target ``config["index_name"]``. The ``index_name``
    argument is kept for backward compatibility; it is only compared against
    the config value so that a mismatch is reported instead of silently
    producing a misleading log line.

    Args:
        client: Wrapper whose ``delete_then_create_index`` and
            ``setup_without_kNN`` methods are called, and whose ``os_client``
            attribute is handed to ``load_category``.
        pqa_reader: Reader passed through to ``load_category``.
        config: Mapping that must provide ``"index_name"``,
            ``"index_settings"`` and ``"categories"`` (values are not all
            strings: categories is iterated and index_settings is passed on
            as the index body).
        delete_existing: When true, the index is deleted and recreated first.
        index_name: Expected index name; should equal ``config["index_name"]``.
    """
    target_index = config["index_name"]
    if index_name != target_index:
        logging.warning(
            f"index_name argument {index_name!r} differs from "
            f"config['index_name'] {target_index!r}; using the config value"
        )

    if delete_existing:
        # Log the index that is actually deleted, not the informational argument.
        logging.info(f"Deleting existing index {target_index}")
        client.delete_then_create_index(
            index_name=target_index, settings=config["index_settings"]
        )

    # NOTE(review): the recorded run log shows setup_without_kNN deleting the
    # index again right after delete_then_create_index; confirm whether the
    # explicit delete above is redundant.
    logging.info("Setting up without KNN")
    client.setup_without_kNN(
        index_name=target_index,
        index_settings=config["index_settings"],
    )

    for category in config["categories"]:
        load_category(
            client=client.os_client,
            pqa_reader=pqa_reader,
            category=category,
            config=config,
        )

In [15]:
# Tunable parameters for the lexical-search walkthrough.
host_type = "aos"
index_name = "lexical_search"
dataset_path = QANDA_FILE_READER_PATH
number_of_docs = 500
categories = ["sheet and pillowcase sets"]

# Cluster client and dataset reader; both are reused by later cells.
client = OsMlClientWrapper(get_client(host_type))
pqa_reader = QAndAFileReader(
    directory=dataset_path,
    max_number_of_docs=number_of_docs,
)

# Plain base mapping: no kNN settings are added for the lexical index.
config = dict(
    categories=categories,
    index_name=index_name,
    index_settings=get_base_mapping(BASE_MAPPING_PATH),
)

logging.info(f"Config:\n {json.dumps(config, indent=4)}")

# Recreate the index from scratch and load the selected categories.
load_dataset_lexical(
    client=client,
    pqa_reader=pqa_reader,
    config=config,
    delete_existing=True,
    index_name=index_name,
)

2025-04-04:08:59:05,168 INFO     [base.py:258] GET https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/ [status:200 request:0.056s]
2025-04-04:08:59:05,177 INFO     [base.py:258] GET https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/_plugins/_ml/model_groups/_search [status:200 request:0.009s]
2025-04-04:08:59:05,191 INFO     [3262137201.py:19] Config:
 {
    "categories": [
        "sheet and pillowcase sets"
    ],
    "index_name": "lexical_search",
    "index_settings": {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 2
            }
        },
        "mappings": {
            "properties": {
                "answer_aggregated": {
                    "type": "keyword"
                },
                "answers": {
                    "properties": {
                        "age": {
                            "typ

2025-04-04:08:59:05,564 INFO     [base.py:258] DELETE https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/lexical_search [status:200 request:0.349s]
2025-04-04:08:59:06,038 INFO     [base.py:258] PUT https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/lexical_search [status:200 request:0.473s]
2025-04-04:08:59:06,038 INFO     [os_ml_client_wrapper.py:100] idempotent_create_index response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'lexical_search'}
2025-04-04:08:59:06,039 INFO     [2292998341.py:14] Setting up without KNN
2025-04-04:08:59:06,045 INFO     [base.py:258] HEAD https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/lexical_search [status:200 request:0.006s]
2025-04-04:08:59:06,045 INFO     [os_ml_client_wrapper.py:115] Deleting index lexical_search
2025-04-04:08:59:06,387 INFO     [base.py:258] DELETE https://search-opensea

In [16]:
# Free-text question used by the search cells below.
query_text = "I wish to sleep well"

# Full-text `match` query against the `chunk` field.
search_query = {
    # Source filtering: return only the `chunk` field of each hit.
    # `includes` with a list is the documented form of this option.
    "_source": {"includes": ["chunk"]},
    "query": {"match": {"chunk": query_text}},
}

In [17]:
# Run the query and print every distinct chunk once, in the order returned.
search_results = client.os_client.search(index=index_name, body=search_query)
hits = search_results["hits"]["hits"]
hits = [hit["_source"]["chunk"] for hit in hits]
# De-duplicate with dict.fromkeys rather than set(): dict keys keep first-seen
# order, so the printed numbering follows the ranking of the search response
# instead of an arbitrary set ordering.
hits = list(dict.fromkeys(hits))
for i, hit in enumerate(hits):
    print(f"{i + 1}th search result:\n {hit}")

2025-04-04:08:59:08,377 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/lexical_search/_search [status:200 request:0.007s]


1th search result:
 Sweet Home Collection: Why Choose Us? At Sweet Home Collection, we strive to create products using the finest textiles possible at reasonable prices. Our 1500 Supreme Collection is a fine display of our commitment to quality and customer feedback. Our goal is to create repeat customers for years to come. With our new kids collection we offer contemporary designs that were created using color palettes that will complement you boys and girls bedroom. With a quick change of the bed sheets the room can go from a ballerina themed bedroom to a llama themed bedroom! If your switching to a boys room, for example, choose between the orange and yellow tones of the contruction sheet set or the blue and red color scheme found in the sports design. There is a wide variety of selection to choose from in order to meet as many needs as possible. We use fine microfiber as well, which is paving the way for a future of fine linens using the latest technology. Used across all industrie

In [18]:
def load_dataset_dense(
    client: OsMlClientWrapper,
    ml_model: MlModel,
    pqa_reader: QAndAFileReader,
    config: Dict,
    delete_existing: bool,
    index_name: str,
    pipeline_name: str,
):
    """Prepare a kNN-enabled index and load the configured categories into it.

    All index operations target ``config["index_name"]``. The ``index_name``
    argument is kept for backward compatibility; it is only compared against
    the config value so that a mismatch is reported instead of silently
    producing a misleading log line.

    Args:
        client: Wrapper whose ``delete_then_create_index`` and
            ``setup_for_kNN`` methods are called, and whose ``os_client``
            attribute is handed to ``load_category``.
        ml_model: Model object forwarded to ``setup_for_kNN``.
        pqa_reader: Reader passed through to ``load_category``.
        config: Mapping that must provide ``"index_name"``,
            ``"index_settings"``, ``"pipeline_field_map"``,
            ``"embedding_type"`` and ``"categories"`` (values are not all
            strings).
        delete_existing: When true, the index is deleted and recreated first;
            the flag is also forwarded to ``setup_for_kNN``.
        index_name: Expected index name; should equal ``config["index_name"]``.
        pipeline_name: Ingest pipeline name forwarded to ``setup_for_kNN``.
    """
    target_index = config["index_name"]
    if index_name != target_index:
        logging.warning(
            f"index_name argument {index_name!r} differs from "
            f"config['index_name'] {target_index!r}; using the config value"
        )

    if delete_existing:
        # Log the index that is actually deleted, not the informational argument.
        logging.info(f"Deleting existing index {target_index}")
        client.delete_then_create_index(
            index_name=target_index, settings=config["index_settings"]
        )

    logging.info("Setting up for KNN")
    client.setup_for_kNN(
        ml_model=ml_model,
        index_name=target_index,
        pipeline_name=pipeline_name,
        index_settings=config["index_settings"],
        pipeline_field_map=config["pipeline_field_map"],
        delete_existing=delete_existing,
        embedding_type=config["embedding_type"],
    )

    for category in config["categories"]:
        load_category(
            client=client.os_client,
            pqa_reader=pqa_reader,
            category=category,
            config=config,
        )

In [19]:
def create_index_settings(base_mapping_path, index_config):
    """Build index settings for a kNN index on top of the base mapping.

    The base mapping is loaded with ``get_base_mapping`` and then combined
    with kNN-specific overrides through ``mapping_update``. The return value
    of ``mapping_update`` is not used, so it is presumably an in-place merge
    into the first argument (verify against ``mapping.mapping_update``).

    Args:
        base_mapping_path: Location handed to ``get_base_mapping``.
        index_config: Mapping providing ``"pipeline_name"`` (used as the
            index default pipeline) and ``"model_dimensions"`` (used as the
            ``knn_vector`` dimension).

    Returns:
        The object returned by ``get_base_mapping`` after ``mapping_update``
        has been applied to it.
    """
    merged_settings = get_base_mapping(base_mapping_path)
    default_pipeline = index_config["pipeline_name"]
    vector_dimension = index_config["model_dimensions"]

    # Index-level switches: enable kNN and route documents through the pipeline.
    settings_overrides = {
        "index": {"knn": True},
        "default_pipeline": default_pipeline,
    }
    # Field overrides: raw text is kept but not indexed; its embedding is a knn_vector.
    property_overrides = {
        "chunk": {"type": "text", "index": False},
        "chunk_embedding": {
            "type": "knn_vector",
            "dimension": vector_dimension,
        },
    }

    mapping_update(
        merged_settings,
        {
            "settings": settings_overrides,
            "mappings": {"properties": property_overrides},
        },
    )
    return merged_settings

In [20]:
# Tunable parameters for the dense (neural) search walkthrough.
# `host_type`, `client` and `pqa_reader` come from the lexical setup cell.
model_type = "bedrock"
index_name = "dense_exact_search"
categories = ["sheet and pillowcase sets"]
pipeline_name = "amazon_pqa_pipeline"
embedding_type = "dense"
model_name = f"{host_type}_{model_type}"

config = {
    "with_knn": True,
    "pipeline_field_map": PIPELINE_FIELD_MAP,
    "categories": categories,
    "index_name": index_name,
    "pipeline_name": pipeline_name,
    "embedding_type": embedding_type,
}

# Connector settings for the chosen host/model pair, tagged with the model
# name and embedding type, then folded into the overall config.
model_config = get_remote_connector_configs(
    host_type=host_type,
    connector_type=model_type,
)
model_config["model_name"] = model_name
model_config["embedding_type"] = embedding_type
config.update(model_config)

ml_model = get_ml_model(
    host_type=host_type,
    model_type=model_type,
    model_config=model_config,
    client=client,
)

# Must run after config.update(model_config): create_index_settings reads
# config["model_dimensions"], which is not set in this cell and presumably
# comes from the connector config (verify).
config["index_settings"] = create_index_settings(
    base_mapping_path=BASE_MAPPING_PATH,
    index_config=config,
)

# Recreate the index with kNN enabled and load the selected categories.
load_dataset_dense(
    client=client,
    ml_model=ml_model,
    pqa_reader=pqa_reader,
    config=config,
    delete_existing=True,
    index_name=index_name,
    pipeline_name=pipeline_name,
)

2025-04-04:08:59:08,707 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/_plugins/_ml/connectors/_search [status:200 request:0.006s]
2025-04-04:08:59:08,714 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/_plugins/_ml/connectors/_search [status:200 request:0.006s]
2025-04-04:08:59:08,715 INFO     [ml_connector.py:38] Connector id 1I-E_pUBMwhocUpVazPb
2025-04-04:08:59:08,720 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/_plugins/_ml/models/_search [status:200 request:0.004s]
2025-04-04:08:59:08,726 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/_plugins/_ml/models/_search [status:200 request:0.006s]
2025-04-04:08:59:08,727 INFO     [ml_model.py:34] MlModel id twKE_pUBdIibWvQqbhr

KeyboardInterrupt: 

In [21]:
# Target the dense index explicitly so this cell does not depend on which
# setup cell last assigned `index_name`.
index_name = "dense_exact_search"

# Neural query: `query_text` is embedded with the given model and matched
# against the `chunk_embedding` vector field.
search_query = {
    # Source filtering: return only the `chunk` field of each hit.
    # `includes` with a list is the documented form of this option.
    "_source": {"includes": ["chunk"]},
    "query": {
        "neural": {
            "chunk_embedding": {
                "query_text": query_text,
                "model_id": ml_model.model_id(),
            }
        }
    },
}

In [22]:
# Run the neural query and print every distinct chunk once, in the order returned.
search_results = client.os_client.search(index=index_name, body=search_query)
hits = search_results["hits"]["hits"]
hits = [hit["_source"]["chunk"] for hit in hits]
# De-duplicate with dict.fromkeys rather than set(): dict keys keep first-seen
# order, so the printed numbering follows the ranking of the search response
# instead of an arbitrary set ordering.
hits = list(dict.fromkeys(hits))
for i, hit in enumerate(hits):
    print(f"{i + 1}th search result:\n {hit}")

2025-04-04:08:59:48,516 INFO     [base.py:258] POST https://search-opensearch-ml-quickstart-bpt2cjvxdhn7xan6ogr7pdd53m.us-west-2.es.amazonaws.com:443/dense_exact_search/_search [status:200 request:0.673s]


1th search result:
 HC CollectionUltimate blend of craftsmanship and elegance, our linens are designed to offer you a unique and sophisticated sleeping experience. 1800 Platinum CollectionLight to touch, cool and exceptional strength you can count on, our sheet sets assure you a great night’s sleep. Designed with simple sophistication and ability to retain its vibrancy these brushed microfiber collections are the ultimate for light weight, crisp sleeping comfort. BenefitsMicrofiber is the new favorite in the world of bed linens! Recent reports show microfiber is the current trend. 19 out of 25 home bed sheets today contain some form of improved fiber or weaving technology. Microfibers are the newest bedding fabric and is distinctive for its unique thinness and strength. These fibers are even thinner than most luxurious natural fibers such as silk. Woven tightly, it guarantees extra strength and durability. This fabric has desirable properties such as stain resistance, breathing ability