In [None]:
import os
import sys
import logging
import pandas as pd
from typing import Dict

In [None]:
# Put the parent of the current working directory on the import path.
# NOTE(review): the modules imported below are presumably project-local and
# live in that parent directory — this only works when the kernel's working
# directory is the notebook's own folder; confirm.
sys.path.append(os.path.dirname(os.getcwd()))
from configs import get_remote_connector_configs, BASE_MAPPING_PATH, QANDA_FILE_READER_PATH, PIPELINE_FIELD_MAP
from client import (
    get_client,
    load_category,
    OsMlClientWrapper,
)
from data_process import QAndAFileReader
from mapping import get_base_mapping, mapping_update
from ml_models import get_ml_model, MlModel

In [None]:
def load_dataset_lexical(
    client: OsMlClientWrapper,
    pqa_reader: QAndAFileReader,
    config: Dict[str, object],
    delete_existing: bool,
    index_name: str,
):
    """Set up an index without kNN and load the configured categories into it.

    Args:
        client: Wrapper whose ``delete_then_create_index`` and
            ``setup_without_kNN`` methods manage the index, and whose
            ``os_client`` attribute is handed to ``load_category``.
        pqa_reader: Reader forwarded to ``load_category`` for every category.
        config: Must provide ``"index_name"``, ``"index_settings"`` and
            ``"categories"``. The whole mapping is also forwarded to
            ``load_category``.
        delete_existing: When true, ``client.delete_then_create_index`` is
            called before the index is set up.
        index_name: Kept for backward compatibility. Every index operation
            uses ``config["index_name"]``; this argument is only compared
            against it so that a mismatch gets reported.

    Raises:
        KeyError: If ``config`` lacks one of the required keys.
    """
    # The index that is actually touched comes from the config, so that is the
    # name that must appear in the log.
    target_index = config["index_name"]
    if index_name != target_index:
        logging.warning(
            f"index_name argument {index_name!r} differs from "
            f"config['index_name'] {target_index!r}; using the config value"
        )

    if delete_existing:
        logging.info(f"Deleting existing index {target_index}")
        client.delete_then_create_index(
            index_name=target_index, settings=config["index_settings"]
        )

    logging.info("Setting up without KNN")
    client.setup_without_kNN(
        index_name=target_index,
        index_settings=config["index_settings"],
    )

    for category in config["categories"]:
        load_category(
            client=client.os_client,
            pqa_reader=pqa_reader,
            category=category,
            config=config,
        )

In [None]:
def load_dataset_dense(
    client: OsMlClientWrapper,
    ml_model: MlModel,
    pqa_reader: QAndAFileReader,
    config: Dict[str, object],
    delete_existing: bool,
    index_name: str,
    pipeline_name: str,
):
    """Set up an index for kNN search and load the configured categories into it.

    Args:
        client: Wrapper whose ``delete_then_create_index`` and
            ``setup_for_kNN`` methods manage the index, and whose
            ``os_client`` attribute is handed to ``load_category``.
        ml_model: Model object forwarded to ``client.setup_for_kNN``.
        pqa_reader: Reader forwarded to ``load_category`` for every category.
        config: Must provide ``"index_name"``, ``"index_settings"``,
            ``"pipeline_field_map"``, ``"embedding_type"`` and
            ``"categories"``. The whole mapping is also forwarded to
            ``load_category``.
        delete_existing: When true, ``client.delete_then_create_index`` is
            called first; the flag is additionally forwarded to
            ``client.setup_for_kNN``.
        index_name: Kept for backward compatibility. Every index operation
            uses ``config["index_name"]``; this argument is only compared
            against it so that a mismatch gets reported.
        pipeline_name: Pipeline name forwarded to ``client.setup_for_kNN``.

    Raises:
        KeyError: If ``config`` lacks one of the required keys.
    """
    # The index that is actually touched comes from the config, so that is the
    # name that must appear in the log.
    target_index = config["index_name"]
    if index_name != target_index:
        logging.warning(
            f"index_name argument {index_name!r} differs from "
            f"config['index_name'] {target_index!r}; using the config value"
        )

    if delete_existing:
        logging.info(f"Deleting existing index {target_index}")
        client.delete_then_create_index(
            index_name=target_index, settings=config["index_settings"]
        )

    logging.info("Setting up for KNN")
    client.setup_for_kNN(
        ml_model=ml_model,
        index_name=target_index,
        pipeline_name=pipeline_name,
        index_settings=config["index_settings"],
        pipeline_field_map=config["pipeline_field_map"],
        delete_existing=delete_existing,
        embedding_type=config["embedding_type"],
    )

    for category in config["categories"]:
        load_category(
            client=client.os_client,
            pqa_reader=pqa_reader,
            category=category,
            config=config,
        )

In [None]:
def create_dense_index_settings(base_mapping_path, index_config):
    """Return the base mapping combined with the kNN-specific index settings.

    The base mapping is obtained from ``get_base_mapping(base_mapping_path)``.
    A kNN overlay is then built that turns on ``index.knn``, sets
    ``default_pipeline`` to ``index_config["pipeline_name"]``, marks ``chunk``
    as non-indexed text and declares ``chunk_embedding`` as a ``knn_vector``
    whose dimension is ``index_config["model_dimensions"]``. Both are handed
    to ``mapping_update``; its return value is ignored and the base mapping
    object is returned, so the merge is presumably done in place.
    """
    base_settings = get_base_mapping(base_mapping_path)

    default_pipeline = index_config["pipeline_name"]
    embedding_dimension = index_config["model_dimensions"]

    index_level_settings = {
        "index": {"knn": True},
        "default_pipeline": default_pipeline,
    }
    field_properties = {
        "chunk": {"type": "text", "index": False},
        "chunk_embedding": {
            "type": "knn_vector",
            "dimension": embedding_dimension,
        },
    }
    knn_overlay = {
        "settings": index_level_settings,
        "mappings": {"properties": field_properties},
    }

    mapping_update(base_settings, knn_overlay)
    return base_settings

In [None]:
# Run-wide configuration shared by every cell below.
# host_type selects the client/connector flavour passed to get_client() and
# get_remote_connector_configs().
# NOTE(review): "aos" presumably means Amazon OpenSearch Service — confirm.
host_type = "aos"
# Passed to the reader as max_number_of_docs.
number_of_docs = 500
dataset_path = QANDA_FILE_READER_PATH
# Categories loaded into both the lexical and the dense index.
categories = ["sheet and pillowcase sets"]

client = OsMlClientWrapper(get_client(host_type))

pqa_reader = QAndAFileReader(directory=dataset_path, max_number_of_docs=number_of_docs)

In [None]:
# Lexical index: uses the base mapping as-is, with no kNN settings added.
lexical_index_name = "lexical_search"

lexical_index_config = dict(
    categories=categories,
    index_name=lexical_index_name,
    index_settings=get_base_mapping(BASE_MAPPING_PATH),
)

# Recreate the index from scratch and load every configured category.
load_dataset_lexical(
    client=client,
    pqa_reader=pqa_reader,
    config=lexical_index_config,
    delete_existing=True,
    index_name=lexical_index_name,
)

In [None]:
# Identifiers for the dense (embedding-based) index, its model and its pipeline.
embedding_type = "dense"
dense_model_type = "sagemaker"
dense_index_name = "dense_search"
dense_model_name = "Sagemaker Dense Model"
dense_pipeline_name = "dense_embedding_pipeline"

# Index-side configuration; "index_settings" is filled in further down, once
# the model configuration has been merged in.
dense_index_config = {
    "with_knn": True,
    "categories": categories,
    "index_name": dense_index_name,
    "embedding_type": embedding_type,
    "pipeline_name": dense_pipeline_name,
    "pipeline_field_map": PIPELINE_FIELD_MAP,
}

# Connector configuration for this host/model type; the two keys below are
# then set on the returned dict.
dense_model_config = get_remote_connector_configs(
    host_type=host_type, connector_type=dense_model_type
)
dense_model_config["model_name"] = dense_model_name
dense_model_config["embedding_type"] = embedding_type

# Merge the model configuration into the index configuration.
# create_dense_index_settings() reads "model_dimensions" from
# dense_index_config, and nothing in this cell sets that key directly, so it
# presumably arrives through this merge — TODO confirm against
# get_remote_connector_configs().
dense_index_config.update(dense_model_config)

dense_model = get_ml_model(
    host_type=host_type,
    model_type=dense_model_type,
    model_config=dense_model_config,
    os_client=client.os_client,
    ml_commons_client=client.ml_commons_client,
    model_group_id=client.ml_model_group.model_group_id(),
)

# Must run after the update() above so the settings builder sees the merged
# configuration.
dense_index_config["index_settings"] = create_dense_index_settings(
    base_mapping_path=BASE_MAPPING_PATH,
    index_config=dense_index_config,
)

# Recreate the dense index and load every configured category into it.
load_dataset_dense(
    client,
    dense_model,
    pqa_reader,
    dense_index_config,
    delete_existing=True,
    index_name=dense_index_name,
    pipeline_name=dense_pipeline_name,
)

In [None]:
query_text = "The most beautiful pillow"
search_size = 50

# Lexical query: a plain `match` on the `chunk` field, returning only that field.
lexical_search_query = dict(
    size=search_size,
    _source={"include": "chunk"},
    query={"match": {"chunk": query_text}},
)

# Dense query: a `neural` clause on `chunk_embedding`, using the id reported by
# the dense model and asking for as many neighbours as hits.
dense_search_query = dict(
    size=search_size,
    _source={"include": "chunk"},
    query={
        "neural": {
            "chunk_embedding": dict(
                k=search_size,
                query_text=query_text,
                model_id=dense_model.model_id(),
            )
        }
    },
)

In [None]:
lexical_search_results = client.os_client.search(
    index=lexical_index_name, body=lexical_search_query
)

dense_search_results = client.os_client.search(
    index=dense_index_name, body=dense_search_query
)

# The queries specify no sort, so hits arrive in the search engine's default
# ranking order. Duplicate chunks are dropped with dict.fromkeys, which keeps
# first-seen order; set() would scramble the ranking and make the later
# "top 10" slices arbitrary.
lexical_hits = list(
    dict.fromkeys(
        hit["_source"]["chunk"] for hit in lexical_search_results["hits"]["hits"]
    )
)

dense_hits = list(
    dict.fromkeys(
        hit["_source"]["chunk"] for hit in dense_search_results["hits"]["hits"]
    )
)

In [None]:
# Number of rows in the comparison table; matches the "top 10" column titles.
top_k = 10

top10_dense_hits, top10_lexical_hits = dense_hits[:top_k], lexical_hits[:top_k]

# pd.DataFrame raises ValueError when the dict's columns differ in length,
# which happens whenever either search yields fewer than top_k unique chunks.
# Pad the shorter column(s) with empty strings so the table always has exactly
# top_k rows; when both searches return enough hits nothing changes.
df = pd.DataFrame(
    {
        "top 10 dense search results": top10_dense_hits
        + [""] * (top_k - len(top10_dense_hits)),
        "top 10 lexical search results": top10_lexical_hits
        + [""] * (top_k - len(top10_lexical_hits)),
    }
)

In [None]:
from IPython.display import display, HTML

# Build a captioned, styled view of the comparison table.
styled_df = (
    df.style.set_caption(f"Search Results from query: {query_text}")
    .set_table_styles(
        [
            {
                "selector": "caption",
                "props": [
                    ("text-align", "center"),
                    ("font-size", "20px"),
                    ("font-weight", "bold"),
                    ("color", "#333333"),
                    ("background-color", "#f0f0f0"),
                    ("padding", "10px"),
                ],
            },
            {
                "selector": "th",
                "props": [
                    ("font-size", "16px"),
                    ("text-align", "center"),
                    ("background-color", "#4472C4"),
                    ("color", "white"),
                    ("font-weight", "bold"),
                    ("padding", "10px"),
                ],
            },
            {
                "selector": "td",
                "props": [
                    ("font-size", "14px"),
                    ("padding", "8px"),
                    ("border", "1px solid #ddd"),
                ],
            },
            {
                "selector": "tr:nth-of-type(odd)",
                "props": [("background-color", "#f9f9f9")],
            },
        ]
    )
    # Upper-case the column headers.
    .format_index(str.upper, axis=1)
    # Show 1-based rank labels. The label list is derived from the frame's
    # actual row count because relabel_index raises when the number of labels
    # does not match the number of rows.
    .relabel_index([str(num) for num in range(1, len(df) + 1)], axis=0)
    .set_properties(**{"max-width": "800px", "white-space": "pre-wrap"})
)

# Widen rendered tables, then display the styled DataFrame.
display(HTML("<style>.rendered_html table {font-size: 16px; width: 100%;}</style>"))
display(styled_df)