#### `Imports`

In [1]:
import json
import logging
import os

import ollama
# Configure the root logger once for the whole notebook: INFO level with a
# "timestamp - level - message" layout. Because this is the root configuration,
# INFO records emitted by other libraries are shown as well (see the
# "HTTP Request" lines in the captured run output of the main cell).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

#### `MAIN query_embedding`

In [7]:
# Entry point: runs the whole load -> embed -> write pipeline.
# NOTE(review): the functions used here are defined in the "Methods" cells further
# down, so those cells must be executed before this one; a top-to-bottom run on a
# fresh kernel would raise NameError here. Consider moving this cell to the end.
query_embedding()

2025-04-07 11:39:05,728 - INFO - Started query_embedding.
2025-04-07 11:39:05,730 - INFO - Started load_query_data for migrated queries from path ../Query Processing/4_json_results/migrated_query_data.json.
2025-04-07 11:39:05,735 - INFO - Completed load_query_data for migrated queries.
2025-04-07 11:39:05,736 - INFO - Started embed_query_data for migrated queries.
2025-04-07 11:39:08,347 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:39:08,960 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:39:09,980 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:39:10,641 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:39:13,291 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:39:14,986 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-04-07 11:

#### `Methods`

##### `query_embedding`

In [2]:
def query_embedding():
    """
    Run the embedding pipeline for every enabled query set.

    Parameters:
    None

    Returns:
    None

    Function Workflow:
    1. Log the start of the function execution.
    2. For each enabled (path, identifier) pair:
        - Load the query metadata from the JSON file with load_query_data.
        - Add embeddings to the metadata with embed_query_data.
        - Write the result to disk with write_query_data_to_json.
    3. Log that the queries were embedded.
    """
    logging.info(f"Started {query_embedding.__name__}.")

    # One (input path, identifier) pair per query set; the identifier is used in
    # log messages and in the name of the output file.
    query_sets = (
        ("../Query Processing/4_json_results/migrated_query_data.json", "migrated"),
        # Currently disabled; uncomment to also embed the "new" query set.
        # ("../Query Processing/4_json_results/new_query_data.json", "new"),
    )

    for query_path, query_identifier in query_sets:
        query_data = load_query_data(query_path, query_identifier)
        query_data_embedded = embed_query_data(query_data, query_identifier)
        write_query_data_to_json(query_data_embedded, query_identifier)

    logging.info("Queries successfully embedded.")

##### `load_query_data`

In [None]:
def load_query_data(PATH, QUERY_IDENTIFIER):
    """
    Load query metadata from a JSON file and log the process.

    Parameters:
    PATH (str): The file path to the JSON file containing the query metadata.
    QUERY_IDENTIFIER (str): A string identifier used to label the type of queries being processed.

    Returns:
    dict: The query metadata loaded from the JSON file (whatever top-level value the file holds;
          the rest of the notebook treats it as a dictionary).

    Raises:
    FileNotFoundError: If PATH does not exist.
    json.JSONDecodeError: If the file does not contain valid JSON.

    Function Workflow:
    1. Log the start of the function execution, including the QUERY_IDENTIFIER and PATH.
    2. Open the JSON file specified by PATH as UTF-8 and load its contents.
    3. Log the completion of the function execution.
    4. Return the loaded query metadata.
    """
    logging.info(f"Started {load_query_data.__name__} for {QUERY_IDENTIFIER} queries from path {PATH}.")

    # Read explicitly as UTF-8: the platform default encoding is locale dependent,
    # and the writer in this notebook emits UTF-8 with non-ASCII characters unescaped.
    with open(PATH, encoding="utf-8") as f:
        query_meta_data = json.load(f)

    logging.info(f"Completed {load_query_data.__name__} for {QUERY_IDENTIFIER} queries.")
    return query_meta_data

##### `embed_query_data`

In [None]:
def embed_query_data(query_meta_data, QUERY_IDENTIFIER, model="mxbai-embed-large"):
    """
    Embed query metadata using an Ollama embedding model and store the result on each query.

    Parameters:
    query_meta_data (dict): A dictionary containing query metadata. Each key-value pair represents a query and its associated metadata.
    QUERY_IDENTIFIER (str): A string identifier used to label the type of queries being processed.
    model (str): Name of the Ollama embedding model to use. Defaults to "mxbai-embed-large".

    Returns:
    dict: The same dictionary object that was passed in; every query's metadata now has an
          additional "embedding" entry. The input is modified in place.

    Function Workflow:
    1. Log the start of the function execution, including the QUERY_IDENTIFIER.
    2. Iterate over each key-value pair in the query_meta_data dictionary:
        - Join the query metadata into a single string using the join_query_meta_data function.
        - Request embeddings for that string from the given model.
        - Store the returned embeddings under the "embedding" key of the query's metadata.
    3. Log the completion of the function execution.
    4. Return the updated query metadata dictionary.
    """
    logging.info(f"Started {embed_query_data.__name__} for {QUERY_IDENTIFIER} queries.")

    for query_value in query_meta_data.values():
        joined_query_data = join_query_meta_data(query_value)
        response = ollama.embed(model=model, input=joined_query_data)
        # The full "embeddings" value of the response is stored unchanged.
        # NOTE(review): for a single input string this is presumably a list holding
        # one vector -- confirm against the Ollama embed API before consuming it.
        query_value["embedding"] = response["embeddings"]

    logging.info(f"Completed {embed_query_data.__name__} for {QUERY_IDENTIFIER} queries.")
    return query_meta_data

##### `join_query_meta_data`

In [None]:
def join_query_meta_data(query_value):
    """
    Join all elements of the query metadata into a single string.

    Parameters:
    query_value (dict): A dictionary containing metadata for a query. The values can be lists or individual elements.

    Returns:
    str: A single string containing all elements of the query metadata, separated by ", ".
         An empty dictionary yields an empty string.

    Function Workflow:
    1. Initialize an empty list to store all elements.
    2. Iterate over the values of the query_value dictionary (the keys are not used):
        - If the value is a list, take each of its items.
        - If the value is not a list, take the value itself.
        - Skip None entries and convert every remaining element to a string.
    3. Join all elements into a single string, separated by ", ".
    4. Return the resulting string.
    """
    all_elements = []
    for value in query_value.values():
        items = value if isinstance(value, list) else [value]
        # Data loaded from JSON may hold numbers, booleans or nulls; str.join only
        # accepts strings, so convert each element and leave out nulls instead of
        # raising TypeError or embedding the literal text "None".
        all_elements.extend(str(item) for item in items if item is not None)

    return ", ".join(all_elements)

##### `write_query_data_to_json`

In [None]:
def write_query_data_to_json(queries, QUERY_IDENTIFIER):
    """
    Write query data with embeddings to a JSON file and log the process.

    Parameters:
    queries (dict): A dictionary containing the query data with embeddings.
    QUERY_IDENTIFIER (str): A string identifier used to label the type of queries being processed.

    Returns:
    None

    Function Workflow:
    1. Log the start of the function execution, including the QUERY_IDENTIFIER.
    2. Define the path for the output JSON file based on the QUERY_IDENTIFIER.
    3. Create the output directory if it does not exist yet.
    4. Open the JSON file for writing with UTF-8 encoding.
    5. Write the query data to the JSON file with 4-space indentation; non-ASCII characters
       are written as-is rather than as \\uXXXX escapes (ensure_ascii=False).
    6. Log the completion of the function execution.
    """
    logging.info(f"Started {write_query_data_to_json.__name__} for {QUERY_IDENTIFIER} queries.")

    JSON_PATH = f"./json_results/{QUERY_IDENTIFIER}_query_data_with_embedding.json"

    # open(..., "w") does not create missing parent directories, so make sure the
    # output folder exists; exist_ok keeps repeated runs from failing.
    os.makedirs(os.path.dirname(JSON_PATH), exist_ok=True)

    with open(JSON_PATH, "w", encoding="utf-8") as outfile:
        json.dump(queries, outfile, indent=4, ensure_ascii=False)

    logging.info(f"Completed {write_query_data_to_json.__name__} for {QUERY_IDENTIFIER} queries.")