#### `Imports`

In [1]:
import ollama
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json
import re
import time

In [2]:
# Create the Chroma client and the "docs" collection that the embedding and
# retrieval functions in this notebook use as a module-level global.
# get_or_create_collection is used instead of create_collection so that
# re-running this cell in the same kernel reuses the existing collection
# rather than failing because "docs" already exists.
client = chromadb.Client()
collection = client.get_or_create_collection(name="docs")

#### `MAIN Embedding`

In [81]:
# Drop and recreate the "docs" collection so this run starts from a fresh
# collection before documents are added again.
client.delete_collection(name="docs")
collection = client.create_collection(name="docs")

# Load the report descriptions from the JSON file and embed them.
# document_embedding also adds every embedding to `collection`.
PATH = "../Query Processing/4_json_results/migrated_query_data.json"
documents = load_json(PATH)
document_embeddings = document_embedding(documents)


Embedding Time: 8.19s


#### `MAIN Output`

In [77]:
# Columns to look for. Each column name is its own list element;
# get_output_main joins the elements with ", " before embedding them, so the
# embedded text is the same as with a single pre-joined string.
# The trailing comment on each line presumably names the report the selection
# is expected to match (confirmed by the recorded output for the first one).
desired_columns = ["ACT_END_DATE", "BUA_NO_START"]  # YVES INBOUND NOT COMPLETED
# desired_columns = ["ONHANDQTY", "PARTNO"]  # CWIS ASTRO CDC CMR
# desired_columns = ["ECARRNO"]  # MIP 490 INBOUND PALLETS AND TOTAL OH
# desired_columns = ["ASTRO_TRIP_ID", "ITEM_ID"]  # YVES STORE ORDERS VOLUME BY DAY
# desired_columns = [  # AAYE STOCKAGE
#     "AMOONCR",
#     "CARRSTAT",
#     "CARRTYPE",
#     "DIVCODE",
#     "ECARRNO",
#     "FLEXDAYS",
#     "G08PDATE",
#     "LDCT",
#     "LOCKCODE",
#     "LOTID",
# ]

get_output_main(desired_columns, document_embeddings)

Based on the provided data, there is a report named **'YVES INBOUND NOT COMPLETED'** that contains the columns  'ACT_END_DATE' and 'BUA_NO_START'. 


Let me know if you have any other data requests! 

Reports with highest Similarity:
Similarity for Report Name 'YVES INBOUND NOT COMPLETED': 63.53%
Similarity for Report Name 'ADM CHECK': 60.53%
Similarity for Report Name 'ACTIVITY FOLLOW UP': 58.68%
Runtime: 59.72s.


#### `Query Embedding Methods`

##### `load_json`

In [None]:
def load_json(PATH):
    """
    Load query data from a JSON file and return one descriptive string per report.

    Parameters:
    PATH (str): The file path to the JSON file containing the query data. The file
        must hold a JSON object that maps each report name to an object with a
        "columns_cleansed" entry (an iterable of column names).

    Returns:
    list: A list of strings of the form
        "Report Name '<name>' contains columns: <col1>, <col2>, ...",
        in the order the reports appear in the file. An empty JSON object
        yields an empty list.

    Raises:
    OSError: If the file cannot be opened or read.
    ValueError: If the file content cannot be decoded or is not valid JSON
        (json.JSONDecodeError is a ValueError).
    KeyError: If a report entry has no "columns_cleansed" key.

    Function Workflow:
    1. Open the JSON file specified by PATH and load its contents into a dictionary.
    2. If reading or parsing fails, print the error message and re-raise it.
    3. For every report, strip the report name, join its cleansed columns with ", "
       and combine both into a formatted document string.
    4. Return the list of formatted document strings.
    """
    try:
        with open(PATH, 'r') as json_file:
            query_data = json.load(json_file)
    except (OSError, ValueError) as e:
        # Keep printing the message, but re-raise: continuing would only fail
        # later with an unrelated error because no data was loaded.
        print(f"{e}")
        raise

    # Build the strings directly; a DataFrame round trip adds nothing here and
    # breaks on an empty input.
    return [
        "Report Name '" + key.strip() + "' contains columns: "
        + ', '.join(map(str, value["columns_cleansed"]))
        for key, value in query_data.items()
    ]


def liste_zu_string(liste):
    """Return the items of `liste`, each converted with str(), joined by ", "."""
    parts = [str(item) for item in liste]
    return ", ".join(parts)

def zeile_zu_string(row):
    """Return the values of `row`, each converted with str(), joined by ", "."""
    cells = [str(cell) for cell in row]
    return ", ".join(cells)

##### `document_embedding`

In [None]:
def document_embedding(documents):
    """
    Embed every document with "mxbai-embed-large:latest" and store it in the global collection.

    Parameters:
    documents (list): A list of strings, where each string is a document to be embedded.

    Returns:
    list: One entry per input document, in input order. Each entry is the
        "embeddings" value of the ollama.embed response for that document.

    Function Workflow:
    1. Note the start time.
    2. For each document (with its position in the list):
        - Request its embeddings from ollama.
        - Keep the embeddings in the result list.
        - Add embeddings and document text to `collection`, using the
          position (as a string) as the id.
    3. Print the elapsed time.
    4. Return the collected embeddings.
    """
    started_at = time.time()

    all_embeddings = []
    for index, text in enumerate(documents):
        vectors = ollama.embed(model="mxbai-embed-large:latest", input=text)["embeddings"]
        all_embeddings.append(vectors)
        # The id is the list position; it is what ties a stored document back
        # to its entry in the returned list.
        collection.add(ids=[str(index)], embeddings=vectors, documents=[text])

    elapsed = time.time() - started_at
    print(f"Embedding Time: {elapsed:.2f}s")
    return all_embeddings

#### `Output Methods`

##### `get_output_main`

In [None]:
def get_output_main(desired_columns, document_embeddings):
    """
    Run the full lookup for a set of desired columns and print the result.

    Parameters:
    desired_columns (list): The column names to search for.
    document_embeddings (list): The embeddings of all documents, as returned by `document_embedding`.

    Returns:
    None

    Function Workflow:
    1. Note the start time.
    2. Join the desired columns into one string with `liste_zu_string`.
    3. Embed that string with the model "mxbai-embed-large:latest".
    4. Retrieve the most relevant documents for the embedded string.
    5. Compute the similarity between the embedded string and each retrieved document.
    6. Generate and print the answer together with the similarity scores.
    7. Print the total runtime.
    """
    t0 = time.time()

    columns_text = liste_zu_string(desired_columns)
    query_embedding = ollama.embeddings(model="mxbai-embed-large:latest", prompt=columns_text)

    hits = retrieve_relevant_documents(query_embedding)
    scores = calculate_similarity(document_embeddings, query_embedding, hits)
    generate_output(columns_text, hits, scores)

    elapsed = time.time() - t0
    print(f"Runtime: {elapsed:.2f}s.")

##### `retrieve_relevant_documents`

In [None]:
def retrieve_relevant_documents(embedded_prompt, n_results=3):
    """
    Retrieve the most relevant documents for an embedded prompt.

    Parameters:
    embedded_prompt (dict): A dictionary containing the embedding of the prompt under the key "embedding".
    n_results (int): How many documents to retrieve. Defaults to 3.

    Returns:
    dict: The query result of the collection. Callers in this notebook read
        its "ids" and "documents" entries, each indexed with [0] for the
        single query.

    Function Workflow:
    1. Query the global collection using the embedding from the embedded prompt.
    2. Ask for the top `n_results` documents.
    3. Return the retrieval results unchanged.
    """
    retrieval_results = collection.query(
        query_embeddings=embedded_prompt["embedding"],
        n_results=n_results,
    )
    return retrieval_results

##### `calculate_similarity`

In [None]:
def calculate_similarity(document_embeddings, embedded_prompt, retrieval_results):
    """
    Compute the cosine similarity between the embedded prompt and every retrieved document.

    Parameters:
    document_embeddings (list): The embeddings of all documents; the entry for a document is looked up
        by its integer id and its element [0] is used as the vector.
    embedded_prompt (dict): A dictionary containing the embedding of the prompt under the key "embedding".
    retrieval_results (dict): The retrieval results; the ids of the retrieved documents are read from
        retrieval_results["ids"][0].

    Returns:
    list: One similarity score per retrieved document, in retrieval order.

    Function Workflow:
    1. Turn the prompt embedding into a 1 x n NumPy row.
    2. For each retrieved document id:
        - Look up the document's embedding and turn it into a 1 x n NumPy row.
        - Compute the cosine similarity of the two rows and take the single resulting value.
    3. Return the list of scores.
    """
    query_row = np.array(embedded_prompt["embedding"]).reshape(1, -1)

    def score_for(doc_id):
        # Ids are strings in the retrieval result; they index document_embeddings as integers.
        doc_row = np.array(document_embeddings[int(doc_id)][0]).reshape(1, -1)
        return cosine_similarity(doc_row, query_row).flatten()[0]

    return [score_for(doc_id) for doc_id in retrieval_results["ids"][0]]

##### `generate_output`

In [None]:
def generate_output(desired_columns, retrieval_results, similarity):
    """
    Generate output based on desired columns, retrieval results, and similarity scores.

    Parameters:
    desired_columns (str): A string representing the columns desired in the document.
    retrieval_results (dict): A dictionary containing the retrieval results; the retrieved document
        texts are read from retrieval_results["documents"][0].
    similarity (list): A list of similarity scores between the embedded prompt and each retrieved
        document, in the same order as the retrieved documents.

    Returns:
    None

    Function Workflow:
    1. Define a prompt asking for a report containing the desired columns.
    2. Generate a response using the model "gemma2:2b", giving it the first retrieved document as data.
    3. Print the generated response.
    4. Print the reports with the highest similarity scores:
        - Iterate over the retrieved documents.
        - Extract the report name using a regular expression; fall back to the full document text
          if the pattern is not found.
        - Print the similarity score for each report as a percentage with two decimals.
    """
    PROMPT = f"Please check if there is a report containing the columns '{desired_columns}'. If such a report exists, provide the report name."

    # Pull the values out of the dict before formatting: reusing double quotes
    # inside an f-string replacement field is a syntax error before Python 3.12.
    retrieved_documents = retrieval_results["documents"][0]
    best_document = retrieved_documents[0]

    output = ollama.generate(
        model="gemma2:2b",
        prompt=f"Using this data: {best_document}. Respond to this prompt: {PROMPT}",
    )

    print(output["response"])
    print("Reports with highest Similarity:")
    for i, doc in enumerate(retrieved_documents):
        match = re.search(r"Report Name '.*?'", doc)
        label = match.group(0) if match else doc
        print(f"Similarity for {label}: {round(similarity[i] * 100, 2)}%")