#### `Imports`

In [1]:
import ollama
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json
import re
import time

In [3]:
# Chroma client and the "docs" collection used by the embedding and retrieval
# functions in this notebook (they read the module-level `collection`).
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable in a live kernel:
# create_collection raises when a collection named "docs" already exists.
collection = client.get_or_create_collection(name="docs")

#### `MAIN Embedding`

In [81]:
# Drop and recreate the "docs" collection so it is empty before embedding;
# document_embedding adds the ids "0", "1", ... again on every run.
client.delete_collection(name="docs")
collection = client.create_collection(name="docs")

# JSON file with the report data (relative path).
PATH = "../Query Processing/4_json_results/migrated_query_data.json"
# One text document per report.
documents = load_json(PATH)
# Embeds and stores every document; the returned vectors are kept because
# calculate_similarity later looks them up by document id.
document_embeddings = document_embedding(documents)


Embedding Time: 8.19s


#### `MAIN get_output_main`

In [77]:
# Column names to search for. Keep exactly one assignment active; the others
# are alternative example queries. The trailing comment on each line appears
# to name the report the example is expected to find.
# Each column name is its own list element; get_output_main joins them with
# ", " before embedding, so the query text is unchanged.
desired_columns = ["ACT_END_DATE", "BUA_NO_START"]  # YVES INBOUND NOT COMPLETED
# desired_columns = ["ONHANDQTY", "PARTNO"]  # CWIS ASTRO CDC CMR
# desired_columns = ["ECARRNO"]  # MIP 490 INBOUND PALLETS AND TOTAL OH
# desired_columns = ["ASTRO_TRIP_ID", "ITEM_ID"]  # YVES STORE ORDERS VOLUME BY DAY
# desired_columns = [  # AAYE STOCKAGE
#     "AMOONCR",
#     "CARRSTAT",
#     "CARRTYPE",
#     "DIVCODE",
#     "ECARRNO",
#     "FLEXDAYS",
#     "G08PDATE",
#     "LDCT",
#     "LOCKCODE",
#     "LOTID",
# ]

get_output_main(desired_columns, document_embeddings)

Based on the provided data, there is a report named **'YVES INBOUND NOT COMPLETED'** that contains the columns  'ACT_END_DATE' and 'BUA_NO_START'. 


Let me know if you have any other data requests! 

Reports with highest Similarity:
Similarity for Report Name 'YVES INBOUND NOT COMPLETED': 63.53%
Similarity for Report Name 'ADM CHECK': 60.53%
Similarity for Report Name 'ACTIVITY FOLLOW UP': 58.68%
Runtime: 59.72s.


#### `Query Embedding Methods`

##### `load_json`

In [78]:
def load_json(PATH):
    """Build one retrieval document per report from a JSON file.

    The file must contain a mapping of report name to an object that has a
    "columns_cleansed" list. Every entry becomes the string
    "Report Name '<name>' contains columns: <col1>, <col2>, ...", where the
    name is stripped of surrounding whitespace and each column is converted
    with ``str``.

    Args:
        PATH: Path of the JSON file to read.

    Returns:
        list[str]: One document string per report, in the order of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a report entry has no "columns_cleansed" key.
    """
    # Errors propagate to the caller on purpose, so that a missing or
    # malformed file is reported with its real cause.
    with open(PATH, 'r') as json_file:
        query_data = json.load(json_file)

    documents = []
    for report_name, report in query_data.items():
        columns = ", ".join(map(str, report["columns_cleansed"]))
        documents.append(f"Report Name '{report_name.strip()}' contains columns: {columns}")

    return documents


def liste_zu_string(liste):
    """Return the items of ``liste`` as a single comma-separated string.

    Each item is converted with ``str``; the parts are joined by a comma
    followed by one space.
    """
    parts = [str(item) for item in liste]
    return ", ".join(parts)

def zeile_zu_string(row):
    """Render one row as text.

    Every value of ``row`` is converted with ``str`` and the results are
    joined by a comma followed by one space.
    """
    return ", ".join(str(value) for value in row)

##### `document_embedding`

In [24]:
def document_embedding(documents):
    """Embed every document and add it to the module-level ``collection``.

    Each document is sent to ``ollama.embed`` with the model
    "mxbai-embed-large:latest". The "embeddings" field of the response is
    stored in ``collection`` under the document's position (as a string id)
    together with the document text. The elapsed time is printed at the end.

    Args:
        documents: Iterable of document strings.

    Returns:
        list: One entry per document, each being the "embeddings" field of
        the response for that document.
    """
    started_at = time.time()
    vectors_per_document = []

    for position, text in enumerate(documents):
        vectors = ollama.embed(model="mxbai-embed-large:latest", input=text)["embeddings"]
        vectors_per_document.append(vectors)
        collection.add(ids=[str(position)], embeddings=vectors, documents=[text])

    elapsed = time.time() - started_at
    print(f"Embedding Time: {elapsed:.2f}s")
    return vectors_per_document

#### `Output Methods`

##### `get_output_main`

In [38]:
def get_output_main(desired_columns, document_embeddings):
    """Run the whole lookup for a set of column names and print the result.

    The column names are joined into one query string, the string is
    embedded, the closest documents are retrieved, their similarity to the
    query is calculated, and the generated answer plus the similarity list
    are printed. The total runtime is printed last.

    Args:
        desired_columns: Iterable of column names to search for.
        document_embeddings: Document vectors as returned by
            ``document_embedding``; handed on to ``calculate_similarity``.

    Returns:
        None. All results are printed.
    """
    start = time.time()

    # Separate name for the joined text so the parameter is not rebound from
    # a list to a string.
    query_text = liste_zu_string(desired_columns)

    # Embed the query with the same `ollama.embed` call that is used for the
    # documents, so both sides of the comparison come from one endpoint.
    # `embed` returns a list of vectors under "embeddings"; the single vector
    # is wrapped in the {"embedding": ...} shape the helpers below read.
    response = ollama.embed(model="mxbai-embed-large:latest", input=query_text)
    prompt_embeddings = {"embedding": response["embeddings"][0]}

    retrieval_results = retrieve_relevant_documents(prompt_embeddings)

    similarity = calculate_similarity(document_embeddings, prompt_embeddings, retrieval_results)

    generate_output(query_text, retrieval_results, similarity)

    end = time.time()
    runtime = end - start
    print(f"Runtime: {runtime:.2f}s.")

##### `retrieve_relevant_documents`

In [18]:
def retrieve_relevant_documents(embedded_prompt, n_results=3):
    """Query the module-level ``collection`` for the closest documents.

    Args:
        embedded_prompt: Mapping whose "embedding" entry holds the query
            vector.
        n_results: Number of documents to request. Defaults to 3, the value
            that was previously fixed inside the function.

    Returns:
        Whatever ``collection.query`` returns for the given vector.
    """
    return collection.query(
        query_embeddings=embedded_prompt["embedding"],
        n_results=n_results,
    )

##### `calculate_similarity`

In [37]:
def calculate_similarity(document_embeddings, embedded_prompt, retrieval_results):
    """Cosine similarity between the prompt and every retrieved document.

    Args:
        document_embeddings: Sequence indexed by integer document id; the
            element ``[doc_id][0]`` is used as that document's vector.
        embedded_prompt: Mapping whose "embedding" entry is the prompt vector.
        retrieval_results: Mapping whose ``["ids"][0]`` lists the retrieved
            ids; each id must be convertible with ``int``.

    Returns:
        list: One similarity score per retrieved id, in retrieval order.
    """
    query_row = np.array(embedded_prompt["embedding"]).reshape(1, -1)

    def score(doc_id):
        # Both vectors are shaped as single-row matrices, so the result is a
        # 1x1 matrix whose only entry is the score.
        doc_row = np.array(document_embeddings[int(doc_id)][0]).reshape(1, -1)
        return cosine_similarity(doc_row, query_row).flatten()[0]

    return [score(doc_id) for doc_id in retrieval_results["ids"][0]]

##### `generate_output`

In [None]:
def generate_output(desired_columns, retrieval_results, similarity):
    """Ask the language model about the best match and print the ranking.

    The first retrieved document is given to the model "gemma2:2b" as context
    together with a question about ``desired_columns``. The model's response
    is printed, followed by one similarity line per retrieved document.

    Args:
        desired_columns: Text describing the wanted columns; inserted into
            the question sent to the model.
        retrieval_results: Mapping whose ``["documents"][0]`` is the list of
            retrieved document strings, best match first.
        similarity: Similarity scores (0..1 scale is assumed by the percent
            formatting), aligned with the retrieved documents.

    Returns:
        None. All output is printed.
    """
    documents = retrieval_results["documents"][0]
    if not documents:
        # Without a retrieved document there is no context for the model.
        print("No reports were retrieved for the requested columns.")
        return

    PROMPT = f"Please check if there is a report containing the columns '{desired_columns}'. If such a report exists, provide the report name."

    # The document is bound to a name first: reusing double quotes inside an
    # f-string replacement field only parses on Python 3.12 and later.
    best_match = documents[0]
    output = ollama.generate(
        model="gemma2:2b",
        prompt=f"Using this data: {best_match}. Respond to this prompt: {PROMPT}",
    )

    print(output["response"])
    print("Reports with highest Similarity:")
    for doc, score in zip(documents, similarity):
        # Show only the "Report Name '...'" part when the document has one,
        # otherwise fall back to the full document text.
        match = re.search(r"Report Name '.*?'", doc)
        label = match.group(0) if match else doc
        print(f"Similarity for {label}: {round(score * 100, 2)}%")