In [99]:
import ollama
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json
import re


In [2]:
# Chroma client and the "docs" collection that document_embedding fills and
# retrieve queries.
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "docs" already exists in the running kernel.
client = chromadb.Client()
collection = client.get_or_create_collection(name="docs")

In [10]:
# Drop and recreate the "docs" collection before it is refilled below.
# NOTE(review): delete_collection presumably fails when "docs" does not exist yet,
# so the setup cell above has to have run first — confirm.
client.delete_collection(name="docs")
collection = client.create_collection(name="docs")

# load_json and document_embedding are defined in later cells of this notebook;
# those cells must be executed before this one.
documents = load_json()
# document_embedding also adds every document to the global `collection`.
document_embeddings = document_embedding(documents)

In [None]:
# Column names to search for. The commented-out assignments are alternative
# queries; the trailing comment on each line is presumably the report that the
# column set is expected to match (verify against the report data).
# desired_columns = ["ACT_END_DATE, BUA_NO_START"] # YVES INBOUND NOT COMPLETED
# desired_columns = ["ONHANDQTY, PARTNO"] # CWIS ASTRO CDC CMR
# desired_columns = ["ECARRNO"] # MIP 490 INBOUND PALLETS AND TOTAL OH
# desired_columns = ["ASTRO_TRIP_ID", "ITEM_ID"] # YVES STORE ORDERS VOLUME BY DAY
desired_columns = [ # AAYE STOCKAGE
    "AMOONCR",
    "CARRSTAT",
    "CARRTYPE",
    "DIVCODE",
    "ECARRNO",
    "FLEXDAYS",
    "G08PDATE",
    "LDCT",
    "LOCKCODE",
    "LOTID",
]

# main is defined in a later cell; it prints the model's answer followed by one
# similarity line per retrieved document.
main(desired_columns, document_embeddings)

Yes, a report containing the specified columns named **"AAYE STOCKAGE"** exists. 

Similarity for Report Name 'AAYE STOCKAGE': 70.71%
Similarity for Report Name 'ART SUP CWIS': 65.84%
Similarity for Report Name 'ADM CHECK': 65.32%


#### `main`

In [None]:
def main(desired_columns, document_embeddings):
    """Run the full lookup for a list of column names and print the result.

    desired_columns: iterable of column names; it is joined into a single
        comma-separated query string before embedding.
    document_embeddings: the value returned by document_embedding.

    Nothing is returned; the output is printed by generate_output.
    """
    query_text = liste_zu_string(desired_columns)
    query_embedding = ollama.embeddings(model="mxbai-embed-large:latest", prompt=query_text)

    hits = retrieve(query_embedding)
    scores = calculate_similarity(document_embeddings, query_embedding, hits)

    generate_output(query_text, hits, scores)

##### `load_json`

In [4]:
def liste_zu_string(liste):
    """Return the items of `liste`, each converted with str(), joined by ', '."""
    return ', '.join(str(eintrag) for eintrag in liste)

def zeile_zu_string(row):
    """Return the values of `row`, each converted with str(), joined by ', '."""
    cells = [str(value) for value in row]
    return ', '.join(cells)

In [5]:
def load_json(path="../Query Processing/4_json_results/migrated_query_data.json"):
    """Build one text document per report from the migrated query JSON file.

    Parameters
    ----------
    path : str or os.PathLike, optional
        JSON file to read. Defaults to the location this notebook has always
        used, so existing ``load_json()`` calls behave as before.

    Returns
    -------
    list of str
        One string per entry of the top-level JSON object, in file order, of
        the form ``"Report Name '<name>' contains columns: <c1>, <c2>, ..."``.
        An empty JSON object yields an empty list.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If an entry lacks "report_name" or "columns_cleansed".
    """
    # Decode explicitly as UTF-8; without `encoding`, open() uses the platform
    # default, so the same file could read differently on another machine.
    with open(path, 'r', encoding="utf-8") as json_file:
        query_data = json.load(json_file)

    documents = []
    # Only the values are needed; the top-level keys are not part of the output.
    for entry in query_data.values():
        report_name = entry["report_name"].strip()
        report_columns = ', '.join(map(str, entry["columns_cleansed"]))
        documents.append(f"Report Name '{report_name}' contains columns: {report_columns}")

    return documents

##### `document_embedding`

In [102]:
def document_embedding(documents):
    """Embed every document and store it in the global `collection`.

    Each document is embedded with ollama.embed (model "mxbai-embed-large:latest")
    and added to `collection` under its list position, converted to a string id.

    Returns a one-element tuple whose only item is the list of the per-document
    "embeddings" values, in input order. The tuple wrapper is kept deliberately:
    calculate_similarity unwraps the result with [0] before indexing by id.
    """
    per_document = []

    for position, text in enumerate(documents):
        vectors = ollama.embed(model="mxbai-embed-large:latest", input=text)["embeddings"]
        per_document.append(vectors)
        collection.add(ids=[str(position)], embeddings=vectors, documents=[text])

    return (per_document,)

##### `retrieve`

In [90]:
def retrieve(embedded_prompt):
    """Query the global `collection` with the prompt embedding, asking for 3 results.

    embedded_prompt: mapping that holds the prompt vector under "embedding".
    Returns whatever collection.query returns.
    """
    return collection.query(
        query_embeddings=embedded_prompt["embedding"],
        n_results=3,
    )

##### `calculate_similarity`

In [None]:
def calculate_similarity(document_embeddings, embedded_prompt, retrieval_results):
    """Cosine similarity between the prompt and each retrieved document.

    document_embeddings: value returned by document_embedding; element [0] is
        the list of stored embeddings, indexed by the integer form of the id.
    embedded_prompt: mapping that holds the prompt vector under "embedding".
    retrieval_results: query result; only retrieval_results["ids"][0] is read.

    Returns a list with one similarity value per retrieved id, in retrieval order.
    """
    # Both operands are reshaped to a single row, as cosine_similarity is called
    # with 2-D inputs.
    prompt_row = np.array(embedded_prompt["embedding"]).reshape(1, -1)

    return [
        cosine_similarity(
            np.array(document_embeddings[0][int(doc_id)]).reshape(1, -1),
            prompt_row,
        ).flatten()[0]
        for doc_id in retrieval_results["ids"][0]
    ]

##### `generate_output`

In [101]:
def generate_output(desired_columns, retrieval_results, similarity):
    """Ask the language model whether a matching report exists and print the outcome.

    desired_columns: the requested columns as one string; inserted into the prompt.
    retrieval_results: query result; only retrieval_results["documents"][0] is read.
    similarity: sequence of similarity values (0..1 scale assumed by the percent
        formatting), aligned with the retrieved documents.

    Prints the model's response, then one "Similarity for ...: NN.NN%" line per
    retrieved document. Only the first retrieved document is given to the model
    as context. Returns None.
    """
    PROMPT = f"Please check if there is a report containing the columns '{desired_columns}'. If such a report exists, provide the report name."

    # Hoisted so the f-strings below contain no nested double quotes: reusing
    # the enclosing quote character inside an f-string expression is a
    # SyntaxError on Python versions before 3.12.
    documents = retrieval_results["documents"][0]

    output = ollama.generate(
        model="gemma2:2b",
        prompt=f"Using this data: {documents[0]}. Respond to this prompt: {PROMPT}",
    )

    print(output["response"])
    for i, doc in enumerate(documents):
        # Prefer the short "Report Name '<name>'" prefix; fall back to the full
        # document text when it does not follow that pattern.
        match = re.search(r"Report Name '.*?'", doc)
        label = match.group(0) if match else doc
        print(f"Similarity for {label}: {round(similarity[i] * 100, 2)}%")