In [1]:

import ollama
import chromadb
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

# Chroma client created with the library's default settings.
client = chromadb.Client()
# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable in a live kernel: it returns the existing "docs" collection when
# one is already present instead of failing.
collection = client.get_or_create_collection(name="docs")

In [11]:
# Drop and recreate the "docs" collection so this cell starts from a fresh one.
# `collection` is rebound at module level; the helper functions read that global.
client.delete_collection(name="docs")
collection = client.create_collection(name="docs")

# Each document is one sentence naming a report and listing its column names.
documents = [
    "Report 1 contains columns ['ONHANDQTY','PARTNO','PARTREV', 'VOL_M3']",
    "Report 2 contains columns ['ARRQTY','BUCODE', 'BUCODECR', 'BUTYPE', 'BUTYPECR', 'DELID']",
    "Report 3 contains columns ['DELTQTY', 'ITEMNO', 'NETVOL', 'STATDATE', 'STATDATE2']",
]

# Column name to look for in the reports above.
columns = "VORNAME"

# NOTE(review): `main` is defined in a later cell, so that cell must be executed
# before this one; running the notebook top to bottom on a fresh kernel fails here.
# `main` has no return statement, so `res` is always None.
res = main(documents, columns)

No, there is no report containing the column 'VORNAME'. 

Similarity for Report 2: 45.4%
Similarity for Report 1: 41.45%
Similarity for Report 3: 38.72%


#### `main`

In [None]:
def main(documents, columns, embedding_model="mxbai-embed-large:latest", llm_model="gemma2:2b"):
    """Run the whole pipeline: embed, retrieve, score, and print an answer.

    Steps, in order:
      1. Embed and store ``documents`` via ``document_embedding``.
      2. Embed ``columns`` (the bare column text, not the full question).
      3. Query the collection via ``retrieve``.
      4. Score the retrieved documents via ``calculate_similarity``.
      5. Print the LLM answer and the scores via ``generate_answer``.

    Args:
        documents: List of document strings to index.
        columns: Column name(s) to search for; interpolated into the question
            sent to the LLM and used as the text that gets embedded.
        embedding_model: Ollama model name used for both document and query
            embeddings. Defaults to the previously hard-coded value.
        llm_model: Ollama model name used to generate the answer. Defaults to
            the previously hard-coded value.

    Returns:
        None. Results are printed by ``generate_answer``.
    """
    prompt = f"Is there a Report containing Columns '{columns}'? If so, provide Report Name"

    document_embeddings = document_embedding(documents, embedding_model)

    # NOTE(review): documents are embedded with `ollama.embed`, the query with
    # `ollama.embeddings` -- confirm both calls yield comparable vectors.
    prompt_embeddings = ollama.embeddings(model=embedding_model, prompt=columns)

    retrieval_results = retrieve(prompt_embeddings)

    similarity = calculate_similarity(document_embeddings, prompt_embeddings, retrieval_results)

    generate_answer(llm_model, retrieval_results, prompt, similarity)

##### `document_embedding`

In [3]:
def document_embedding(documents, EMBEDDING_MODEL):
    """Embed each document with Ollama and store it in the module-level ``collection``.

    Document ``i`` is stored under the id ``str(i)`` together with its text.

    Args:
        documents: Iterable of document strings.
        EMBEDDING_MODEL: Model name passed to ``ollama.embed``.

    Returns:
        A 1-tuple ``(document_embeddings,)`` where ``document_embeddings[i]`` is
        the ``response["embeddings"]`` value obtained for ``documents[i]``. The
        tuple wrapper is kept on purpose: ``calculate_similarity`` reads
        ``document_embeddings[0][...]``.
    """
    document_embeddings = []

    for i, d in enumerate(documents):
        response = ollama.embed(model=EMBEDDING_MODEL, input=d)
        embeddings = response["embeddings"]
        document_embeddings.append(embeddings)
        # upsert instead of add: the ids are always "0..n-1", so a second call
        # against the same collection must overwrite the stored record rather
        # than leave the stale one in place.
        collection.upsert(ids=[str(i)], embeddings=embeddings, documents=[d])

    return (document_embeddings,)

##### `retrieve`

In [4]:
def retrieve(embedded_prompt, n_results=3):
    """Query the module-level ``collection`` for the entries closest to the prompt.

    Args:
        embedded_prompt: Mapping whose ``"embedding"`` entry is the query vector.
        n_results: Number of results to request from the collection. Defaults
            to 3, the value that was previously hard-coded.

    Returns:
        Whatever ``collection.query`` returns, unchanged.
    """
    return collection.query(
        query_embeddings=embedded_prompt["embedding"],
        n_results=n_results,
    )

##### `calculate_similarity`

In [5]:
def calculate_similarity(document_embeddings, embedded_prompt, retrieval_results):
    """Score every retrieved document against the prompt with cosine similarity.

    Args:
        document_embeddings: Value returned by ``document_embedding``; its first
            element is indexed by the integer form of each retrieved id.
        embedded_prompt: Mapping whose ``"embedding"`` entry is the prompt vector.
        retrieval_results: Mapping whose ``["ids"][0]`` lists the retrieved ids.

    Returns:
        A list with one similarity value per retrieved id, in retrieval order.
    """
    query_row = np.array(embedded_prompt["embedding"]).reshape(1, -1)

    def _score(doc_id):
        # Both operands are flattened to a single row, so the result holds
        # exactly one value.
        doc_row = np.array(document_embeddings[0][int(doc_id)]).reshape(1, -1)
        return cosine_similarity(doc_row, query_row).flatten()[0]

    return [_score(doc_id) for doc_id in retrieval_results["ids"][0]]

# def calculate_similarity(document_embeddings, embedded_prompt, retrieval_results):

#     document_vector = np.array(document_embeddings[0][int(retrieval_results["ids"][0][0])]).reshape(1, -1)
#     prompt_vector = np.array(embedded_prompt["embedding"]).reshape(1, -1)
#     similarity = cosine_similarity(document_vector, prompt_vector).flatten()[0]

#     return similarity

##### `generate_answer`

In [6]:
import re

def generate_answer(LLM_MODEL, retrieval_results, PROMPT, similarity):
    """Ask the LLM to answer PROMPT from the retrieved documents and print the result.

    Prints the model's response, then one "Similarity for ..." line per
    retrieved document. A document is labelled with its "Report <n>" prefix when
    its text contains one, otherwise with the full document text.

    Args:
        LLM_MODEL: Model name passed to ``ollama.generate``.
        retrieval_results: Mapping whose ``["documents"][0]`` is the list of
            retrieved document strings.
        PROMPT: Question inserted into the generation prompt.
        similarity: Scores in the same order as the retrieved documents; each is
            multiplied by 100 and rounded to two decimals for display.

    Returns:
        None. All output goes to stdout.
    """
    retrieved_docs = retrieval_results["documents"][0]

    output = ollama.generate(
        model=LLM_MODEL,
        prompt=f"Using this data: {retrieved_docs}. Respond to this prompt: {PROMPT}",
    )

    print(output["response"])
    for i, doc in enumerate(retrieved_docs):
        match = re.search(r'Report \d+', doc)
        # The label is computed outside the f-string: reusing double quotes
        # inside a double-quoted f-string is a SyntaxError before Python 3.12.
        label = match.group(0) if match else doc
        print(f"Similarity for {label}: {round(similarity[i] * 100, 2)}%")