In [11]:
from IPython.display import display, Markdown
import os
from LS_AMG_RAG.data_snythesis import prompt_utils
import google.generativeai as palm
import time

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
# chroma_client = chromadb.Client()
# chroma_client = chromadb.PersistentClient(path="./")
# chroma_client = chromadb.HttpClient(host='localhost', port=8000)
chroma_client = chromadb.HttpClient(host="44.222.251.196", port=8000)
gemini = prompt_utils.Gemini()
google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.environ['GEMINI_API_KEY'])

In [12]:
chroma_client.heartbeat()

1711099506439058097

In [13]:
try:
    collection = chroma_client.get_or_create_collection(
        name="my_collection",
        embedding_function=google_ef,
        metadata={"hnsw:space": "cosine"})
except:
    collection = chroma_client.create_collection(
        name="my_collection",
        embedding_function=google_ef,
        metadata={"hnsw:space": "cosine"})
    documents = []
    metadata = []
    ids = []

    for root, dirs, files in os.walk("../data"):
        for file in files:
            if file.endswith(".md"):
                category = root.split('\\')[-1]
                with open(os.path.join(root, file), "r") as f:
                    file_contents = f.read()
                    documents.append(file_contents)
                    metadata.append({
                        "type": category,
                    })
                    ids.append(f"{category}_{file}")

    collection.add(
        documents=documents,
        metadatas=metadata,
        ids=ids,
    )

else:
    print(f"Total number of documents in the collection: {len(collection.get()['ids'])}")

Total number of documents in the collection: 74


In [14]:
documents = []
metadata = []
ids = []

for root, dirs, files in os.walk("../data"):
    for file in files:
        if file.endswith(".md"):
            category = root.split('\\')[-1]
            with open(os.path.join(root, file), "r") as f:
                file_contents = f.read()
                documents.append(file_contents)
                metadata.append({
                    "type": category,
                })
                ids.append(f"{category}_{file}")

collection.add(
    documents=documents,
    metadatas=metadata,
    ids=ids,
)

In [15]:
top_k = {
    1: [],
    3: [],
    5: [],
    10: [],
}

queries = [
    {'query': "What is Instagram's current business proposal?",
     'file': "Business Proposal.md"},
    {'query': "What is the marketing plan for Instagram?",
     'file': "Marketing Plan.md"},
    {'query': "What information does the progress report of Instagram contain?",
     'file': "Progress Report.md"},
    {'query': "Who are the members of Instagram's board of directors?",
     'file': "Board of Directors.md"},
    {'query': "What are the diversity and inclusion initiatives implemented by Instagram?",
     'file': 'Diversity, Equity, and Inclusion.md'},
    {'query': "What is the Marketing Objective for Influencer Collaboration Services?",
     'file': 'Marketing Plan.md'},
    {'query': "Who is the target audience of Content Creation and Curation Services?",
     'file': 'Marketing Plan.md'},
    {'query': "What is the financial update for the Reels Optimization Project?",
     'file': 'Progress Report.md'},
    {'query': "Give me details about the progress report of the Stories Upgrade project.",
     'file': 'Progress Report.md'},
    {'query': "Compare the progress report of the Feed Redesign and Stories Upgrade project and draw a conclusion on the information.",
     'file': 'Progress Report.md'}
]

step_times = {
    'retrieval': [],
    'gen': [],
    'total': [],
}

In [16]:

for idx, query in enumerate(queries):
    print(f"Query {idx+1}: {query['query']}")
    total_start_time = time.time()
    retrieval_start_time = time.time()
    results = collection.query(
        query_texts=query['query'],
        n_results=10,
    )

    for k in top_k.keys():
        top_k[k].append(any(query['file'] in x for x in results['ids'][0][-k:]))

    retrieval_end_time = time.time()
    print(f"True document: {query['file']}")
    print(f"Document retrieved: {results['ids'][0][-1]}")
    print(f"Retrieval time: {retrieval_end_time - retrieval_start_time:.2f} seconds")


    metaprompt = """You are a helpful and informative bot that answers questions using text from the reference document included below. \
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
strike a friendly and converstional tone. \
Use your own knowledge base in addition to the information provided in the document to answer the question. \
Make relevant assumptions and use your best judgement to answer the question. \
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
"""

    gen_start_time = time.time()
    gemini_result = gemini.send_message(message=metaprompt.format(query=queries[0], relevant_passage=results['documents'][0][-1])).text
    gen_end_time = time.time()
    total_end_time = time.time()

    step_times['retrieval'].append(retrieval_end_time - retrieval_start_time)
    step_times['gen'].append(gen_end_time - gen_start_time)
    step_times['total'].append(total_end_time - total_start_time)
    
    display(Markdown(gemini_result))
    print(f"Gemini time: {gen_end_time - gen_start_time:.2f} seconds")
    print(f"Total time: {total_end_time - total_start_time:.2f} seconds")
    print("\n-------------------\n")

print("Results:")
print("Top@K:")
for k in top_k.keys():
    print(f"Top@{k}: {(sum(top_k[k]) * 100) / len(top_k[k]):.2f}%")

print("\n")

print("Average Times:")
print(f"Retrieval time: {sum(step_times['retrieval']) / len(step_times['retrieval']):.2f} seconds")
print(f"Generation time: {sum(step_times['gen']) / len(step_times['gen']):.2f} seconds")
print(f"Total time: {sum(step_times['total']) / len(step_times['total']):.2f} seconds")

Query 1: What is Instagram's current business proposal?
True document: Business Proposal.md
Document retrieved: employee_contracts_employee_contract_Emily Brown.md
Retrieval time: 0.96 seconds


I'm sorry, but I am unable to answer your question as the information you are seeking cannot be found within the provided document.

Gemini time: 1.63 seconds
Total time: 2.59 seconds

-------------------

Query 2: What is the marketing plan for Instagram?
True document: Marketing Plan.md
Document retrieved: employee_contracts_employee_contract_Emily Brown.md
Retrieval time: 0.94 seconds


I apologize, but the provided document does not contain information regarding Instagram's current business proposal.

Gemini time: 1.37 seconds
Total time: 2.31 seconds

-------------------

Query 3: What information does the progress report of Instagram contain?
True document: Progress Report.md
Document retrieved: employee_contracts_employee_contract_Max Nguyen.md
Retrieval time: 0.94 seconds


I'm sorry, but the provided document does not include any information on Instagram's current business proposal.

Gemini time: 1.41 seconds
Total time: 2.35 seconds

-------------------

Query 4: Who are the members of Instagram's board of directors?
True document: Board of Directors.md
Document retrieved: employee_esops_employee_esop_Jane Smith.md
Retrieval time: 0.93 seconds


I'm sorry, but the provided document does not contain any information about Instagram's current business proposal.

Gemini time: 1.52 seconds
Total time: 2.45 seconds

-------------------

Query 5: What are the diversity and inclusion initiatives implemented by Instagram?
True document: Diversity, Equity, and Inclusion.md
Document retrieved: employee_esops_employee_esop_David Lee.md
Retrieval time: 0.92 seconds


I'm sorry, but the provided document does not contain any information about Instagram's current business proposal.

Gemini time: 1.46 seconds
Total time: 2.38 seconds

-------------------

Query 6: What is the Marketing Objective for Influencer Collaboration Services?
True document: Marketing Plan.md
Document retrieved: employee_contracts_employee_contract_Mila Lee.md
Retrieval time: 0.91 seconds


Unfortunately, the provided document does not contain any information about Instagram's current business proposal.

Gemini time: 2.41 seconds
Total time: 3.31 seconds

-------------------

Query 7: Who is the target audience of Content Creation and Curation Services?
True document: Marketing Plan.md
Document retrieved: business_docs_Progress Report.md
Retrieval time: 0.95 seconds


I apologize, but I cannot provide information on Instagram's current business proposal as it is not included in the provided document.

Gemini time: 2.44 seconds
Total time: 3.39 seconds

-------------------

Query 8: What is the financial update for the Reels Optimization Project?
True document: Progress Report.md
Document retrieved: employee_payslips_employee_payslip_Mila Lee.md
Retrieval time: 0.69 seconds


I'm sorry, but the provided document does not contain any information on Instagram's current business proposal.

Gemini time: 1.73 seconds
Total time: 2.41 seconds

-------------------

Query 9: Give me details about the progress report of the Stories Upgrade project.
True document: Progress Report.md
Document retrieved: employee_contracts_employee_contract_Luna Nguyen.md
Retrieval time: 0.93 seconds


I apologize, but the provided document does not contain any information on Instagram's current business proposal.

Gemini time: 2.80 seconds
Total time: 3.72 seconds

-------------------

Query 10: Compare the progress report of the Feed Redesign and Stories Upgrade project and draw a conclusion on the information.
True document: Progress Report.md
Document retrieved: employee_payslips_employee_payslip_Luna Nguyen.md
Retrieval time: 0.94 seconds


Based on the information provided, I cannot find any details about Instagram's current business proposal.

Gemini time: 2.34 seconds
Total time: 3.28 seconds

-------------------

Results:
Top@K:
Top@1: 0.00%
Top@3: 0.00%
Top@5: 0.00%
Top@10: 100.00%


Average Times:
Retrieval time: 0.91 seconds
Generation time: 1.91 seconds
Total time: 2.82 seconds
