In [8]:
import pandas as pd
from datetime import datetime
from textwrap import shorten
import os
import time

from LS_AMG_RAG import utils
from LS_AMG_RAG.data_snythesis import prompt_utils
from tqdm import tqdm

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
# chroma_client = chromadb.PersistentClient(path="./")
chroma_client = chromadb.HttpClient(host="3.85.170.245", port=8000)

gemini = prompt_utils.Gemini()
google_ef  = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=os.environ['GEMINI_API_KEY'])

df = pd.read_csv('multi_hop_rag_dataset.csv')
df.head(1)

Unnamed: 0,category,url,body,title,author,published_at,source
0,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",200+ of the best deals from Amazon's Cyber Mon...,,2023-11-27T08:45:59+00:00,Mashable


In [2]:
collection = chroma_client.get_or_create_collection(
    name='multi_hop_rag_dataset',
    embedding_function=google_ef,
    metadata={"hnsw:space": "cosine"}
)

In [70]:
all_paragraphs = []
count = 0
for i in tqdm(range(len(df))):

    category = df['category'][i]
    url = df['url'][i]
    title = df['title'][i]
    body = df['body'][i]
    author = df['author'][i] if not pd.isnull(df['author'][i]) else ""
    source = df['source'][i]

    words = df['body'][i].split(' ')
    paragraphs_per_article = []
    
    for j in range(0, len(words), 200):
        paragraphs_per_article.append(' '.join(words[j:j + 250]))
    
    all_paragraphs.append(paragraphs_per_article)

    for idx, paragraph in enumerate(paragraphs_per_article):
        metadata = {
            "category": category,
            "url": url,
            "title": title,
            "author": author,
            "source": source,
            "paragraph_no": idx+1
        }
        while True:
            try:
                collection.add(
                    documents=paragraph,
                    metadatas=metadata,
                    ids=f"{i+1}_{idx+1}",
                )
                break
            except:
                count += 1
                time.sleep(3)
                continue

print(f"Total number of articles: {len(all_paragraphs)}")
print("Done!")

100%|██████████| 609/609 [40:14<00:00,  3.96s/it]  

Total number of articles: 609
Done!





In [None]:
collection.peek(1)

In [12]:
query = "Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?"
queries = [
    "Who is the figure associated with generative AI technology whose departure from OpenAI was considered shocking according to Fortune, and is also the subject of a prevailing theory suggesting a lack of full truthfulness with the board as reported by TechCrunch?",
    "What is the name of the individual who is the subject of a lawsuit filed by the US Securities and Exchange Commission, as reported by The Verge, and is accused of engaging in a fraudulent scheme to deceive investors?",
    "Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?",
    "Do the TechCrunch article on software companies and the Hacker News article on The Epoch Times both report an increase in revenue related to payment and subscription models, respectively?",
    "Which online betting platform provides a welcome bonus of up to $1000 in bonus bets for new customers' first losses, runs NBA betting promotions, and is anticipated to extend the same sign-up offer to new users in Vermont, as reported by both CBSSports.com and Sporting News?",
]
start_time = time.time()

for query in queries:
    query_start_time = time.time()
    results = collection.query(
            query_texts=query,
            n_results=10,
        )
    query_time = time.time() - query_start_time
    print(f"Query Time: {query_time:3} seconds")
    print("---")

total_time = time.time() - start_time
print(f"Time taken: {total_time:3} seconds")
print(f"Time taken per query: {total_time / len(queries):3} seconds")

Query Time: 0.4093012809753418 seconds
---
Query Time: 1.4662506580352783 seconds
---
Query Time: 0.4258270263671875 seconds
---
Query Time: 1.091294288635254 seconds
---
Query Time: 0.4633820056915283 seconds
---
Time taken: 3.8570642471313477 seconds
Time taken per query: 0.7714128494262695 seconds


In [13]:
metaprompt = """You are a helpful and informative bot that answers questions using text from the reference document included below. \
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
strike a friendly and converstional tone. \
Onlt use the data provided in the PASSAGE below.
Do not make any assumptions or use your own knowledge.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
"""

gemini_result = gemini.send_message(message=metaprompt.format(query=query, relevant_passage=results['documents'][0][0])).text

In [5]:
gemini_result

'I am sorry, but neither The Verge nor CNET articles on the iPhone 13 series are available in the text provided. Therefore, I cannot answer your question regarding the iPhone model with the best combination of camera features and battery life.'

In [83]:
results

{'ids': [['577_14',
   '577_24',
   '223_4',
   '601_16',
   '601_3',
   '418_11',
   '601_20',
   '92_7',
   '601_17',
   '92_5']],
 'distances': [[0.29910022020339966,
   0.3003997802734375,
   0.3014300465583801,
   0.30526375261953087,
   0.3058456271962432,
   0.30799537897109985,
   0.30879475528735056,
   0.3118619918823242,
   0.3129707620020653,
   0.3153727054595947]],
 'metadatas': [[{'author': 'Brenda Stolyar',
    'category': 'technology',
    'paragraph_no': 14,
    'source': 'Wired',
    'title': '54 Best Apple Black Friday Deals (2023): iPad, Apple Watch, AirPods',
    'url': 'https://www.wired.com/story/best-apple-black-friday-deals-2023-2/'},
   {'author': 'Brenda Stolyar',
    'category': 'technology',
    'paragraph_no': 24,
    'source': 'Wired',
    'title': '54 Best Apple Black Friday Deals (2023): iPad, Apple Watch, AirPods',
    'url': 'https://www.wired.com/story/best-apple-black-friday-deals-2023-2/'},
   {'author': 'Jeff Dunn,Valentina Palladino,Amy Skorheim

In [85]:
results['documents'][0][:3]

["the latest Apple Watch. If you're looking to upgrade, it offers faster performance, improved battery life, and a brighter display. There's also a new Double Tap feature that will trigger functions like playing music or hanging up a call by tapping your index finger and thumb. With the second-gen ultra-wideband chip, you can precision-locate your iPhone and AirPods too.\n\nIf you don't need all the in-depth capabilities that come with the Apple Watch Series 9, the second-generation Apple Watch SE is a great alternative. It still tracks crucial health and fitness features, and with the S8 chip, you'll also get Crash Detection and the redesigned Compass app. It has support for WatchOS 10 too, so you'll have access to the latest software updates.\n\nThe second-gen Apple Watch Ultra (8/10, WIRED Recommends) packs a brighter 3,000-nit display, the latest S9 chip, and the new ultra-wideband chip for precision-locating your phone. You still get the useful Action button, the three-mic array, 