In [1]:
import pandas as pd
from datetime import datetime
from textwrap import shorten
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
import time

from LS_AMG_RAG import utils
from tqdm.notebook import tqdm

# Load environment variables from .env file
load_dotenv()

uri = os.getenv("MULTIHOP_RAG_URI")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

db = client['multihop-rag']
corpus = db['corpus']

In [2]:
df = pd.read_csv('multi_hop_rag_dataset.csv')
df.head(1)

Unnamed: 0,category,url,body,title,author,published_at,source
0,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",200+ of the best deals from Amazon's Cyber Mon...,,2023-11-27T08:45:59+00:00,Mashable


In [6]:
# First article:
print(f"Category: {df['category'][0]}")
print(f"URL: {df['url'][0]}")
print(f"Title: {df['title'][0]}")
print(f"Body: {shorten(df['body'][0], width=100, placeholder='...')}")
print(f"Author(s): {df['author'][0]}")
print(f"Published Date: {datetime.strptime(df['published_at'][0], '%Y-%m-%dT%H:%M:%S%z').strftime('%Y-%m-%d')}")
print(f"Source: {df['source'][0]}")

paragraphs = df['body'][0].split('\n\n')
paragraph_embeddings = []
paragraph_len = []

start_time = time.time()
for paragraph in paragraphs:
    paragraph_len.append(len(paragraph.split(' ')))
    paragraph_embeddings.append(utils.gemini_vector(text=paragraph, title=df['title'][0]))

print(f"Paragraphs: {len(paragraphs)}")
print(f"Average words per paragraph: {sum(paragraph_len) / len(paragraph_len)}")
print(f"Minimum words per paragraph: {min(paragraph_len)}")
print(f"Maximum words per paragraph: {max(paragraph_len)}")
print(f"Time taken to embed {len(paragraphs)} paragraphs: {time.time() - start_time} seconds")

In [3]:
all_paragraphs = []
all_paragraph_embeddings = []
for i in tqdm(range(len(df))):

    category = df['category'][i]
    url = df['url'][i]
    title = df['title'][i]
    body = df['body'][i]
    author = df['author'][i] if not pd.isnull(df['author'][i]) else None
    source = df['source'][i]

    words = df['body'][i].split(' ')
    paragraphs_per_article = []
    paragraph_embeddings = []
    
    for j in range(0, len(words), 200):
        paragraphs_per_article.append(' '.join(words[j:j + 250]))
    
    all_paragraphs.append(paragraphs_per_article)
    documents_per_article = []
    for idx, paragraph in enumerate(paragraphs_per_article):
        paragraph_embedding = utils.gemini_vector(text=paragraph, title=df['title'][i])
        paragraph_embeddings.append(paragraph_embedding)
        document = {
            'category': category,
            'url': url,
            'title': title,
            'body': body,
            'author': author,
            'source': source,
            'paragraph': paragraph,
            'paragraph_no': idx+1,
            'paragraph_embedding': paragraph_embedding,
        }
        documents_per_article.append(document)
    
    all_paragraph_embeddings.append(paragraph_embeddings)

    
    result = corpus.insert_many(documents_per_article)

paragraph_len = [len(paragraphs) for paragraphs in all_paragraphs]

print(f"Total number of articles: {len(all_paragraphs)}")
print(f"Total number of paragraphs: {sum([len(paragraphs) for paragraphs in all_paragraphs])}")
print(f"Average paragraphs per article: {sum(paragraph_len) / len(paragraph_len)}")

  0%|          | 0/609 [00:00<?, ?it/s]

Total number of articles: 609
Total number of paragraphs: 5482
Average paragraphs per article: 9.001642036124794


In [None]:
for i in tqdm(range(len(df))):

    category = df['category'][i]
    url = df['url'][i]
    title = df['title'][i]
    body = df['body'][i]
    author = df['author'][i] if not pd.isnull(df['author'][i]) else None
    source = df['source'][i]

    doc_id = corpus.find_one({'url': url})['_id']

    # metadata code

    metadata = {
        # key: value
        'doc_id': doc_id,
    }