# Creating Vector DB for ARGOG Paper

In [1]:
# install packages if they dont exist:
import subprocess
import sys

# Function to install a package if not already installed
def install_package(package_name, import_name=None):
    try:
        if import_name is None:
            import_name = package_name
        # Use __import__ for top-level modules, import for submodules
        if "." in import_name:
            exec(f"from {import_name.rsplit('.', 1)[0]} import {import_name.rsplit('.', 1)[1]}")
        else:
            __import__(import_name)
    except ImportError:
        print(f"{package_name} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    else:
        print(f"{package_name} is already installed.")

# List of packages to install with their import names if different
packages = [
    ("datasets", "datasets"),
    ("pandas", "pandas"),
    ("llama-index", "llama_index"),
    ("chromadb", "chromadb"),
    ("openai", "openai"),
    ("dotenv", "dotenv"),
    ("llama-index-vector-stores-postgres", "llama_index.vector_stores"), # Install the postgres vector store if you intend to use it
    ("llama-index-vector-stores-chroma", "llama-index-vector-stores-chroma"),
    ("nest_asyncio", "nest_asyncio"),
    ("tenacity", "tenacity")

]

# Install each package
for package_name, import_name in packages:
    install_package(package_name, import_name)

# Check for `utils` module, typically custom or local
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ARGOG')
try:
    import utils
except ImportError:
    print("The 'utils' module is not found. Make sure it's available in your environment or install it manually.")

datasets not found. Installing...
pandas is already installed.
llama-index not found. Installing...
chromadb not found. Installing...
openai is already installed.
dotenv is already installed.
llama-index-vector-stores-postgres not found. Installing...
llama-index-vector-stores-chroma not found. Installing...
nest_asyncio is already installed.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Importing necessary libraries for loading datasets, data manipulation, document processing, vector storage, and embeddings.
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
import chromadb
from llama_index.core.node_parser import TokenTextSplitter
from utils import chunked_iterable, load_config
from llama_index.vector_stores.chroma import ChromaVectorStore
import openai
import os
from utils import *



In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Verify the directory exists
argog_path = '/content/drive/MyDrive/Colab Notebooks/ARGOG'
if os.path.exists(argog_path):
    print(f"Directory exists: {argog_path}")
else:
    print(f"Directory does not exist: {argog_path}")

# Path to the .env file
env_path = '/content/drive/MyDrive/Colab Notebooks/ARGOG/.env'

# Load the .env file
load_dotenv(dotenv_path=env_path)

# Hardcoded values for easy adjustment
CHUNK_SIZE = 1000 #only for db upload
# TOKEN_CHUNK_SIZE = 512
TOKEN_CHUNK_SIZE = 1024
CHUNK_OVERLAP = 50

# Load the config file
load_config()
openai.api_key = os.getenv("OPENAI_API_KEY")
# print(openai.api_key)
hf_read_token = os.getenv("HF_READ_TOKEN")
from huggingface_hub import login
login(hf_read_token)
print (f'run on: {pd.Timestamp.now()}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory exists: /content/drive/MyDrive/Colab Notebooks/ARGOG
run on: 2024-12-18 01:23:24.513355


Need to redo this to read the csv and sthen store it locally and retrieve it.

In [4]:
# import pandas as pd

# Define the file path
file_path = '/content/drive/MyDrive/Colab Notebooks/ARGOG/archiv_data.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame
# print(df.head())


In [5]:
# Specify the titles of the required papers
required_paper_titles = [
    'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
    'DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter',
    'HellaSwag: Can a Machine Really Finish Your Sentence?',
    'LLaMA: Open and Efficient Foundation Language Models',
    'Measuring Massive Multitask Language Understanding',
    'CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks',
    'Task2Vec: Task Embedding for Meta-Learning',
    'GLM-130B: An Open Bilingual Pre-trained Model',
    'SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems',
    "Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism",
    "PAL: Program-aided Language Models",
    "RoBERTa: A Robustly Optimized BERT Pretraining Approach",
    "DetectGPT: Zero-Shot Machine-Generated Text Detection using Probability Curvature"
]

In [6]:
# Filter the DataFrame to include only the required papers
required_papers = df[df['title'].isin(required_paper_titles)]

# Exclude the already selected papers to avoid duplicates and randomly sample ~40-50 papers
remaining_papers = df[~df['title'].isin(required_paper_titles)].sample(n=40, random_state=123)

# Concatenate the two DataFrames
final_df = pd.concat([required_papers, remaining_papers], ignore_index=True)


In [7]:
# Prepare document objects from the dataset for indexing
documents = [Document(text=content) for content in df['content']]


In [8]:
# Setup the embedding model
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

chroma_client = chromadb.PersistentClient(path="./chroma_db")


In [9]:
# from VS code notebook
# Classic vector DB
# Initialize a text splitter with hardcoded values for chunking documents
parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)

In [24]:
# !pip install tenacity
import nest_asyncio
nest_asyncio.apply()
import time
from tenacity import retry, wait_exponential, stop_after_attempt

# ... (your existing imports and code) ...

# **Define insert_batch_size and sleep_duration**
insert_batch_size = 10  # Adjust the batch size as needed
sleep_duration = 2  # Adjust the sleep duration as needed

# Instead of getting the collection, create it if it doesn't exist:
chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create the index object here
index = VectorStoreIndex(
    [],  # Initialize with an empty list of nodes
    storage_context=storage_context,
    embed_model=embed_model,
)

# **Change 1: Apply retry decorator with exponential backoff**
@retry(wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(6))
async def embed_with_retry(nodes_batch):
    # Now 'index' is accessible here
    return index.insert_nodes(nodes_batch)

# **Change 2: Insert nodes in batches with a delay**
for i in range(0, len(nodes), insert_batch_size):
    batch_nodes = nodes[i : i + insert_batch_size]
    # **Change 3: Call the decorated function to insert nodes**
    await embed_with_retry(batch_nodes)
    # Wait for the specified duration before processing the next batch
    time.sleep(sleep_duration)





RetryError: RetryError[<Future at 0x784a5cf3b0d0 state=finished raised RateLimitError>]

In [11]:
# # this code works. but has batch. Try without batchin nex section
# import nest_asyncio
# nest_asyncio.apply()

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Reduce the batch size for embedding to avoid rate limits
# # You can experiment with different values until it works reliably
# # insert_batch_size = 10 # Commenting this out. Adjust the value if needed.

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     # use_async=True, # Commenting out to disable asynchronous execution
#     # insert_batch_size=insert_batch_size # Commenting this out. Adjust the value if needed.
# )

In [12]:
# # this code works. but has batch. Try without batchin nex section
# # not working now
# import nest_asyncio
# nest_asyncio.apply()

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Reduce the batch size for embedding to avoid rate limits
# # You can experiment with different values until it works reliably
# insert_batch_size = 10

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True,
#     insert_batch_size=insert_batch_size # Add this line to control the batch size
# )

In [13]:
# this code works. but has batch. Try without batchin nex section
# not working now
# import nest_asyncio
# nest_asyncio.apply()
# import time

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Reduce the batch size for embedding to avoid rate limits
# # You can experiment with different values until it works reliably
# insert_batch_size = 5  # Reduced batch size further

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True,
#     insert_batch_size=insert_batch_size # Add this line to control the batch size
# )

# # Add a delay between batches to avoid rate limits
# for i in range(0, len(nodes), insert_batch_size):
#     # Process a batch of nodes
#     batch_nodes = nodes[i : i + insert_batch_size]

#     # ... your existing code to process the batch ...

#     # Wait for a short duration before processing the next batch
#     time.sleep(2) # Adjust the sleep duration as needed

In [14]:
# # this code works. but has batch. Try without batchin nex section
# # not working now
# import nest_asyncio
# nest_asyncio.apply()
# import time

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Reduce the batch size for embedding to avoid rate limits
# # You can experiment with different values until it works reliably
# insert_batch_size = 5  # Reduced batch size further

# # **Change 1: Increase sleep duration**
# sleep_duration = 60  # Increase sleep duration to 1 minute or more

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True,
#     insert_batch_size=insert_batch_size # Add this line to control the batch size
# )

# # Add a delay between batches to avoid rate limits
# for i in range(0, len(nodes), insert_batch_size):
#     # Process a batch of nodes
#     batch_nodes = nodes[i : i + insert_batch_size]

#     # ... your existing code to process the batch ...

#     # Wait for a short duration before processing the next batch
#     time.sleep(sleep_duration) # **Change 2: Apply the increased sleep duration**

In [15]:
# # this code works. but has batch. Try without batchin nex section
# # not working now
# import nest_asyncio
# nest_asyncio.apply()
# import time

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Reduce the batch size for embedding to avoid rate limits
# # You can experiment with different values until it works reliably
# insert_batch_size = 5  # Reduced batch size further

# # **Change 1: Increase sleep duration**
# sleep_duration = 120  # Increase sleep duration to 2 minutes or more

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True,
#     insert_batch_size=insert_batch_size # Add this line to control the batch size
# )

# # **Change 2: Insert nodes in batches with a delay**
# for i in range(0, len(nodes), insert_batch_size):
#     batch_nodes = nodes[i : i + insert_batch_size]
#     # Insert the batch of nodes into the index
#     index.insert_nodes(batch_nodes)
#     # Wait for the specified duration before processing the next batch
#     time.sleep(sleep_duration)

In [16]:
# # if the above works
# # Sentence window
# node_parser_sentence_window = SentenceWindowNodeParser.from_defaults(
#     window_size=3,
#     window_metadata_key="window",
#     original_text_metadata_key="original_text",
# )
# nodes_sentence_window = node_parser_sentence_window.get_nodes_from_documents(documents)
# nodes_sentence_window_filtered = filter_large_nodes(nodes_sentence_window)

# # chroma_collection_sentence_window = chroma_client.create_collection("ai_arxiv_sentence_window")

# chroma_collection_sentence_window = chroma_client.create_collection(
#     "ai_arxiv_sentence_window", get_or_create=True
# )

# vector_store_sentence_window = ChromaVectorStore(chroma_collection=chroma_collection_sentence_window)

# storage_context_sentence_window = StorageContext.from_defaults(vector_store=vector_store_sentence_window)

In [17]:
# # try this code wihtout batch and see what happens.
# # does not work without batch
# import nest_asyncio
# nest_asyncio.apply()

# # Continue with your existing code
# # Instead of getting the collection, create it if it doesn't exist:
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True
# )

In [18]:
# import nest_asyncio
# nest_asyncio.apply()

# # Continue with your existing code
# chroma_collection = chroma_client.get_collection("ai_arxiv_full")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True
# )


In [19]:
# Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# chroma_collection = chroma_client.create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True
# )

In [20]:
# # !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# # Instead of creating a new collection, try to get the existing one.
# # If it doesn't exist, it will be created.
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Remove use_async=True to avoid nested event loop issues.
# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model
#     # use_async=True  # Removed this line
# )

In [21]:
# !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# chroma_collection = chroma_client.create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True # This line triggers the error because an event loop is already running in Jupyter
# )

In [22]:
# !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# # Instead of creating a new collection, try to get the existing one.
# # If it doesn't exist, it will be created.
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True # This line triggers the error because an event loop is already running in Jupyter
# )