# Creating Vector DB for ARGOG Paper

In [None]:
# # install packages if they dont exist:
# import subprocess
# import sys

# # Function to install a package if not already installed
# def install_package(package_name, import_name=None):
#     try:
#         if import_name is None:
#             import_name = package_name
#         __import__(import_name)
#     except ImportError:
#         print(f"{package_name} not found. Installing...")
#         subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
#     else:
#         print(f"{package_name} is already installed.")

# # List of packages to install with their import names if different
# packages = [
#     ("datasets", "datasets"),
#     ("pandas", "pandas"),
#     ("llama-index", "llama_index"),
#     ("chromadb", "chromadb"),
#     ("openai", "openai"),
#     ("dotenv", "dotenv"),
#     ("llama_index.vector_stores", "llama_index.vector_stores"),
#     # ("utils", "utils")

# ]

# # Install each package
# for package_name, import_name in packages:
#     install_package(package_name, import_name)

# # Check for `utils` module, typically custom or local
# drive.mount('/content/drive')
# import sys
# sys.path.append('/content/drive/MyDrive/Colab Notebooks/ARGOG')
# try:
#     import utils
# except ImportError:
#     print("The 'utils' module is not found. Make sure it's available in your environment or install it manually.")


In [None]:
# install packages if they dont exist:
import subprocess
import sys

# Function to install a package if not already installed
def install_package(package_name, import_name=None):
    try:
        if import_name is None:
            import_name = package_name
        # Use __import__ for top-level modules, import for submodules
        if "." in import_name:
            exec(f"from {import_name.rsplit('.', 1)[0]} import {import_name.rsplit('.', 1)[1]}")
        else:
            __import__(import_name)
    except ImportError:
        print(f"{package_name} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    else:
        print(f"{package_name} is already installed.")

# List of packages to install with their import names if different
packages = [
    ("datasets", "datasets"),
    ("pandas", "pandas"),
    ("llama-index", "llama_index"),
    ("chromadb", "chromadb"),
    ("openai", "openai"),
    ("dotenv", "dotenv"),
    ("llama-index-vector-stores-postgres", "llama_index.vector_stores"), # Install the postgres vector store if you intend to use it
    ("llama-index-vector-stores-chroma", "llama-index-vector-stores-chroma"),
    ("nest_asyncio", "nest_asyncio")
    # ("utils", "utils

]

# Install each package
for package_name, import_name in packages:
    install_package(package_name, import_name)

# Check for `utils` module, typically custom or local
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ARGOG')
try:
    import utils
except ImportError:
    print("The 'utils' module is not found. Make sure it's available in your environment or install it manually.")

datasets not found. Installing...
pandas is already installed.
llama-index not found. Installing...
chromadb not found. Installing...
openai is already installed.
dotenv is already installed.
llama-index-vector-stores-postgres not found. Installing...
llama-index-vector-stores-chroma not found. Installing...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Importing necessary libraries for loading datasets, data manipulation, document processing, vector storage, and embeddings.
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
import chromadb
from llama_index.core.node_parser import TokenTextSplitter
from utils import chunked_iterable, load_config
from llama_index.vector_stores.chroma import ChromaVectorStore
import openai
import os
from utils import *



In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Verify the directory exists
argog_path = '/content/drive/MyDrive/Colab Notebooks/ARGOG'
if os.path.exists(argog_path):
    print(f"Directory exists: {argog_path}")
else:
    print(f"Directory does not exist: {argog_path}")

# Path to the .env file
env_path = '/content/drive/MyDrive/Colab Notebooks/ARGOG/.env'

# Load the .env file
load_dotenv(dotenv_path=env_path)

# Hardcoded values for easy adjustment
CHUNK_SIZE = 1000 #only for db upload
TOKEN_CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Load the config file
load_config()
openai.api_key = os.getenv("OPENAI_API_KEY")
print(openai.api_key)
hf_read_token = os.getenv("HF_READ_TOKEN")
from huggingface_hub import login
login(hf_read_token)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory exists: /content/drive/MyDrive/Colab Notebooks/ARGOG
sk-proj-NedhPl47SBdpZvVsIe2KT3BlbkFJspbYIje2ZfdzTrhdPqP5


In [None]:
# Load dataset and convert to DataFrame for easier manipulation
dataset = load_dataset("jamescalam/ai-arxiv")
df = pd.DataFrame(dataset['train'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/267 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/423 [00:00<?, ? examples/s]

In [None]:
# Specify the titles of the required papers
required_paper_titles = [
    'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
    'DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter',
    'HellaSwag: Can a Machine Really Finish Your Sentence?',
    'LLaMA: Open and Efficient Foundation Language Models',
    'Measuring Massive Multitask Language Understanding',
    'CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks',
    'Task2Vec: Task Embedding for Meta-Learning',
    'GLM-130B: An Open Bilingual Pre-trained Model',
    'SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems',
    "Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism",
    "PAL: Program-aided Language Models",
    "RoBERTa: A Robustly Optimized BERT Pretraining Approach",
    "DetectGPT: Zero-Shot Machine-Generated Text Detection using Probability Curvature"
]

In [None]:
# Filter the DataFrame to include only the required papers
required_papers = df[df['title'].isin(required_paper_titles)]

# Exclude the already selected papers to avoid duplicates and randomly sample ~40-50 papers
remaining_papers = df[~df['title'].isin(required_paper_titles)].sample(n=40, random_state=123)

# Concatenate the two DataFrames
final_df = pd.concat([required_papers, remaining_papers], ignore_index=True)


In [None]:
# Prepare document objects from the dataset for indexing
documents = [Document(text=content) for content in df['content']]


In [None]:
# Setup the embedding model
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

chroma_client = chromadb.PersistentClient(path="./chroma_db")


In [None]:
# from VS code notebook
# Classic vector DB
# Initialize a text splitter with hardcoded values for chunking documents
parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)

In [None]:
import nest_asyncio
nest_asyncio.apply()

# Continue with your existing code
chroma_collection = chroma_client.get_collection("ai_arxiv_full")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    use_async=True
)




RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for text-embedding-3-large in organization org-oQuebWhpieTzpJtTlWIVROVj on tokens per min (TPM): Limit 1000000, Used 1000000, Requested 52041. Please try again in 3.122s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
# Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# chroma_collection = chroma_client.create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True
# )

RuntimeError: Detected nested async. Please use nest_asyncio.apply() to allow nested event loops.Or, use async entry methods like `aquery()`, `aretriever`, `achat`, etc.

In [1]:
# # !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# # Instead of creating a new collection, try to get the existing one.
# # If it doesn't exist, it will be created.
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# # Remove use_async=True to avoid nested event loop issues.
# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model
#     # use_async=True  # Removed this line
# )



NameError: name 'TokenTextSplitter' is not defined

In [None]:
# !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# chroma_collection = chroma_client.create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True # This line triggers the error because an event loop is already running in Jupyter
# )

In [None]:
# !pip install nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()

# # Classic vector DB
# # Initialize a text splitter with hardcoded values for chunking documents
# parser = TokenTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# nodes = parser.get_nodes_from_documents(documents)

# # Instead of creating a new collection, try to get the existing one.
# # If it doesn't exist, it will be created.
# chroma_collection = chroma_client.get_or_create_collection("ai_arxiv_full")

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# index = VectorStoreIndex(
#     nodes, storage_context=storage_context,
#     embed_model=embed_model,
#     use_async=True # This line triggers the error because an event loop is already running in Jupyter
# )