## Colab Notebook

#### Creating the vector DB (FAISS index over arXiv CS paper metadata)

In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install -q transformers sentence-transformers faiss-cpu langchain langchain_community tqdm pandas torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
!pip install langchain_huggingface

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-0.1.2


In [13]:
# Path of the arXiv metadata snapshot on the mounted Drive.
# NOTE(review): main() assigns its own local `file_path` with this same value,
# so this module-level variable is not what main() reads.
file_path = "/content/drive/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json"

In [14]:
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from multiprocessing import Pool, cpu_count
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
import faiss
import gc

In [15]:
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import gc
import psutil
import logging
from typing import List, Dict, Generator

In [16]:
# Decide on the compute device once, then report what torch detected
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"CUDA Version: {torch.version.cuda}")
print(f"CUDA Available: {device == 'cuda'}")
print(f"Using device: {device}")

CUDA Version: 12.4
CUDA Available: True
Using device: cuda


In [17]:
# Logger used by the classes and functions defined in this notebook
logger = logging.getLogger(__name__)
# Set the root logger to INFO so logger.info() progress messages are shown
logging.basicConfig(level=logging.INFO)

In [18]:
class MemoryMonitor:
    """Static helpers for measuring, logging and releasing process memory."""

    @staticmethod
    def get_memory_usage():
        """Return the resident set size (RSS) of the current process in GB.

        "GB" here means bytes divided by 1024 three times.
        """
        rss_bytes = psutil.Process().memory_info().rss
        return rss_bytes / 1024 / 1024 / 1024

    @staticmethod
    def log_memory_usage(step: str):
        """Write the current memory usage to the log, labelled with `step`."""
        logger.info(f"Memory usage at {step}: {MemoryMonitor.get_memory_usage():.2f} GB")

    @staticmethod
    def clear_memory():
        """Run the garbage collector and, when CUDA is available, empty torch's CUDA cache."""
        gc.collect()
        cuda_present = torch.cuda.is_available()
        if cuda_present:
            torch.cuda.empty_cache()
        logger.info("Memory cleared")

In [19]:
class Config:
    """Tunable settings for the vector-database build, grouped by concern."""

    # --- Model and output location ---
    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    VECTOR_DB_PATH = "/content/cs_papers_faiss_db"

    # --- Memory budget, in GB ---
    MAX_MEMORY_GB = 12  # adjust to the machine running the notebook
    MEMORY_THRESHOLD_GB = 10  # process memory above this triggers a cleanup

    # --- Streaming and batching ---
    CHUNK_SIZE = 1_000  # rows per pd.read_json chunk
    EMBED_BATCH_SIZE = 256  # intended batch size for embedding
    SAVE_INTERVAL = 10_000  # papers processed between index saves
    MAX_PAPERS = 500_000  # upper bound on papers streamed

    # --- Text splitting, in characters ---
    CHUNK_SIZE_CHARS = 1_500
    CHUNK_OVERLAP = 200


In [20]:
class PaperProcessor:
    """Owns the embedding model and text splitter, and streams CS papers from a JSON-lines file."""

    # "cs." only counts when it is not the tail of a longer word. The previous
    # pattern 'cs.' was an unescaped regex ("cs" + any character) and, even read
    # literally, is a substring of "physics.", so physics papers slipped through.
    CS_CATEGORY_PATTERN = r'(?<![A-Za-z])cs\.'

    def __init__(self):
        self.embeddings = None
        self.text_splitter = None
        self.initialize_components()

    def initialize_components(self):
        """Create the embedding model and text splitter, logging memory before and after."""
        MemoryMonitor.log_memory_usage("before_initialization")

        from langchain_huggingface import HuggingFaceEmbeddings
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.embeddings = HuggingFaceEmbeddings(
            model_name=Config.MODEL_NAME,
            model_kwargs={'device': device},
            # Make the configured batch size effective; without this the
            # Config.EMBED_BATCH_SIZE setting was never passed to the encoder.
            encode_kwargs={'batch_size': Config.EMBED_BATCH_SIZE}
        )

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE_CHARS,
            chunk_overlap=Config.CHUNK_OVERLAP
        )

        MemoryMonitor.log_memory_usage("after_initialization")

    @staticmethod
    def _filter_cs(chunk):
        """Return the rows of `chunk` whose 'categories' value contains a cs.* category."""
        is_cs = chunk['categories'].str.contains(PaperProcessor.CS_CATEGORY_PATTERN, na=False)
        return chunk[is_cs]

    @staticmethod
    def _text(value) -> str:
        """Coerce a field value to str, mapping missing values (None / NaN) to ''."""
        if isinstance(value, str):
            return value
        # `value != value` is True only for NaN
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    def process_papers_stream(self, file_path: str) -> Generator[Dict, None, None]:
        """Stream CS papers from a JSON-lines file as plain dicts.

        Args:
            file_path: Path to the JSON-lines metadata file.

        Yields:
            Dicts with keys 'id', 'title', 'authors', 'abstract', 'categories'
            and 'date' (taken from 'update_date'). Missing values become ''.
            At most Config.MAX_PAPERS dicts are yielded.
        """
        papers_processed = 0

        # dtype pins 'id' to str so numeric-looking ids (e.g. "0704.0001") are
        # not converted to floats by read_json's dtype inference. The `with`
        # block closes the underlying file even when we return early.
        with pd.read_json(
            file_path,
            lines=True,
            chunksize=Config.CHUNK_SIZE,
            dtype={'id': str}
        ) as reader:
            for chunk in reader:
                # Check memory once per chunk rather than once per paper: when
                # usage stays above the threshold, a per-paper check would run
                # a full gc pass and emit a warning for every single row.
                if MemoryMonitor.get_memory_usage() > Config.MEMORY_THRESHOLD_GB:
                    logger.warning("Memory threshold reached, clearing memory...")
                    MemoryMonitor.clear_memory()

                cs_papers = self._filter_cs(chunk)

                for _, paper in cs_papers.iterrows():
                    if papers_processed >= Config.MAX_PAPERS:
                        return

                    yield {
                        'id': self._text(paper.get('id', '')),
                        'title': self._text(paper.get('title', '')).strip(),
                        'authors': self._text(paper.get('authors', '')),
                        'abstract': self._text(paper.get('abstract', '')).strip(),
                        'categories': self._text(paper.get('categories', '')),
                        'date': self._text(paper.get('update_date', ''))
                    }

                    papers_processed += 1

In [21]:
class VectorDBBuilder:
    """Collects text chunks for papers and persists them into a single FAISS index."""

    def __init__(self, processor: "PaperProcessor"):
        self.processor = processor
        self.current_batch_texts = []
        self.current_batch_metadata = []
        # FAISS store holding every batch saved so far; created on the first
        # non-empty save_batch() call and extended by later ones.
        self.db = None

    def process_paper(self, paper: Dict):
        """Split one paper into chunks and queue them (with metadata) in the current batch.

        Args:
            paper: Dict with 'title', 'authors', 'abstract', 'categories' and 'id' keys.
        """
        text = f"""Title: {paper['title']}
Authors: {paper['authors']}
Abstract: {paper['abstract']}
Categories: {paper['categories']}
ID: {paper['id']}"""

        chunks = self.processor.text_splitter.split_text(text)

        # Add chunks and metadata to current batch
        self.current_batch_texts.extend(chunks)
        self.current_batch_metadata.extend([{
            'paper_id': paper['id'],
            'title': paper['title'],
            'chunk_index': i,
            'total_chunks': len(chunks)
        } for i in range(len(chunks))])

    def save_batch(self, save_path: str, is_final: bool = False):
        """Embed the queued chunks, add them to the index and write the index to disk.

        Previously each call built a brand-new index from the current batch only
        and saved it to the same path, so every save overwrote the one before and
        only the last batch survived. The index is now kept on the instance and
        extended, so the saved index contains all batches.

        Args:
            save_path: Directory the FAISS index is written to.
            is_final: When True, the total vector count is logged after saving.

        Returns:
            None. Does nothing when the current batch is empty.
        """
        if not self.current_batch_texts:
            return

        logger.info(f"Saving batch of {len(self.current_batch_texts)} chunks...")
        MemoryMonitor.log_memory_usage("before_embedding")

        from langchain_community.vectorstores import FAISS

        # Create the index on the first batch, extend it on later ones.
        # getattr keeps this safe for instances built without __init__.
        if getattr(self, 'db', None) is None:
            self.db = FAISS.from_texts(
                texts=self.current_batch_texts,
                embedding=self.processor.embeddings,
                metadatas=self.current_batch_metadata
            )
        else:
            self.db.add_texts(
                texts=self.current_batch_texts,
                metadatas=self.current_batch_metadata
            )

        # Save the full (cumulative) index
        self.db.save_local(save_path)
        if is_final:
            logger.info(f"Final index holds {self.db.index.ntotal} vectors")

        # Clear batch
        self.current_batch_texts = []
        self.current_batch_metadata = []

        MemoryMonitor.clear_memory()
        MemoryMonitor.log_memory_usage("after_save")

In [22]:
def main():
    """Build the vector database from the arXiv snapshot stored on Drive.

    Streams papers from a PaperProcessor into a VectorDBBuilder, saving the
    builder's batch every Config.SAVE_INTERVAL papers and once more after the
    stream is exhausted. Any exception is logged and then re-raised.
    """
    logger.info("Starting vector database creation...")

    processor = PaperProcessor()
    db_builder = VectorDBBuilder(processor)

    file_path = "/content/drive/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json"
    papers_processed = 0  # stays 0 when the stream yields nothing

    try:
        paper_stream = processor.process_papers_stream(file_path)
        for papers_processed, paper in enumerate(paper_stream, start=1):
            db_builder.process_paper(paper)

            # Only act on multiples of the save interval
            if papers_processed % Config.SAVE_INTERVAL:
                continue
            logger.info(f"Processed {papers_processed} papers...")
            db_builder.save_batch(Config.VECTOR_DB_PATH)

        # Save whatever is still queued after the last interval
        logger.info("Saving final batch...")
        db_builder.save_batch(Config.VECTOR_DB_PATH, is_final=True)

    except Exception as err:
        logger.error(f"Error during processing: {err}")
        raise

    logger.info(f"Completed processing {papers_processed} papers")

In [23]:
# Run the build only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()