# Run this code in google colab:
- first select t4 gpu in runtime then go forward and run these codes 
- then download the vector_db.zip file and paste the vector db folder in your project directory

In [1]:
!pip install langchain-huggingface langchain-community chromadb tqdm

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloa

In [2]:
!pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4


In [3]:
!pip uninstall -y torch torchvision sentence-transformers transformers

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Found existing installation: transformers 4.54.0
Uninstalling transformers-4.54.0:
  Successfully uninstalled transformers-4.54.0


In [4]:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
!pip install sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90

# before going forward make sure you restart the session


In [None]:
import os
import json
import shutil
from tqdm import tqdm
from pathlib import Path
import logging
from typing import List, Tuple
import torch
from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from google.colab import files  # For downloading files in Colab

# Configuration - Updated for semantic chunking
CONFIG = {
    "breakpoint_threshold_type": "percentile",  # "percentile", "standard_deviation", or "interquartile"
    "breakpoint_threshold_amount": 80,         # percentile value (0-100) if using percentile
    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
    "persist_directory": "/content/vector_db",
    "collection_name": "arena2036_en",
    "input_file": "arena_data_en.jsonl",
    "batch_size": 32,                 # Batch size for Colab GPU memory
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "normalize_embeddings": True
}

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

def load_documents(file_path: str) -> Tuple[List[Document], int]:
    documents, skipped, empty_content, parsing_errors = [], 0, 0, 0
    try:
        file_size = Path(file_path).stat().st_size
        with open(file_path, 'r', encoding='utf-8') as f, tqdm(
            total=file_size, unit='B', unit_scale=True, desc="Loading documents"
        ) as pbar:
            for line_num, line in enumerate(f, 1):
                pbar.update(len(line.encode('utf-8')))
                try:
                    rec = json.loads(line)
                    content = rec.get('content', '').strip()
                    if not content:
                        empty_content += 1
                        skipped += 1
                        continue
                    metadata = {
                        'url': rec.get('url', ''),
                        'title': rec.get('title', ''),
                        'source': Path(rec.get('url', '')).stem
                    }
                    documents.append(Document(page_content=content, metadata=metadata))
                except json.JSONDecodeError:
                    parsing_errors += 1
                    skipped += 1
                    logger.warning(f"JSON error in line {line_num}")
                except Exception as e:
                    parsing_errors += 1
                    skipped += 1
                    logger.error(f"Error processing line {line_num}: {e}")
    except Exception as e:
        logger.error(f"Failed to load documents: {e}")
        raise
    logger.info(f"Docs loaded: {len(documents)}, skipped: {skipped}, empty: {empty_content}, errors: {parsing_errors}")
    return documents, skipped

def create_vector_store(documents: List[Document], config: dict, recreate: bool = False) -> Tuple[Chroma, List[Document]]:
    os.makedirs(config["persist_directory"], exist_ok=True)

    # Initialize embeddings - same model used for both chunking and vector store
    embeddings = HuggingFaceEmbeddings(
        model_name=config["embedding_model"],
        model_kwargs={"device": config["device"]},
        encode_kwargs={"normalize_embeddings": config["normalize_embeddings"]}
    )

    # Create semantic chunker
    logger.info("Creating semantic chunks...")
    text_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type=config["breakpoint_threshold_type"],
        breakpoint_threshold_amount=config.get("breakpoint_threshold_amount", None)
    )

    chunks = []
    for doc in tqdm(documents, desc="Semantic chunking documents"):
        chunks.extend(text_splitter.split_documents([doc]))

    avg_size = sum(len(c.page_content) for c in chunks)/len(chunks) if chunks else 0
    logger.info(f"{len(chunks)} chunks created, avg size: {avg_size:.1f}")

    if recreate and os.path.exists(config["persist_directory"]):
        logger.info("Clearing existing store...")
        shutil.rmtree(config["persist_directory"])

    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=config["persist_directory"],
        collection_name=config["collection_name"]
    )

    logger.info("Indexing chunks...")
    for i in tqdm(range(0, len(chunks), config["batch_size"]), desc="Indexing batches"):
        vectorstore.add_documents(chunks[i:i+config["batch_size"]])
    vectorstore.persist()
    return vectorstore, chunks

def validate_chunks(chunks: List[Document], sample_size: int = 3):
    logger.info(f"Validating first {sample_size} chunks...")
    for i, c in enumerate(chunks[:sample_size],1):
        logger.info(f"Chunk {i}: {len(c.page_content)} chars")

def main():
    logger.info(f"Running on device: {CONFIG['device']}")
    logger.info(f"Semantic chunking with {CONFIG['breakpoint_threshold_type']} threshold")
    docs, skipped = load_documents(CONFIG['input_file'])
    if not docs: return
    store, chunks = create_vector_store(docs, CONFIG, recreate=True)
    logger.info(f"Stored {store._collection.count()} vectors")
    validate_chunks(chunks)
    # ZIP and download
    shutil.make_archive("/content/vector_db","zip",CONFIG['persist_directory'])
    files.download("/content/vector_db.zip")

if __name__ == '__main__':
    main()

Loading documents: 100%|██████████| 25.0M/25.0M [00:00<00:00, 159MB/s]
Semantic chunking documents: 100%|██████████| 296/296 [05:58<00:00,  1.21s/it]
  vectorstore = Chroma(
Indexing batches: 100%|██████████| 95/95 [01:47<00:00,  1.13s/it]
  vectorstore.persist()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>