# Run this notebook in Google Colab
- First select the T4 GPU runtime (Runtime → Change runtime type), then run the cells below in order.
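- When the run finishes, download the resulting ZIP archive (this Pinecone version produces `pinecone_backup.zip`; the earlier Chroma-based version produced `vector_db.zip`) and copy the extracted folder into your project directory.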
- Upload `arena_data_en.jsonl` and `arena_data_de.jsonl` to the Colab session before running the main cell.

In [None]:
!pip install langchain-huggingface langchain-community chromadb tqdm "pinecone[grpc]" langchain-pinecone

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloa

In [2]:
!pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4


In [3]:
!pip uninstall -y torch torchvision sentence-transformers transformers

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Found existing installation: transformers 4.54.0
Uninstalling transformers-4.54.0:
  Successfully uninstalled transformers-4.54.0


In [4]:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
!pip install sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90

# Before continuing, restart the session (Runtime → Restart session) so the reinstalled packages are picked up


In [None]:
import os
from getpass import getpass

# Do not paste the Pinecone API key into this cell: the notebook is saved and
# shared with its source, so a pasted key would be stored with it. Reuse a key
# that is already in the environment; otherwise ask for it with hidden input.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ").strip()

os.environ["PINECONE_REGION"] = "us-east-1"   # or your region


In [None]:
import os
import hashlib
import json
import shutil
import logging
import time
import sys
from pathlib import Path
from typing import List, Tuple, Set
from tqdm import tqdm
import torch
from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import Chroma  # COMMENTED OUT
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from google.colab import files  # For download in Colab

# Configuration for both collections
CONFIG = {
    # Sentence-transformers model used to embed (and semantically chunk) each language.
    "embedding_models": {
        "en": "sentence-transformers/all-mpnet-base-v2",
        "de": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    },
    "pinecone": {
        # Serverless region. Honours the PINECONE_REGION environment variable when it
        # is set and non-empty, otherwise falls back to us-east-1.
        "environment": os.getenv("PINECONE_REGION") or "us-east-1",
        "index_name_prefix": "arena2036",  # Will create arena2036-en and arena2036-de
        # Used for every index that gets created, so it must match the output size of
        # each model in "embedding_models" (768 for all-mpnet-base-v2).
        "dimension": 768,
        "metric": "cosine"
    },
    # Per-language input file (JSONL) and target Pinecone index.
    "collections": {
        "en": {
            "name": "arena2036_en",
            "input_file": "arena_data_en.jsonl",
            "index_name": "arena2036-en"
        },
        "de": {
            "name": "arena2036_de",
            "input_file": "arena_data_de.jsonl",
            "index_name": "arena2036-de"
        }
    },
    # Parameters forwarded to SemanticChunker.
    "chunking": {
        "breakpoint_threshold_type": "percentile",
        "breakpoint_threshold_amount": 80
    },
    # Reduced batch size to prevent memory issues
    "batch_size": 16,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "normalize_embeddings": True
}

# Logging setup with Colab compatibility.
# basicConfig() does nothing when the root logger already has handlers, which is
# commonly the case inside notebook kernels; INFO messages would then never be
# shown. force=True removes any existing root handlers and applies this config.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
    force=True,
)
logger = logging.getLogger(__name__)

# Helper function to ensure logs show up in Colab
def log_and_flush(message, level="info"):
    """Log ``message`` through the module logger, then flush the output streams.

    Args:
        message: Text to log.
        level: Level name, case-insensitive: "debug", "info", "warning"/"warn",
            "error" or "critical". Any other value is logged at INFO so that a
            typo in the level never causes a message to be silently dropped.
    """
    level_by_name = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "warn": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }
    numeric_level = level_by_name.get(str(level).lower(), logging.INFO)
    logger.log(numeric_level, message)

    # The StreamHandler configured for this notebook is created without an explicit
    # stream, so it writes to stderr; flush both streams so log lines and printed
    # output appear promptly.
    sys.stdout.flush()
    sys.stderr.flush()

def setup_pinecone():
    """Initialise the Pinecone client and make sure every configured index exists.

    The API key is read from the ``PINECONE_API_KEY`` environment variable. For
    each entry in ``CONFIG["collections"]`` the index named by ``index_name`` is
    created as an AWS serverless index (dimension, metric and region taken from
    ``CONFIG["pinecone"]``) unless an index with that name already exists. After
    creating an index the function polls until Pinecone reports it as ready.

    Returns:
        The initialised ``Pinecone`` client.

    Raises:
        ValueError: If ``PINECONE_API_KEY`` is unset or empty.
        TimeoutError: If a newly created index is not ready within the wait limit.
        Exception: Any error raised by the Pinecone client is logged and re-raised.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise ValueError("PINECONE_API_KEY environment variable not set!")

    pc = Pinecone(api_key=api_key)

    # Upper bound for the readiness poll so a stuck index cannot hang the notebook.
    ready_timeout_s = 300
    poll_interval_s = 1

    # Drive the loop from the configuration so that adding a collection there is
    # enough to get its index created.
    for collection in CONFIG["collections"].values():
        index_name = collection["index_name"]

        try:
            existing_indexes = {idx.name for idx in pc.list_indexes()}

            if index_name in existing_indexes:
                log_and_flush(f"Index {index_name} already exists")
                continue

            log_and_flush(f"Creating Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=CONFIG["pinecone"]["dimension"],
                metric=CONFIG["pinecone"]["metric"],
                spec=ServerlessSpec(
                    cloud="aws",
                    region=CONFIG["pinecone"]["environment"]
                )
            )

            # Wait for the index to be ready, but give up after ready_timeout_s.
            deadline = time.monotonic() + ready_timeout_s
            while not pc.describe_index(index_name).status['ready']:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f"Index {index_name} was not ready after {ready_timeout_s} seconds"
                    )
                log_and_flush(f"Waiting for index {index_name} to be ready...")
                time.sleep(poll_interval_s)
            log_and_flush(f"Index {index_name} created and ready!")

        except Exception as e:
            log_and_flush(f"Error with index {index_name}: {e}", "error")
            raise

    return pc

def make_source_id(url: str, title: str) -> str:
    """Derive a short, deterministic identifier for a source document.

    ``None`` or empty arguments are treated as empty strings. The identifier is
    the first 14 hexadecimal characters of the SHA-1 digest of ``"<url>|<title>"``
    encoded as UTF-8.
    """
    key = "|".join((url or "", title or ""))
    digest = hashlib.sha1(key.encode("utf-8"))
    return digest.hexdigest()[:14]

def flatten_documents(maybe_nested):
    """Return a flat list of the items in ``maybe_nested``, in their original order.

    Nested ``list`` values are expanded recursively and ``None`` entries are
    dropped. Only ``list`` instances are expanded; any other value (including
    tuples and strings) is kept as a single item.
    """
    def _walk(sequence):
        for entry in sequence:
            if entry is None:
                continue
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(maybe_nested))

def load_documents(file_path: str, language: str) -> Tuple[List[Document], int, List[str]]:
    """Load one ``Document`` per usable record of a JSONL file.

    Each non-blank line is expected to hold a JSON object with optional
    ``url``, ``title`` and ``content`` fields. A record is kept as long as at
    least one of the three is non-empty; the page content is ``title`` +
    newline + ``content`` when both exist, otherwise whichever of content,
    title or url is available.

    Lines that are not valid JSON, are not JSON objects, or have all three
    fields empty are skipped and described in the returned failure list
    instead of aborting the load.

    Args:
        file_path: Path of the JSONL file to read (UTF-8).
        language: Language tag stored in each document's metadata.

    Returns:
        ``(documents, number_of_documents, failed_lines)`` where
        ``failed_lines`` holds one human-readable message per skipped line.

    Raises:
        Exception: Any error while opening or reading the file is logged and
            re-raised.
    """
    documents = []
    failed_lines = []
    total_lines = 0

    # First pass: count non-blank lines so the progress bar and the summary
    # message have an accurate total.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            total_lines = sum(1 for line in f if line.strip())
        log_and_flush(f"Total lines in file: {total_lines}")
    except Exception as e:
        log_and_flush(f"Failed to count lines in file: {e}", "error")
        raise

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            with tqdm(total=total_lines, desc=f"Loading {language} documents") as progress:
                # line_num counts physical lines (blank ones included) so that
                # messages point at the right place in the file.
                for line_num, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    # Advance only for non-blank lines: that is what total_lines counts.
                    progress.update(1)

                    try:
                        rec = json.loads(line)
                    except json.JSONDecodeError as e:
                        error_msg = f"Line {line_num}: Invalid JSON - {e}"
                        log_and_flush(error_msg, "error")
                        failed_lines.append(error_msg)
                        continue

                    # A line such as `[1, 2]` or `"text"` is valid JSON but has no
                    # fields to read; record it as a failure instead of crashing.
                    if not isinstance(rec, dict):
                        error_msg = f"Line {line_num}: Expected a JSON object, got {type(rec).__name__}"
                        log_and_flush(error_msg, "error")
                        failed_lines.append(error_msg)
                        continue

                    # Lenient extraction: missing/null fields become "", and
                    # non-string values are converted rather than raising on .strip().
                    content = str(rec.get("content") or "").strip()
                    title = str(rec.get("title") or "").strip()
                    url = str(rec.get("url") or "").strip()

                    # Only skip if ALL fields are empty
                    if not content and not title and not url:
                        error_msg = f"Line {line_num}: All fields empty (url, title, content)"
                        log_and_flush(error_msg, "warning")
                        failed_lines.append(error_msg)
                        continue

                    # Pick the best available text for the document body.
                    if not content and title:
                        combined = title
                        log_and_flush(f"Using title as content for URL: {url}")
                    elif content and title:
                        combined = f"{title}\n{content}"
                    elif content:
                        combined = content
                    else:
                        # Last resort: use URL as content if nothing else
                        combined = url or f"Document from line {line_num}"
                        log_and_flush(f"Using URL as content for line {line_num}")

                    if url or title:
                        source_id = make_source_id(url, title)
                    else:
                        # Content-only records would all hash the same empty
                        # url/title pair and collapse into a single source id;
                        # key them on line number and text instead.
                        source_id = make_source_id(f"line_{line_num}", combined)

                    metadata = {
                        "url": url,
                        "title": title,
                        "source": Path(url).stem if url else f"line_{line_num}",
                        "language": language,
                        "source_id": source_id,
                        "original_line": line_num
                    }

                    documents.append(Document(page_content=combined, metadata=metadata))

    except Exception as e:
        log_and_flush(f"Failed to load documents: {e}", "error")
        raise

    log_and_flush(f"Successfully loaded {len(documents)} documents out of {total_lines} total lines")
    if failed_lines:
        log_and_flush(f"Failed to process {len(failed_lines)} lines:", "warning")
        for failed in failed_lines[:10]:  # Show first 10 failures
            log_and_flush(f"  {failed}", "warning")
        if len(failed_lines) > 10:
            log_and_flush(f"  ... and {len(failed_lines) - 10} more failures", "warning")

    return documents, len(documents), failed_lines

def safe_chunk_documents(text_splitter, docs: List[Document]) -> List[Document]:
    """Split ``docs`` into chunks, degrading gracefully when the splitter fails.

    The whole list is split in one call first. If that raises, every document
    is split on its own: a document that yields no chunks is kept whole
    (``chunk_method="fallback_original"``), and a document whose split raises is
    kept as a single chunk truncated to 8000 characters
    (``chunk_method="fallback_truncated"``).

    Returns:
        The flat list of resulting chunk documents.
    """
    log_and_flush("Starting document chunking process...")

    # Fast path: one call for the whole corpus.
    try:
        log_and_flush("Attempting batch chunking...")
        batch_result = flatten_documents(text_splitter.split_documents(docs))
        log_and_flush(f"Batch chunking successful: {len(batch_result)} chunks created")
        return batch_result
    except Exception as e:
        log_and_flush(f"Batch chunking failed: {e}. Falling back to individual document processing...", "warning")

    # Slow path: isolate failures to the documents that cause them.
    collected = []
    unrecoverable = []

    for position, document in enumerate(tqdm(docs, desc="Chunking documents individually")):
        try:
            pieces = flatten_documents(text_splitter.split_documents([document]))

            if not pieces:
                # The splitter produced nothing: keep the document as one chunk.
                log_and_flush(f"No chunks created for doc {position}, using original document", "warning")
                pieces = [
                    Document(
                        page_content=document.page_content,
                        metadata={**document.metadata, "chunk_method": "fallback_original"}
                    )
                ]

            collected.extend(pieces)

        except Exception as e:
            log_and_flush(f"Failed to chunk document {position} (source_id: {document.metadata.get('source_id', 'unknown')}): {e}", "error")
            try:
                # Keep at most the first 8000 characters of the original text.
                collected.append(
                    Document(
                        page_content=document.page_content[:8000],
                        metadata={**document.metadata, "chunk_method": "fallback_truncated"}
                    )
                )
                log_and_flush(f"Created fallback chunk for document {position}")
            except Exception as e2:
                log_and_flush(f"Even fallback chunk creation failed for document {position}: {e2}", "error")
                unrecoverable.append(document.metadata.get('source_id', f'doc_{position}'))

    log_and_flush(f"Individual chunking complete: {len(collected)} chunks created, {len(unrecoverable)} documents failed completely")
    if unrecoverable:
        log_and_flush(f"Completely failed document IDs: {unrecoverable}", "warning")

    return collected

def create_collection(language: str, config: dict, pc: Pinecone) -> None:
    """Build the Pinecone collection for one language and write local bookkeeping files.

    Pipeline, in order:
      1. Load the language's JSONL input with ``load_documents``.
      2. Split the documents with a ``SemanticChunker`` driven by the language's
         embedding model (via ``safe_chunk_documents``).
      3. Drop empty chunks, make sure every chunk carries a ``source_id``, and
         add a fallback chunk for any source document that ended up with none.
      4. Add the chunks to the configured Pinecone index (namespace = ``language``)
         in batches; chunks of a failed batch are retried one by one, up to
         three attempts each.
      5. Write a sources index, a chunks index and a processing report as JSON
         files to ``/content/pinecone_backup`` and print a summary.

    Args:
        language: Key into ``config["collections"]`` and
            ``config["embedding_models"]``; also used as the Pinecone namespace.
        config: Configuration dictionary with the keys read below
            (``collections``, ``embedding_models``, ``device``,
            ``normalize_embeddings``, ``chunking``, optional ``batch_size``).
        pc: Initialised Pinecone client used to open the index.

    Raises:
        ValueError: If no documents could be loaded for ``language``.
        Exception: Errors while opening the index or reading its stats are
            logged and re-raised. Errors while writing the backup files are
            only logged.
    """

    # Load documents
    input_file = config["collections"][language]["input_file"]
    docs, doc_count, failed_lines = load_documents(input_file, language)

    if not docs:
        raise ValueError(f"No {language} documents loaded!")

    log_and_flush(f"Loaded {doc_count} documents for language={language}")

    # Track original source IDs for verification
    # (a set: documents sharing a source_id count once).
    original_source_ids = {doc.metadata["source_id"] for doc in docs}
    log_and_flush(f"Tracking {len(original_source_ids)} unique source IDs")

    # Create sources index: source_id -> descriptive metadata.
    # If several documents share a source_id, the last one wins.
    sources_index = {
        d.metadata["source_id"]: {
            "title": d.metadata.get("title", ""),
            "url": d.metadata.get("url", ""),
            "language": d.metadata.get("language", ""),
            "original_line": d.metadata.get("original_line", "")
        } for d in docs
    }

    # Initialize embeddings; the same object is used for chunking and for the vector store.
    embeddings = HuggingFaceEmbeddings(
        model_name=config["embedding_models"][language],
        model_kwargs={"device": config["device"]},
        encode_kwargs={"normalize_embeddings": config["normalize_embeddings"]}
    )

    # Create semantic text splitter
    text_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type=config["chunking"]["breakpoint_threshold_type"],
        breakpoint_threshold_amount=config["chunking"]["breakpoint_threshold_amount"]
    )

    log_and_flush("Starting robust document chunking...")
    sys.stdout.flush()

    chunks = safe_chunk_documents(text_splitter, docs)

    # Clean chunks: drop whitespace-only chunks and guarantee a source_id on the rest.
    clean_chunks = []
    for i, c in enumerate(chunks):
        content = (c.page_content or "").strip()
        if not content:
            log_and_flush(f"Empty chunk {i} found, skipping", "warning")
            continue

        # Ensure metadata is complete
        metadata = c.metadata or {}
        if "source_id" not in metadata or not metadata["source_id"]:
            # Try to reconstruct source_id from other metadata
            url = metadata.get("url", "")
            title = metadata.get("title", "")
            metadata["source_id"] = make_source_id(url, title)
            log_and_flush(f"Reconstructed source_id for chunk {i}: {metadata['source_id']}")

        c.metadata = metadata
        clean_chunks.append(c)

    chunks = clean_chunks
    total_chunks = len(chunks)
    log_and_flush(f"Final chunk count: {total_chunks} (after cleaning)")

    # Verify we have chunks for all original documents
    chunk_source_ids = {chunk.metadata.get("source_id") for chunk in chunks}
    missing_source_ids = original_source_ids - chunk_source_ids
    if missing_source_ids:
        log_and_flush(f"Missing chunks for {len(missing_source_ids)} source documents!", "warning")
        log_and_flush(f"Missing source IDs (first 10): {list(missing_source_ids)[:10]}", "warning")

        # Try to create fallback chunks for missing documents
        for doc in docs:
            if doc.metadata["source_id"] in missing_source_ids:
                log_and_flush(f"Creating fallback chunk for missing source: {doc.metadata['source_id']}")
                # NOTE: only the first 8000 characters are kept; the rest of a
                # longer document is not indexed.
                fallback_chunk = Document(
                    page_content=doc.page_content[:8000],  # Truncate if needed
                    metadata={**doc.metadata, "chunk_method": "missing_fallback"}
                )
                chunks.append(fallback_chunk)

        total_chunks = len(chunks)
        log_and_flush(f"After adding fallback chunks: {total_chunks} total chunks")

    # Log up to three sample chunks for a quick visual check.
    for s in range(min(3, total_chunks)):
        snippet = chunks[s].page_content[:200].replace("\n", " ")
        log_and_flush(f"Sample chunk {s}: {snippet}... | source_id: {chunks[s].metadata.get('source_id', 'N/A')}")

    # PINECONE IMPLEMENTATION STARTS HERE
    index_name = config["collections"][language]["index_name"]

    # Get the Pinecone index
    try:
        index = pc.Index(index_name)
        log_and_flush(f"Connected to Pinecone index: {index_name}")

        # Check index stats
        stats = index.describe_index_stats()
        log_and_flush(f"Index {index_name} current vector count: {stats['total_vector_count']}")

        # Optional: Clear existing vectors if you want a fresh start
        # Uncomment the next line if you want to clear the index
        # index.delete(delete_all=True)

    except Exception as e:
        log_and_flush(f"Error connecting to Pinecone index {index_name}: {e}", "error")
        raise

    # Create PineconeVectorStore
    vectorstore = PineconeVectorStore(
        index=index,
        embedding=embeddings,
        text_key="text",  # Field name for storing text content
        namespace=language  # Use language as namespace for organization
    )

    # Document addition with per-chunk retry for failed batches.
    # batch_size is clamped to at least 1 so range() below always has a valid step.
    batch_size = max(1, int(config.get("batch_size", 16)))
    added_count = 0
    failed_chunks = []   # chunk ids that could not be added even after retries
    retry_chunks = []    # (document, chunk_id, source_id) tuples from failed batches
    global_idx = 0       # running counter; keeps chunk ids unique across the whole run
    chunks_index = {}    # chunk_id -> source info and a 300-character snippet
    successfully_added_source_ids: Set[str] = set()

    log_and_flush(f"Adding {total_chunks} chunks to Pinecone with batch size {batch_size}...")
    sys.stdout.flush()

    # Primary batch processing
    for start in tqdm(range(0, total_chunks, batch_size), desc="Adding batches"):
        batch = chunks[start : start + batch_size]
        batch_ids = []
        batch_source_ids = []

        for c in batch:
            src_id = c.metadata.get("source_id") or make_source_id(
                c.metadata.get("url", ""),
                c.metadata.get("title", "")
            )
            chunk_id = f"{src_id}_chunk_{global_idx}"
            global_idx += 1
            # The chunk id is also stored in the chunk's own metadata.
            c.metadata["chunk_id"] = chunk_id
            batch_ids.append(chunk_id)
            batch_source_ids.append(src_id)

            # Recorded before the add is attempted, so this index also lists
            # chunks that later fail to be added.
            chunks_index[chunk_id] = {
                "source_id": src_id,
                "title": c.metadata.get("title", ""),
                "url": c.metadata.get("url", ""),
                "snippet": c.page_content[:300]
            }

        # Try batch addition
        success = False
        try:
            vectorstore.add_documents(batch, ids=batch_ids)
            added_count += len(batch)
            successfully_added_source_ids.update(batch_source_ids)
            success = True
            logger.debug(f"Batch {start//batch_size + 1} added successfully ({len(batch)} docs)")
        except Exception as e:
            logger.warning(f"Batch {start//batch_size + 1} failed: {e}. Will retry individually.")

        # If batch failed, queue every chunk of it for an individual retry
        if not success:
            for i, doc in enumerate(batch):
                retry_chunks.append((doc, batch_ids[i], batch_source_ids[i]))

    # Retry failed chunks individually with exponential backoff
    log_and_flush(f"Retrying {len(retry_chunks)} failed chunks individually...")
    sys.stdout.flush()

    for doc, doc_id, src_id in tqdm(retry_chunks, desc="Retrying individual chunks"):
        max_retries = 3
        for attempt in range(max_retries):
            try:
                vectorstore.add_documents([doc], ids=[doc_id])
                added_count += 1
                successfully_added_source_ids.add(src_id)
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"Failed to add chunk {doc_id} after {max_retries} attempts: {e}")
                    failed_chunks.append(doc_id)
                else:
                    logger.debug(f"Retry {attempt + 1} failed for chunk {doc_id}: {e}")
                    # Waits 0.1 s after the first failure and 0.2 s after the second.
                    time.sleep(0.1 * (2 ** attempt))  # Exponential backoff

    # Final verification of source coverage.
    # A source counts as covered once at least one of its chunks was added;
    # individual chunk failures are reported separately through failed_chunks.
    missing_from_vector = original_source_ids - successfully_added_source_ids
    if missing_from_vector:
        log_and_flush(f"CRITICAL: {len(missing_from_vector)} source documents missing from vector DB!", "error")
        log_and_flush(f"Missing source IDs: {list(missing_from_vector)[:20]}", "error")
    else:
        log_and_flush("✅ SUCCESS: All original source documents are represented in the vector database!")

    sys.stdout.flush()

    # Save comprehensive indexes to local files for backup.
    # Best effort: a failure here is logged but does not abort the run.
    try:
        # Create local backup directory
        backup_dir = Path("/content/pinecone_backup")
        backup_dir.mkdir(parents=True, exist_ok=True)

        collection_name = config["collections"][language]["name"]

        # Sources index
        src_path = backup_dir / f"{collection_name}_sources_index.json"
        with open(src_path, "w", encoding="utf-8") as f:
            json.dump(sources_index, f, indent=2, ensure_ascii=False)

        # Chunks index
        chunks_path = backup_dir / f"{collection_name}_chunks_index.json"
        with open(chunks_path, "w", encoding="utf-8") as f:
            json.dump(chunks_index, f, indent=2, ensure_ascii=False)

        # Processing report.
        # total_input_lines = loaded documents + lines that load_documents rejected.
        report = {
            "total_input_lines": len(docs) + len(failed_lines),
            "failed_to_parse": len(failed_lines),
            "documents_loaded": doc_count,
            "original_source_ids": len(original_source_ids),
            "total_chunks_created": total_chunks,
            "chunks_added_to_vector_db": added_count,
            "failed_chunks": len(failed_chunks),
            "successfully_added_source_ids": len(successfully_added_source_ids),
            "missing_from_vector_db": len(missing_from_vector),
            "missing_source_ids": list(missing_from_vector),
            "pinecone_index": index_name,
            "namespace": language
        }

        report_path = backup_dir / f"{collection_name}_processing_report.json"
        with open(report_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        log_and_flush(f"Saved indexes and processing report to {backup_dir}")

    except Exception as e:
        log_and_flush(f"Failed to write index files: {e}", "error")

    # Final summary (printed rather than logged)
    print("="*80)
    print(f"FINAL PROCESSING SUMMARY FOR {language.upper()} COLLECTION:")
    print(f"📁 Input file lines: {len(docs) + len(failed_lines)}")
    print(f"📄 Documents loaded: {doc_count}")
    print(f"🔗 Unique source IDs: {len(original_source_ids)}")
    print(f"📝 Total chunks created: {total_chunks}")
    print(f"✅ Chunks added to Pinecone: {added_count}")
    print(f"❌ Failed chunks: {len(failed_chunks)}")
    print(f"🎯 Source coverage: {len(successfully_added_source_ids)}/{len(original_source_ids)}")
    print(f"🌲 Pinecone Index: {index_name}")
    print(f"📦 Namespace: {language}")
    if missing_from_vector:
        print(f"⚠️ MISSING SOURCES: {len(missing_from_vector)} documents not in vector DB")
    else:
        print("🎉 COMPLETE SUCCESS: All source documents represented in vector DB!")
    print("="*80)
    sys.stdout.flush()

    logger.info(f"Created {language} collection with {added_count} chunks from {len(successfully_added_source_ids)} sources in Pinecone index {index_name}")

def verify_collections(config: dict, pc: Pinecone):
    """Log the vector count of every index visible to the Pinecone client.

    For each index the total vector count is logged, followed by one line per
    namespace when the stats contain a ``namespaces`` entry. ``config`` is
    accepted for interface symmetry but is not read. Any error is logged and
    swallowed, so verification never aborts the caller.
    """
    logger.info("Current Pinecone indexes:")
    try:
        for entry in pc.list_indexes():
            name = entry.name
            index_stats = pc.Index(name).describe_index_stats()

            logger.info(f"- {name}: {index_stats['total_vector_count']} vectors")

            # Per-namespace breakdown is optional.
            if 'namespaces' not in index_stats:
                continue
            for namespace, namespace_stats in index_stats['namespaces'].items():
                logger.info(f"  └─ Namespace '{namespace}': {namespace_stats['vector_count']} vectors")

    except Exception as e:
        logger.error(f"Failed to verify Pinecone indexes: {e}")

def download_backup_files():
    """Zip ``/content/pinecone_backup`` and trigger a browser download of the archive.

    Logs a warning and does nothing when the backup directory does not exist.
    Any error while archiving or downloading is logged and re-raised.
    """
    try:
        source_dir = Path("/content/pinecone_backup")
        if not source_dir.exists():
            logger.warning("No backup directory found to download")
            return

        # make_archive appends the ".zip" extension to this base name itself.
        archive_base = "/content/pinecone_backup"
        logger.info("Creating ZIP archive of backup files...")
        shutil.make_archive(archive_base, "zip", source_dir)

        logger.info("Downloading backup files...")
        files.download(archive_base + ".zip")
        logger.info("Backup files downloaded successfully!")
    except Exception as e:
        logger.error(f"Failed to create/download backup: {e}")
        raise

def main():
    """Run the whole pipeline: set up Pinecone, build each collection, verify, back up.

    Every language listed in ``CONFIG["collections"]`` whose input file exists
    is processed with ``create_collection``; languages without an input file are
    skipped with a warning. The backup download and the success message only
    happen when at least one collection was actually built, so a run that found
    no input at all is not reported as a success and does not download leftovers
    from an earlier run.
    """
    logger.info("Starting ROBUST Pinecone vector database creation process")
    logger.info("This version includes comprehensive tracking to prevent document loss")

    # Setup Pinecone client and create indexes
    pc = setup_pinecone()

    # Create a collection for every configured language that has input data
    processed_languages = []
    for language, collection in CONFIG["collections"].items():
        input_file = collection["input_file"]
        if not os.path.exists(input_file):
            logger.warning(f"Input file not found for {language}: {input_file}")
            continue

        logger.info(f"Creating {language} collection in Pinecone...")
        start_time = time.time()
        create_collection(language, CONFIG, pc)
        elapsed = time.time() - start_time
        logger.info(f"{language} collection created in {elapsed:.2f} seconds")
        processed_languages.append(language)

    # Show the state of the indexes regardless of what was processed
    verify_collections(CONFIG, pc)

    if not processed_languages:
        logger.error("No input files were found; nothing was added to Pinecone.")
        return

    # Download backup files
    download_backup_files()

    logger.info("🎉 Pinecone vector database creation completed successfully!")

if __name__ == '__main__':
    main()

Loading documents: 100%|██████████| 25.0M/25.0M [00:00<00:00, 159MB/s]
Semantic chunking documents: 100%|██████████| 296/296 [05:58<00:00,  1.21s/it]
  vectorstore = Chroma(
Indexing batches: 100%|██████████| 95/95 [01:47<00:00,  1.13s/it]
  vectorstore.persist()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>