In [4]:
from pinecone import Pinecone
import os
from dotenv import load_dotenv
load_dotenv() 


# Pull the service credentials and the target index name out of the process
# environment; any variable that is not set comes back as None.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")

In [None]:
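# Backup only: downloads every vector (dense values, sparse values, metadata) from the index and saves it to a JSON file.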

import json
import os
from pinecone import Pinecone

# 1. Setup
# Credentials and the index name are read from the environment.
api_key = os.getenv("PINECONE_API_KEY")
index_name = os.getenv("PINECONE_INDEX_NAME") # The current "bad" index
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

# 2. Storage for full vector data (one plain dict per vector)
all_vectors = []

print("üöÄ Starting Full Vector Backup...")

# 3. Iterate and Fetch
# list() gives us IDs, then we fetch the actual vector data
for ids in index.list(namespace="default"):
    if not ids:
        continue

    # fetch() returns the actual dense values and metadata
    fetch_response = index.fetch(ids=ids, namespace="default")

    for vector_id, vector_data in fetch_response['vectors'].items():
        # Sparse values come back as an SDK object that json.dump() cannot
        # serialize; turn it into a plain dict before storing it.
        sparse_values = vector_data.get('sparse_values')
        if sparse_values and hasattr(sparse_values, "to_dict"):
            sparse_values = sparse_values.to_dict()

        # We save the exact structure Pinecone expects for an upsert
        record = {
            "id": vector_id,
            "values": vector_data['values'],           # The OpenAI Embeddings (Free to move!)
            "sparse_values": sparse_values,            # Existing BM25 data, now JSON-safe
            "metadata": vector_data.get('metadata', {})
        }
        all_vectors.append(record)

# 4. Save to a file
output_file = "full_backup.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_vectors, f)

print(f"‚úÖ Backup complete! Saved {len(all_vectors)} full vectors to '{output_file}'.")
print("‚ö†Ô∏è NOW it is safe to delete the index.")

In [2]:
import json
import os
from pinecone import Pinecone

# 1. Connect: resolve the index name and API key from the environment, then
#    open a handle to that index.
index_name = os.getenv("PINECONE_INDEX_NAME")
api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

# 2. Accumulator for the exported records (one dict per vector)
all_vectors = list()

print("üöÄ Starting Full Vector Backup...")

# Helper function to fix the JSON error
def make_serializable(obj):
    """Return a plain-Python form of ``obj`` when it knows how to produce one.

    Objects exposing a ``to_dict`` attribute are replaced by the result of
    calling it; anything else is handed back untouched.
    """
    if not hasattr(obj, "to_dict"):
        return obj
    return obj.to_dict()

# 3. Iterate and Fetch
# Any failure inside this block is reported as "Backup failed" and the
# "safe to delete" message is not printed.
try:
    # Get all IDs first
    for ids in index.list(namespace="default"):
        if not ids:
            continue

        # Fetch the actual data
        fetch_response = index.fetch(ids=ids, namespace="default")

        # In the new SDK, fetch_response might be an object or dict.
        # We access vectors safely.
        vectors_dict = fetch_response.vectors if hasattr(fetch_response, "vectors") else fetch_response.get("vectors", {})

        for vector_id, vector_data in vectors_dict.items():

            # Convert the sparse-values object to a plain dict so json.dump()
            # can serialize it.
            sparse_data = vector_data.get('sparse_values')
            if sparse_data:
                sparse_data = make_serializable(sparse_data)

            record = {
                "id": vector_id,
                "values": vector_data['values'], # Dense vectors usually behave like lists
                "sparse_values": sparse_data,    # JSON-safe dict (or None when absent)
                "metadata": vector_data.get('metadata', {})
            }
            all_vectors.append(record)

    # Nothing collected means the namespace was empty or wrong. Writing an
    # empty file and announcing that deletion is safe would invite data loss,
    # so treat this as a failure instead.
    if not all_vectors:
        raise RuntimeError("no vectors were returned from namespace 'default'; nothing was written")

    # 4. Save to a file
    # Write to a temporary file and rename it into place, so a failure during
    # serialization cannot truncate an earlier good backup.
    output_file = "full_backup.json"
    temp_file = output_file + ".tmp"
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(all_vectors, f, indent=None) # indent=None keeps file size smaller
    os.replace(temp_file, output_file)

    print(f"‚úÖ Backup complete! Saved {len(all_vectors)} full vectors to '{output_file}'.")
    print("‚ö†Ô∏è NOW it is safe to delete the index.")

except Exception as e:
    print(f"‚ùå Backup failed: {e}")

  from .autonotebook import tqdm as notebook_tqdm


üöÄ Starting Full Vector Backup...
‚úÖ Backup complete! Saved 2126 full vectors to 'full_backup.json'.
‚ö†Ô∏è NOW it is safe to delete the index.


In [None]:
# Open a client with PINECONE_API_KEY and show the names of the indexes it can see.
pc = Pinecone(api_key=PINECONE_API_KEY)
existing_indexes = [index_info.name for index_info in pc.list_indexes()]
print(existing_indexes)

['tatvalabs-index']


In [7]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# 1. Force Reload .env file (Fixes the "old name" issue)
# override=True lets values from .env replace variables already present in
# this kernel's environment.
load_dotenv(override=True)

# 2. Get Config
API_KEY = os.getenv("PINECONE_API_KEY")
TARGET_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

print("üîé DEBUG INFO:")
print(f"   Target Index Name: '{TARGET_INDEX_NAME}'")
print(f"   API Key Loaded:    {'Yes' if API_KEY else 'No'}")

# Stop early when either setting is missing; otherwise the code below would
# try to create an index whose name is None.
if not API_KEY or not TARGET_INDEX_NAME:
    raise RuntimeError(
        "PINECONE_API_KEY and PINECONE_INDEX_NAME must both be set (check your .env file)."
    )

# 3. Connect to Pinecone
pc = Pinecone(api_key=API_KEY)

# 4. List ALL current indexes
existing_indexes = [i.name for i in pc.list_indexes()]
print(f"   Existing Indexes:  {existing_indexes}")

# 5. Check Region & Create if missing
if TARGET_INDEX_NAME not in existing_indexes:
    print(f"\n‚ö†Ô∏è Index '{TARGET_INDEX_NAME}' is MISSING.")
    print("   Attempting to create it now...")

    try:
        # --- CRITICAL: CHECK YOUR REGION ---
        # Look at your Pinecone Console URL or Dashboard.
        # It usually says "region: us-east-1" or "us-west-2".
        # Ensure the region below MATCHES your console.

        pc.create_index(
            name=TARGET_INDEX_NAME,
            dimension=1536,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        print(f"‚úÖ SUCCESS: Index '{TARGET_INDEX_NAME}' created!")
    except Exception as e:
        print(f"\n‚ùå CREATION FAILED: {e}")
        print("   (Double check if your Pinecone region is 'us-east-1')")
        # Re-raise so the cell stops here with the real cause instead of
        # failing again below while looking up an index that was never created.
        raise
else:
    print(f"\n‚úÖ Index '{TARGET_INDEX_NAME}' already exists!")

# 6. Verify Status
index = pc.Index(TARGET_INDEX_NAME)
stats = index.describe_index_stats()
print(f"\nüìä Index Stats for '{TARGET_INDEX_NAME}':")
print(stats)

üîé DEBUG INFO:
   Target Index Name: 'rag-tatvalabs'
   API Key Loaded:    Yes
   Existing Indexes:  ['tatvalabs-index']

‚ö†Ô∏è Index 'rag-tatvalabs' is MISSING.
   Attempting to create it now...
‚úÖ SUCCESS: Index 'rag-tatvalabs' created!

üìä Index Stats for 'rag-tatvalabs':
{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '155',
                                    'content-type': 'application/json',
                                    'date': 'Mon, 15 Dec 2025 10:21:08 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '49',
                                    'x-pinecone-request-id': '4502653468299685320',
                                    'x-pinecone-request-latency-ms': '49'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'dotproduct',
 'na

In [9]:
import json
import os
import tqdm
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import BM25Encoder

# Settings used by the restore routine: the backup path is fixed, while the
# credentials and index name are taken from the environment (None when unset).
BACKUP_FILE = "full_backup.json"
API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")

def smart_restore():
    """Re-upload the vectors saved in ``BACKUP_FILE`` to the ``INDEX_NAME`` index.

    Steps:
      1. Load the JSON backup (a list of record dicts).
      2. Create the index (1536 dimensions, ``dotproduct`` metric, AWS
         ``us-east-1`` serverless) if it is not listed yet.
      3. Upsert the records into the ``"default"`` namespace in batches of 50.
         A record without usable sparse values gets them regenerated with
         BM25 from ``metadata["text"]`` when that text is present.

    A failed batch is reported and skipped so the remaining batches still
    run; the closing message states whether every batch succeeded.

    Returns:
        None. Progress and errors are printed.
    """
    # 1. Load Backup
    if not os.path.exists(BACKUP_FILE):
        print("‚ùå Backup file not found! Run the backup script first.")
        return

    print(f"üìñ Reading {BACKUP_FILE}...")
    # Explicit encoding so the read does not depend on the platform default.
    with open(BACKUP_FILE, "r", encoding="utf-8") as f:
        vectors = json.load(f)

    # 2. BM25 encoder, created only if some record actually needs patching.
    bm25 = None

    # 3. Setup Pinecone & Create Correct Index
    pc = Pinecone(api_key=API_KEY)

    # Check if index exists, if not create it with DOTPRODUCT
    if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
        print(f"Creating NEW index: {INDEX_NAME} with metric='dotproduct'")
        pc.create_index(
            name=INDEX_NAME,
            dimension=1536,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )

    index = pc.Index(INDEX_NAME)

    # 4. Upload in batches
    batch_size = 50
    failed_batches = 0
    print(f"üöÄ Restoring {len(vectors)} items...")

    for i in tqdm.tqdm(range(0, len(vectors), batch_size)):
        batch = vectors[i : i + batch_size]

        # --- SAFETY CHECK ---
        # Ensure every item has sparse values where possible.
        for item in batch:
            # A missing/null entry, or one with no "indices", counts as absent.
            sparse = item.get("sparse_values") or {}
            if sparse.get("indices"):
                continue

            # Metadata may be missing or null in the backup; treat it as empty.
            text = (item.get("metadata") or {}).get("text", "")
            if text:
                if bm25 is None:
                    bm25 = BM25Encoder.default()
                # Regenerate locally (Free)
                item["sparse_values"] = bm25.encode_documents(text)

        try:
            index.upsert(vectors=batch, namespace="default")
        except Exception as e:
            # Keep going so one bad batch does not block the rest, but
            # remember it so the final message is accurate.
            failed_batches += 1
            print(f"‚ùå Error on batch {i}: {e}")

    if failed_batches:
        print(f"WARNING: Restoration incomplete - {failed_batches} batch(es) failed. See the errors above and re-run.")
        return

    print("üéâ Restoration Complete! Your Dense (OpenAI) and Sparse (BM25) vectors are active.")

if __name__ == "__main__":
    smart_restore()

üìñ Reading full_backup.json...
üöÄ Restoring 2126 items...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:51<00:00,  1.20s/it]

üéâ Restoration Complete! Your Dense (OpenAI) and Sparse (BM25) vectors are active.



