In [None]:
from custom.pinecone_ops import pinecone_upsert
from custom.gemini_async import async_embed

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Pinecone namespace this notebook writes its vectors to. Any existing contents
# of the namespace are deleted before the upsert so re-runs do not duplicate data.
NAMESPACE: str = 'world_bank'

In [None]:
# Read the entire article dump into memory as one string; it is chunked later.
articles_path = 'data/world_bank_articles.txt'
with open(articles_path, encoding='utf-8') as articles_file:
    texts = articles_file.read()

In [None]:
# Rule of 150 dashes, defined earlier during webscraping.
# NOTE(review): presumably this is the line that delimits articles in the dump -- confirm.
separator = "-" * 150

# Chunking is necessary to keep each embedding request under the payload limit
# and avoid a
# 400: 'Request payload size exceeds the limit: 10000 bytes.'
#
# Separators are tried in order: the dashed rule first, then progressively
# smaller runs of newlines.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[separator, "\n\n\n", "\n\n", "\n"],
    chunk_size=7000,    # Empirically set from the output of CharacterTextSplitter
    chunk_overlap=0,
)
docs = text_splitter.split_text(texts)

# Number of chunks produced (displayed as the cell output).
len(docs)

### Embed

In [None]:
# Embed every chunk with the project's async helper (top-level `await` works
# because this runs in a notebook cell).
# Later cells read result['embeddings']['embedding']['values'] and
# result['text_metadata'] from each element of `results`.
# NOTE(review): presumably one result per entry in `docs`, in the same order -- confirm in custom.gemini_async.
results = await async_embed(docs)

### Pinecone Init

In [None]:
from pinecone import Pinecone
import os

In [None]:
# Connect to Pinecone, select the target index, verify that it can hold our
# embeddings, and clear the target namespace so the upsert starts from scratch.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'), environment='gcp-starter')

# Use the FIRST index listed for this API key (it is not chosen at random).
# NOTE(review): presumably the project has exactly one index -- confirm, or select it by name.
try:
    index_ = pc.list_indexes()[0]
except IndexError as err:
    raise RuntimeError("No Pinecone index found for this API key; create one before upserting") from err
index = pc.Index(index_['name'])

# Fail early with a clear message if the embed step produced nothing,
# rather than with a bare IndexError on results[0] below.
if not results:
    raise ValueError("No embeddings to upsert: `results` is empty")

# Check whether index matches our embedding dimension
dim_a = index_['dimension']
dim_b = len(results[0]['embeddings']['embedding']['values'])       # Only the first vector is inspected; the rest are assumed to share its dimension

if dim_a != dim_b:
    raise ValueError(f"Pinecone Index dimension: {dim_a} does not match Vector Embedding dimension {dim_b}")

# Delete namespace if found
# Will be created anew when we upsert to it. Avoids duplication
if NAMESPACE in index.describe_index_stats()['namespaces']:
    index.delete(delete_all=True, namespace=NAMESPACE)

# Kept at the top level of the cell on purpose: an expression nested inside the
# `if` is evaluated but never rendered, so the stats would not be shown.
index.describe_index_stats()

### Upsert 

In [None]:
# Upserting all texts
pinecone_upsert([result['embeddings']['embedding']['values'] for result in results],
                [result['text_metadata'] for result in results],
                index_['name'], 
                NAMESPACE) 