In [1]:
from custom.pinecone_ops import pinecone_upsert
from custom.gemini_async import async_embed

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Pinecone namespace used by this notebook: it is cleared (if present) before
# the upsert and then receives the newly embedded vectors.
NAMESPACE = "world_bank"

In [3]:
# Read the whole articles file into memory as one UTF-8 string.
with open('data/world_bank_articles.txt', encoding='utf-8') as corpus_file:
    texts = corpus_file.read()

In [4]:
# Same 150-dash separator that was defined earlier during web scraping.
separator = "-" * 150

# Splitting is necessary to limit the size of each request payload and avoid a
# 400: 'Request payload size exceeds the limit: 10000 bytes.'
splitter_options = dict(
    # Separators are tried in order, from the coarsest to the finest.
    separators=[separator, "\n\n\n", "\n\n", "\n"],
    # Empirically set from the output of CharacterTextSplitter.
    chunk_size=7000,
    chunk_overlap=0,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs = text_splitter.split_text(texts)

# Number of chunks produced (shown as the cell output).
len(docs)

1982

### Embed

In [5]:
# Request embeddings for the chunks in docs. async_embed comes from
# custom.gemini_async and its internals are not shown in this notebook.
# The top-level `await` relies on the notebook kernel's autoawait support;
# it is not valid in a plain Python script outside a coroutine.
# Later cells read result['embeddings']['embedding']['values'] and
# result['text_metadata'] from each entry of results.
results = await async_embed(docs)

### Pinecone Init

In [6]:
from pinecone import Pinecone
import os

In [7]:
# Connect to Pinecone; the API key is read from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'), environment='gcp-starter')

# Use the first index returned by the listing (this is not a random choice).
index_ = pc.list_indexes()[0]
index = pc.Index(index_['name'])

# Check whether the index matches our embedding dimension.
dim_a = index_['dimension']

# Probe for the first result that actually carries an embedding vector.
# Entries of results are not guaranteed to have one (the collection cell
# further down tolerates and counts such entries), so reading results[0]
# unconditionally could fail before the dimensions are ever compared.
dim_b = None
for candidate in results:
    try:
        dim_b = len(candidate['embeddings']['embedding']['values'])
    except (KeyError, IndexError, TypeError):
        continue
    break

if dim_b is None:
    raise ValueError("No embedding vectors were found in results; cannot verify the index dimension")

if dim_a != dim_b:
    raise ValueError(f"Pinecone Index dimension: {dim_a} does not match Vector Embedding dimension {dim_b}")

# Delete the namespace if found.
# It will be created anew when we upsert to it. Avoids duplication.
if NAMESPACE in index.describe_index_stats()['namespaces']:
    index.delete(delete_all=True, namespace=NAMESPACE)

# Display the current index stats as the cell output. Inside the `if` block
# the call's result was discarded and never shown.
index.describe_index_stats()

In [8]:
# Split the embedding results into two parallel lists for the upsert:
# text_metadata[i] is taken from the same result as vectors[i].
vectors = []
text_metadata = []
skipped = 0
for result in results:
    try:
        # Read both fields before appending either one, so a result that is
        # missing only one of them cannot leave the lists with different lengths.
        values = result['embeddings']['embedding']['values']
        metadata = result['text_metadata']
    except (KeyError, IndexError, TypeError):
        # The result does not have the expected structure: count it and move on.
        # Only lookup failures are caught, so KeyboardInterrupt and unrelated
        # errors are no longer silently swallowed.
        skipped += 1
        continue
    vectors.append(values)
    text_metadata.append(metadata)

print(f"The following vectors were processed: {len(vectors), len(text_metadata)}")
print(f"The following vectors were skipped: {skipped}")

The following vectors were processed: (1614, 1614)


### Upsert 

In [10]:
# Upserting all texts
pinecone_upsert(vectors,
                text_metadata,
                index_['name'], 
                NAMESPACE) 

[<multiprocessing.pool.ApplyResult at 0x2828ef77010>,
 <multiprocessing.pool.ApplyResult at 0x2828ef75a50>,
 <multiprocessing.pool.ApplyResult at 0x2828eefcb20>,
 <multiprocessing.pool.ApplyResult at 0x2828ecf15a0>,
 <multiprocessing.pool.ApplyResult at 0x2828ecf3010>,
 <multiprocessing.pool.ApplyResult at 0x2829496afb0>,
 <multiprocessing.pool.ApplyResult at 0x2828ecc12a0>,
 <multiprocessing.pool.ApplyResult at 0x2828ecc3400>,
 <multiprocessing.pool.ApplyResult at 0x2828ec75720>,
 <multiprocessing.pool.ApplyResult at 0x2828ec74a60>,
 <multiprocessing.pool.ApplyResult at 0x2828ec205b0>,
 <multiprocessing.pool.ApplyResult at 0x2828ecc3790>,
 <multiprocessing.pool.ApplyResult at 0x2828ebd3760>,
 <multiprocessing.pool.ApplyResult at 0x2828ecf03d0>,
 <multiprocessing.pool.ApplyResult at 0x2828ec4b4c0>,
 <multiprocessing.pool.ApplyResult at 0x2828e84fdf0>,
 <multiprocessing.pool.ApplyResult at 0x2828ef998a0>]