We are going to explore retrieval-augmented generation (RAG) from start to finish.

In [27]:
import time
import json
from uuid import uuid4
from tqdm.auto import tqdm

In [28]:
# Custom modules
from chunker import get_chunks
from custom import pineconify_vectors
from embeddings_palm import get_palm_embeddings

### Loading

In [29]:
# Read the entire articles text file into a single string.
with open("data/world_bank_articles.txt", encoding='utf-8') as articles_file:
    data = articles_file.read()

### Embeddings

In [30]:
# Split the loaded article text into chunks using the project's `get_chunks`
# helper (imported from the local `chunker` module; the chunking strategy is
# defined there, not in this notebook).
chunks = get_chunks(data)

In [None]:
# Turn the chunks into vector records with the project's `pineconify_vectors`
# helper. Later cells read a 'values' entry from each record.
# NOTE(review): presumably this computes an embedding for every chunk and may
# be slow — confirm in the `custom` module.
vectors = pineconify_vectors(chunks)

#### Saving Embeddings

In [13]:
# Persist the vector records as indented JSON so they can be reloaded later.
with open("data/world_bank_embeddings.json", "w", encoding='utf-8') as embeddings_file:
    json.dump(vectors, embeddings_file, indent=2)

#### Loading Embeddings

In [13]:
# Reload the saved vector records. The encoding is passed explicitly so the
# read matches the utf-8 used when the file was written, instead of depending
# on the platform's default text encoding.
with open("data/world_bank_embeddings.json", encoding='utf-8') as json_file:
    vectors = json.load(json_file)

In [14]:
# Sanity check: how many vector records are currently held in `vectors`.
len(vectors)

2143

## Pinecone

In [6]:
import os
import pinecone

### Credentials

In [7]:
# Pinecone credentials come from the process environment; a variable that is
# not set yields None rather than raising.
pinecone_api_key = os.environ.get('PINECONE_API_KEY_03')
environment = os.environ.get("PINECONE_ENV")

### Creating an Index

In [15]:

# Name of the Pinecone index used throughout the rest of the notebook.
index_name = 'econwiki'

# Set up the Pinecone client connection. The environment value is shown next
# to the API key in the Pinecone console.
pinecone.init(api_key=pinecone_api_key, environment=environment)

In [16]:
# Create the index only when it is missing (expected on the very first run).
index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    # The index dimension is the length of the first record's 'values' list.
    embedding_dim = len(vectors[0]['values'])
    pinecone.create_index(index_name, dimension=embedding_dim, metric='dotproduct')


In [21]:
# connect to index
index = pinecone.GRPCIndex(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.04063,
 'namespaces': {'': {'vector_count': 4063}},
 'total_vector_count': 4063}

### Populating the Index

In [22]:
def batch_upsert(vectors, batch_size=100, target_index=None):
    """
    Upsert `vectors` into a Pinecone index in consecutive batches.

    Parameters
    ----------
    vectors : sequence
        Vector records to upload. Must support `len()` and slicing.
    batch_size : int, default 100
        Maximum number of records sent per upsert call. Must be at least 1.
    target_index : optional
        Object exposing `upsert(vectors=...)`. When omitted, the notebook-level
        `index` variable is used, which keeps existing calls working unchanged.

    Returns
    -------
    None
        Each batch's upsert response is printed rather than returned.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently upload
    # no data; a zero step fails with an unrelated-looking range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fall back to the notebook global only when no index was passed in.
    destination = index if target_index is None else target_index

    for start in range(0, len(vectors), batch_size):
        batch_vectors = vectors[start:start + batch_size]
        upsert_response = destination.upsert(vectors=batch_vectors)

        # Echo every response so a failed or partial batch shows up in the
        # cell output.
        print(f"Batch {start // batch_size + 1} upsert response: {upsert_response}")

In [None]:
# Upload every vector record to the index using the default batch size.
batch_upsert(vectors)