We are going to explore retrieval-augmented generation (RAG) from start to finish.

In [1]:
import json
from uuid import uuid4
from tqdm.auto import tqdm

In [2]:
# Custom modules
from chunker import get_chunks
from embeddings_palm import get_palm_embeddings

### Loading

In [4]:
# Read the entire article text file into memory as one UTF-8 string.
with open("data/imf_article_txt", encoding='utf-8') as article_file:
    data = article_file.read()

### Embeddings
We use the tokenizer that matches the target model, since different models count tokens differently. We then use a recursive text splitter, which tries a prioritized list of separators and keeps splitting until every chunk is small enough.

In [5]:
# Split the raw article text into chunks with the project-local chunker module.
# NOTE(review): chunk size and splitting rules live in chunker.get_chunks, which
# is not visible from this notebook — confirm them there.
texts = get_chunks(data)

In [5]:
# Build one record per text chunk: a random UUID as the id, the vector returned
# by get_palm_embeddings as the values, and the chunk text itself as metadata.
# NOTE(review): get_palm_embeddings presumably makes one remote call per chunk,
# which would make this the slow cell of the notebook — confirm.
chunks = []
for text in tqdm(texts):
    chunk_id = str(uuid4())
    embedding = get_palm_embeddings(text)
    chunks.append({'id': chunk_id, 'values': embedding, 'metadata': {'text': text}})

  0%|          | 0/288 [00:00<?, ?it/s]

#### Saving Embeddings

In [6]:
# Write the embedded chunks to disk as indented JSON so they can be reloaded
# later without recomputing the embeddings.
with open("data/embeddings.json", mode="w", encoding='utf-8') as out_file:
    json.dump(chunks, out_file, indent=2)

#### Loading Embeddings

In [7]:
# Reload the saved chunks from disk. The encoding is stated explicitly so the
# read matches the UTF-8 write of this file and does not depend on the
# platform's default locale encoding.
with open("data/embeddings.json", encoding='utf-8') as json_file:
    chunks = json.load(json_file)

## Pinecone

In [8]:
import os
import pinecone

### Credentials

In [9]:
# Take the Pinecone API key from the environment so it never appears in the
# notebook; the value is None when the variable is not set.
pinecone_api_key = os.environ.get('PINECONE_API_KEY_03')

### Creating an Index

In [10]:

# Name of the Pinecone index used by the rest of the notebook.
index_name = 'econwiki'

# Open the client connection to Pinecone. The environment value is shown
# next to the API key in the Pinecone console.
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")

In [11]:
# Create the index only when it is missing (it should be on the first run),
# so that re-running this cell is safe.
if index_name not in pinecone.list_indexes():
    # The index dimension must equal the embedding length, so it is read from
    # the first stored chunk instead of being hard-coded.
    pinecone.create_index(
        index_name,
        dimension=len(chunks[0]['values']),
        metric='dotproduct'
    )
# connect to index
index = pinecone.GRPCIndex(index_name)
# Show the index stats as the cell output: this confirms the connection works
# and shows how many vectors the index holds before it is populated.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

### Populating the Index

In [12]:
# Upload the chunks in batches rather than in a single request, so the upsert
# stays within Pinecone's per-request limits as the corpus grows.
index.upsert(vectors=chunks, batch_size=100)

upserted_count: 288

In [13]:
# Fetch the index stats again and display them as the cell output.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 768,
 'index_fullness': 0.00288,
 'namespaces': {'': {'vector_count': 288}},
 'total_vector_count': 288}