In [None]:
import os, uuid
from tqdm import tqdm

from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Data
It is important to note that some models, especially open-source ones on Hugging Face, may have a maximum sequence length. We will therefore limit our `chunk_size=512` to be conservative.

In [None]:
# Read the whole article file into memory as one UTF-8 string.
with open("data/imf_article_txt", encoding='utf-8') as article_file:
    texts = article_file.read()

In [None]:
# Separators are tried in order: a line of 150 underscores, blank lines,
# newlines, then tabs. The trailing ' ' and '' entries are fallbacks: without
# them, a long run of text that contains none of the earlier separators cannot
# be split any further and comes out as a single chunk larger than `chunk_size`.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['_'*150, '\n\n', '\n', '\t', ' ', ''],
    chunk_size=512,
    chunk_overlap=50
)

In [None]:
# Split the raw text into chunks; the bare `len(...)` shows the chunk count as the cell output.
documents = text_splitter.split_text(texts)
len(documents)

#### Postprocessing (for intfloat/e5-small-v2)
To get the most out of the `intfloat/e5-small-v2` model, each element of our documents needs a small (optional) modification: we prepend "query: " to each string (as this is for similarity search).

In [None]:
# Prepend the "query: " marker to every chunk, then preview the first five.
documents_ = ["query: " + document for document in documents]
documents_[:5]

### Sparse and Dense Embeddings
We use BM25 for sparse embeddings and Hugging Face sentence transformers for dense embeddings.

#### Sparse

In [None]:
# BM25 encoder created with its default settings; it is fitted on the corpus in the next cell.
bm25 = BM25Encoder()

In [None]:
# Fit the encoder on the raw chunks (`documents`, i.e. without the "query: " prefix).
bm25.fit(documents)

#### Dense

In [None]:
# Local directory the embedding model is loaded from and saved to.
model_dir = "models/"

In [None]:
# Load the embedding model from the local cache when one exists; otherwise
# (or if the cached copy fails to load) fetch it by its hub name.
hub_model_name = 'intfloat/e5-small-v2'

if os.path.isdir(model_dir) and os.listdir(model_dir):
    try:
        model = SentenceTransformer(model_dir)
    except Exception as e:
        # A non-empty but unusable cache is unexpected: report it, then fall back.
        print(f"Caught an error: {e}")
        model = SentenceTransformer(hub_model_name)
else:
    # Nothing cached yet (first run): an expected situation, not an error.
    model = SentenceTransformer(hub_model_name)


#### Save Model (Optional)

In [None]:
# Save the model only when the cache directory is missing or empty.
# `os.listdir` raises FileNotFoundError for a directory that does not exist
# (e.g. on a fresh checkout), so make sure it is there before listing it.
os.makedirs(model_dir, exist_ok=True)
if not os.listdir(model_dir):
    model.save(model_dir)
    print(f"Saved model to {model_dir}")

### Connecting to Pinecone

In [None]:
# Pinecone client; the API key is read from the environment rather than
# being written into the notebook.
pc = Pinecone(
    environment='gcp-starter',
    api_key=os.environ.get("PINECONE_KEY_PROCOPIUS"),
)

In [None]:
# Connect to index
# On a starter plan we only have one index, so we take the first entry of the list.
# This way a renamed index is still found, as long as its dimension has not changed.

indexes_info = pc.list_indexes().index_list['indexes']
if not indexes_info:
    # Fail with an actionable message instead of a bare "list index out of range".
    raise IndexError(
        "No Pinecone index found for this API key; create one before running this notebook."
    )

index_name = indexes_info[0]['name']

print(f"The index with name: {index_name}\nHas a dimension of: {indexes_info[0]['dimension']}")

index = pc.Index(name=index_name)

### Embed and Upsert

In [None]:
namespace = 'imf-articles'

# Dense vectors: encode all prefixed chunks in one batched call instead of one
# model call per chunk. The rows come back in the same order as the input.
dense_vectors = model.encode(
    documents_,
    normalize_embeddings=True,
    show_progress_bar=True
).tolist()

# Sparse vectors: encode the *unprefixed* chunks, i.e. the same text BM25 was
# fitted on, so the artificial "query: " marker does not become a BM25 term.
sparse_vectors = bm25.encode_documents(documents)

records = []
for position, (document, dense_vector, sparse_vector) in enumerate(
    zip(documents_, dense_vectors, sparse_vectors)
):
    record = {
        # Deterministic id derived from namespace, chunk position and chunk text:
        # re-running the notebook overwrites the same records instead of piling
        # up duplicates under fresh random ids.
        "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{namespace}/{position}/{document}")),
        "values": dense_vector,
        'metadata': {
            'text': document
        }
    }

    # A chunk with no BM25 terms yields an empty sparse vector, which carries
    # no information; leave the field out in that case.
    # NOTE(review): Pinecone is understood to reject empty sparse vectors - confirm.
    if sparse_vector["indices"]:
        record["sparse_values"] = sparse_vector

    records.append(record)

#### Batch and Async Upsert

In [None]:
def chunker(seq, batch_size):
  """Return a generator over consecutive slices of `seq`.

  Args:
    seq: A sliceable sequence with a length (list, tuple, str, ...).
    batch_size: Maximum number of items per slice; must be at least 1.

  Returns:
    A generator producing the slices in order. Every slice has `batch_size`
    items except possibly the last one; an empty `seq` produces nothing.

  Raises:
    ValueError: If `batch_size` is less than 1. Without this check a negative
      value would silently produce no batches at all.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")
  return (seq[start:start + batch_size] for start in range(0, len(seq), batch_size))

# Send the batches without waiting for each response: with `async_req=True`
# every call returns a handle to the pending request instead of the response.
async_results = [
  index.upsert(vectors=chunk, namespace=namespace, async_req=True)
  for chunk in chunker(records, batch_size=100)
]

# Block until every batch has completed. `.get()` returns the response of a
# request and re-raises any error it hit, so a failed batch is not silently lost.
upsert_responses = [async_result.get() for async_result in async_results]
print(f"Upserted {len(records)} records in {len(upsert_responses)} batches")

### Testing Pipeline
Let us run a small question-answering test over our stored documents to check that the pipeline works end to end.

In [None]:
question = "Has there been staff-level agreements between the IMF and the government of kenya?"

# Queries need the query-side BM25 encoding; `encode_documents` is meant for
# the texts being indexed and weights terms differently.
sparse_query = bm25.encode_queries(question)

# Encode the question the same way the stored chunks were encoded: with the
# "query: " prefix and with normalised embeddings, so scores are comparable.
dense_query = model.encode(f"query: {question}", normalize_embeddings=True).tolist()

In [None]:
# Hybrid search: send the dense and sparse query vectors together and ask for
# the three best matches along with their stored metadata.
query_kwargs = dict(
    namespace=namespace,
    vector=dense_query,
    sparse_vector=sparse_query,
    top_k=3,
    include_metadata=True,
)
res = index.query(**query_kwargs)

In [None]:
# Display the raw query response for inspection.
res

In [None]:
# Build the LLM context from the text stored with each match.
# - Strip the "query: " marker only at the start of a chunk, where it was
#   added; `str.replace` would also delete it from the middle of the text.
# - Separate chunks with a blank line so the end of one chunk does not run
#   straight into the start of the next.
marker = 'query: '
contexts = '\n\n'.join(
    text[len(marker):] if text.startswith(marker) else text
    for text in (match['metadata']['text'] for match in res['matches'])
)

In [None]:
# Show the index statistics as the cell output.
index.describe_index_stats()

In [None]:
import google.generativeai as genai

In [None]:
# Configure the generative AI client with the key read from the PALM_API_KEY environment variable.
genai.configure(api_key=os.getenv("PALM_API_KEY"))

In [None]:
# Handle to the 'gemini-pro' model that is asked to answer the question below.
gemini = genai.GenerativeModel('gemini-pro')

In [None]:

# Fill the prompt template with the retrieved context and the question.
prompt = """
Given the following context :

{contexts}

try to answer the following question

{question}

or at least summarize what is contained in the context. all right?

""".format(contexts=contexts, question=question)

In [None]:
# Send the assembled prompt to the model.
response = gemini.generate_content(prompt)

In [None]:
# Show the generated answer text as the cell output.
response.text