### Pinecone podojo
Storing podojo docu in pinecone.

In [11]:
import pinecone
from dotenv import load_dotenv
import os
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import json
import csv
import uuid
import openai

In [23]:
load_dotenv(".env")
pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), environment="us-west1-gcp")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [13]:
pinecone.list_indexes()

['podojo-docu']

In [14]:
index = pinecone.Index("podojo-docu")

### Load file

In [15]:
filename = 'culture-map'

In [16]:
# Read CSV file
csv_file = 'embeddings/' + filename + '.csv'
json_file = 'json/' + filename + '.json'
df = pd.read_csv(csv_file)

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,n_tokens,embeddings
0,0,WHAT IS A CULTURE MAP?,8,"[-0.021535981446504593, -0.0006195246824063361..."
1,1,With the Culture Map you can visualize the cul...,37,"[-0.0366937592625618, 0.0026987437158823013, 0..."
2,2,WHEN TO USE IT?,6,"[0.011209188960492611, 0.004226140212267637, 0..."
3,3,We use the Culture Map to ...,7,"[-0.015473390929400921, 0.010065949521958828, ..."
4,4,Formulate hypotheses in a change process,7,"[0.017886145040392876, 0.005050803534686565, -..."


In [18]:
# Load CSV into a DataFrame and filter the columns
df = pd.read_csv(csv_file, usecols=['text', 'embeddings'])

In [19]:
df.head()

Unnamed: 0,text,embeddings
0,WHAT IS A CULTURE MAP?,"[-0.021535981446504593, -0.0006195246824063361..."
1,With the Culture Map you can visualize the cul...,"[-0.0366937592625618, 0.0026987437158823013, 0..."
2,WHEN TO USE IT?,"[0.011209188960492611, 0.004226140212267637, 0..."
3,We use the Culture Map to ...,"[-0.015473390929400921, 0.010065949521958828, ..."
4,Formulate hypotheses in a change process,"[0.017886145040392876, 0.005050803534686565, -..."


### Upsert data to pinecone

In [20]:
# Convert embeddings to lists (keep all dimensions)
df['embeddings'] = df['embeddings'].apply(lambda x: json.loads(x))

# Generate output data
data = []
for idx, row in df.iterrows():
    # Generate a random ID
    random_id = str(uuid.uuid4())

    # Add the text and URL to metadata
    metadata = {"text": row['text'], "url": "your_url_here"}

    data.append((random_id, row['embeddings'], metadata))

# Upsert data into the vector database
index.upsert(data)


{'upserted_count': 108}

In [21]:
# Get the vector of the first entry
query_vector = df.loc[0, 'embeddings']

# Perform the query
result = index.query(vector=query_vector, top_k=3, include_values=True, include_metadata=True)

# Print the result
print(result)


{'matches': [{'id': 'c7334d2e-ef5c-417f-a0bb-7925c3850993',
              'metadata': {'text': 'WHAT IS A CULTURE MAP?',
                           'url': 'your_url_here'},
              'score': 1.00000012,
              'values': [-0.0215359814,
                         -0.000619524682,
                         0.00820815843,
                         -0.00766558526,
                         -0.000561267603,
                         0.0270173624,
                         -0.0121661602,
                         0.00928634871,
                         -0.00289372378,
                         -0.0375905819,
                         -0.0127226459,
                         -0.0108792884,
                         -0.0195187237,
                         0.0230384916,
                         0.0020224764,
                         0.0323596224,
                         0.0218142252,
                         -0.0073595182,
                         -0.00916114,
                         0.001401

### Query the data

In [33]:
def complete(prompt):
    # query text-davinci-003
    res = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

In [24]:
embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=embed_model
)

In [41]:
query = "Give 5 key points from the data?"

In [42]:
res = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
xq = res['data'][0]['embedding']

# get relevant contexts (including the questions)
res = index.query(xq, top_k=2, include_metadata=True)

In [43]:
res

{'matches': [{'id': 'd47d5cb7-a5db-4c14-aab8-5b7babfe0054',
              'metadata': {'text': 'What results are we seeing?',
                           'url': 'your_url_here'},
              'score': 0.803712368,
              'values': []},
             {'id': 'c618ba71-0ed5-4e4f-962f-d157fbd8c43d',
              'metadata': {'text': 'What results are we seeing?',
                           'url': 'your_url_here'},
              'score': 0.803712368,
              'values': []}],
 'namespace': ''}

In [44]:
limit = 3750

def retrieve(query):
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting limit
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
        elif i == len(contexts)-1:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts) +
                prompt_end
            )
    return prompt

In [45]:
# first we retrieve relevant items from Pinecone
query_with_contexts = retrieve(query)
query_with_contexts

'Answer the question based on the context below.\n\nContext:\nWhat results are we seeing?\n\n---\n\nWhat results are we seeing?\n\n---\n\nWhat is the impact?\n\nQuestion: Give 5 key points from the data?\nAnswer:'

In [46]:
# then we complete the context-infused query
complete(query_with_contexts)

'The five key points from the data could include: 1) the overall success rate, 2) the average time to completion, 3) the number of users who completed the task, 4) the number of users who abandoned the task, and 5) any other relevant metrics.'