In [25]:
from dotenv import load_dotenv
import pinecone
import openai
import os
import json
from time import time 
import tqdm
# Typing
from typing import List, Dict, Any, Optional, Union, Tuple

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone.index import Index
from pinecone.index import UpsertResponse

# Data processing stuff
import pandas as pd

from PineconeUtils.Indexer import Indexer,DataEmbedding


In [26]:
load_dotenv()
PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME = os.getenv("PINECONE_API_KEY"),os.getenv("PINECONE_ENVIRONMENT"),os.getenv("PINECONE_INDEX_NAME")
print('PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME: ', PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME:  3efa3801-3d5d-4f24-8259-1d9dc446478d gcp-starter namchat


# Load the Json file in

In [27]:
# Load json file 
with open('./data/qna_01-11-2023.json',encoding="utf-8") as f:
    inserts = json.load(f)

# Extract as list of dicts, 
"""
Example:
[{
    questions:[str],
    answer: str,
}]
"""

def extract_qna_json(json_file_path:str) ->list[dict]:
    with open(json_file_path,encoding="utf-8") as f:
        data = json.load(f)

    final = []

    question_list = data['qnas']
    for question in question_list:
        final.append({
            'questions':question['data']['questions']['en'],
            'answers':question['data']['answers']['en']
        })
    return final

qn_ans_list = extract_qna_json('./data/qna_01-11-2023.json')
qn_ans_list

[{'questions': ['What is Advance Care Planning (ACP)? ',
   'What is the programme that helps me plan out future health and personal care?',
   'Which programme helps me plan for future medical care?',
   'Advance Care Planning (ACP)'],
  'answers': ["- Advance Care Planning (ACP) is the process of planning for future health and personal care your loved one's ACP document tells you and doctors about the treatment options your loved one prefers, should they be unable to communicate their wishes.  \n- This is about creating a plan that can be used if your loved one is unable to make their own decisions. They can also change it at any time, as long as your loved one retains their mental capacity.\n- Your loved one can do an ACP if they have mental capacity. Mental capacity refers to a person’s ability to make a decision for themselves and is specific to the particular decision that needs to be made. \n- A person is deemed to have mental capacity, when they can still:\n     • Understand th

In [28]:
len(qn_ans_list)

139

In [29]:
# Extract questions that have multiple answers
def extract_multi_answer_qns(qn_ans_list:list[dict]) -> list[dict]:
    final = []
    for qn_ans in qn_ans_list:
        if len(qn_ans['answers']) > 1:
            final.append(qn_ans)
    return final

multi_answer_qns = extract_multi_answer_qns(qn_ans_list)
multi_answer_qns[0]['answers']

["Ask for help\n• Tell friends and family how you're feeling: be honest about how you're feeling and what the situation is, having a conversation about how to move forward will be beneficial\n• Spread the responsibility, if possible: divide tasks up to other family members so that they can all pitch in. Tasks can include medical responsibilities, finances, bills, groceries, chores, etc. \n• Don’t be afraid to accept help: others feel good about themselves when helping others, so don't feel guilty\n• Look into respite care if necessary \n\nHear from caregiver Paul on his journey to care for his mother who lives with dementia:\nhttps://m.facebook.com/watch/?v=1006218726996702&_rdr\n\nRespite is important for caregivers and you may need to access this unexpectedly. Planning ahead for respite care will help you to better care for yourself and your loved ones. You can pre-enrol for respite care ahead of time under the Go Respite pilot programme by AIC\nhttps://www.aic.sg/caregiving/go-respi

# Embedding

In [39]:
dataEmbedding = DataEmbedding(openai_embedding=OpenAIEmbeddings(),text_splitter=RecursiveCharacterTextSplitter())

sample = dataEmbedding.prepareJson(qn_ans_list)

100%|██████████| 139/139 [00:48<00:00,  2.86it/s]


In [31]:
sample

[{'id': '0',
  'values': [0.017125344103254253,
   0.000198176511309184,
   0.023978052939949885,
   -0.03931629330984183,
   0.004895251301291879,
   0.048598949431866253,
   0.011423324367761854,
   0.0072834138675015635,
   -0.02908222783683732,
   0.017652474549268056,
   0.014373975041942214,
   0.011680461602283932,
   -0.031036472309321248,
   0.0008031522832241755,
   -0.015209671286969614,
   -0.008221965378866826,
   0.0344306830599546,
   -0.013036861236162895,
   -0.002691905976875784,
   0.01825674830768043,
   -0.01599393989882808,
   0.019658146654920914,
   0.01857817008366367,
   0.022743793469185845,
   0.008626956593088292,
   0.013332569195561672,
   -0.0011707781950328312,
   -0.03617921886240797,
   0.007386268668178136,
   -0.00015146994150083286,
   0.011500465817515253,
   -0.010774052827310544,
   0.005534880393354661,
   -0.0076176925517770396,
   0.012979004683186553,
   -0.003767061323938762,
   -0.00895480635755636,
   -0.018089609431203985,
   0.010645483

In [38]:
pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
index = pinecone.Index(index_name=INDEX_NAME)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10}

In [40]:
index.upsert(sample)

{'upserted_count': 139}

In [32]:
indexer = Indexer(PINECONE_API_KEY=os.getenv('PINECONE_API_KEY'),INDEX_NAME=os.getenv('PINECONE_INDEX_NAME'),PINECONE_ENVIRONMENT=os.getenv('PINECONE_ENVIRONMENT'))

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10}


In [33]:
indexer.upsertQNA(qn_ans_list)

data_batches:  [[{'questions': ['What is Advance Care Planning (ACP)? ', 'What is the programme that helps me plan out future health and personal care?', 'Which programme helps me plan for future medical care?', 'Advance Care Planning (ACP)'], 'answers': ["- Advance Care Planning (ACP) is the process of planning for future health and personal care your loved one's ACP document tells you and doctors about the treatment options your loved one prefers, should they be unable to communicate their wishes.  \n- This is about creating a plan that can be used if your loved one is unable to make their own decisions. They can also change it at any time, as long as your loved one retains their mental capacity.\n- Your loved one can do an ACP if they have mental capacity. Mental capacity refers to a person’s ability to make a decision for themselves and is specific to the particular decision that needs to be made. \n- A person is deemed to have mental capacity, when they can still:\n     • Understa

100%|██████████| 10/10 [00:03<00:00,  2.82it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.93it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.91it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.92it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  3.00it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.82it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  3.02it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.62it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.94it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


100%|██████████| 10/10 [00:03<00:00,  2.91it/s]


Upserted 10 rows, upsert_response: {'upserted_count': 10}


 20%|██        | 2/10 [00:01<00:04,  1.93it/s]


KeyboardInterrupt: 

# Querer

In [None]:
class PineconeQuery:
    """Main class to query both text/sentence/images with pinecone"""
    embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    NAMESPACES = ["personal","experience","projects","thoughts"]

    def _initPinecone(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> pinecone.index:
        """Init Pinecone stuff"""

        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        # connect to index
        index:Index = pinecone.Index(INDEX_NAME)
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """ Initialize the Pinecone Index from .env file pinecone variables"""
        start_time = time()
        
        # connect to index
        self.index:Index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        self.embedding_model:OpenAIEmbeddings = self.embedding_model
        self.namespaces:list[str]  = ["personal","experience","projects","thoughts"]
        # Init the namespaces docsearch
        # self.docsearch = Pinecone.from_existing_index(INDEX_NAME, embedding_model) # default namespace
        # self.docsearch_personal = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="personal")
        # self.docsearch_experience = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="experience")
        # self.docsearch_projects = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="projects")
        # self.docsearch_thoughts = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="thoughts")
        print(f'Successfully connected to Pinecone Index:\n{self.index.describe_index_stats()},took {time() - start_time} seconds')


    def _checkValidNamespace(self,namespace:str) -> bool:
        # Check if namespace is valid
        if namespace not in self.namespaces:
            raise Warning(f"Namespace not found, must be one of the following {self.namespaces}. using default namespace= None")
        
        return True
    


    def query(self,query:str,namespace:str=None,top_k:int = 3) -> list[dict]:
        """Select a query and fetch the results

        Raises ValueError if namespace is not one of the following:
            ValueError: Namespace must be one of the following ['personal', 'experience', 'projects', 'thoughts']

        Args:
            query (str): Query to search
            namespace (str): Namespace to search can be one of the following
                >personal experience, projects, thoughts

        Returns:
            list[dict]: List of matched documents, Top 3 relevant documents
            Example matched_docs[
                {'id': '4',
  'score': 0.737784088,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'WTH ( Finalist ) Activities: NUS LifeHack 2021 ( Participant ) SUTD What The Hack: Environment 2021 ( Participant ) Appetizer Hackathon 2021 ( Participant ) SPAI Beginner Machine Learning Bootcamp 2021 SPAI Advance Machine Learning Workshop 2021 SEED Code League 2021 ( Participant ) NUS LifeHack 2022 ( Participant ) NTUtion 2022 Hackathon ( Participant ) NUS LifeHack 2023( Participant )'}},
 {'id': '2',
  'score': 0.728767276,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'Activities:\nNUS LifeHack 2021 ( Participant )\nSUTD What The Hack: Environment 2021 ( Participant )\nAppetizer Hackathon 2021 ( Participant )\nSPAI Beginner Machine Learning Bootcamp 2021\nSPAI Advance Machine Learning Workshop 2021\nSEED Code League 2021 ( Participant )\nNUS LifeHack 2022 ( Participant )\nNTUtion 2022 Hackathon ( Participant )'}},
 {'id': '3',
  'score': 0.725435615,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'NUS LifeHack 2023( Participant )Achievement: Polyfintech100 API Hackthon 2023 ( Champion ) Batey Hackathon 2022 ( Champion Gold ) Polyfintech100 API Hackthon 2023 ( 1st runner up ) Polyfintech 2022 ( 1st Runner-Up) DSAC AI Makerspace Holiday Challenge 2021 ( Champion ) FPG FIT Hack 2021 ( Finalist ) SUTD WTH ( Finalist ) Activities:'}}
   ]
        """

        query_embedding = self.embedding_model.embed_query(query)
        # Get the top 3 results
        results = self.index.query(query_embedding,top_k=top_k,include_metadata=True)
        print('results: ', results)
        
        results_dict = results.to_dict()
        print('results_dict: ', results_dict)
        matched_docs = results_dict["matches"]
        return matched_docs


# Test the class
pineconeQuery = PineconeQuery(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10},took 0.8778841495513916 seconds


In [None]:
def parseQnDocs(questions:list[str]) -> str:
    """Format the questions to a single string"""
    final = ""
    for qn in questions:
        final += f"{qn}\n"
    return final