In [1]:
from dotenv import load_dotenv
import pinecone
import openai
import os
import json
from time import time 
import tqdm
# Typing
from typing import List, Dict, Any, Optional, Union, Tuple

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone.index import Index
from pinecone.index import UpsertResponse
from langchain.chains import LLMChain
from llm.chains import output_chain
# Data processing stuff
import pandas as pd

from PineconeUtils.Indexer import Indexer,DataEmbedding


  from tqdm.autonotebook import tqdm


In [2]:
# Read the Pinecone connection settings from the .env file / process environment.
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

# Never echo the key itself: cell output is saved inside the notebook file and
# gets shared with it. Only report whether a key was found.
api_key_display = "<redacted>" if PINECONE_API_KEY else "<missing>"
print('PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME: ', api_key_display,PINECONE_ENVIRONMENT,INDEX_NAME)

PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME:  <redacted> gcp-starter namchat


In [3]:
class PineconeQuery:
    """Query a Pinecone index with OpenAI embeddings of free-text queries.

    Class attributes:
        embedding_model: OpenAIEmbeddings client ('text-embedding-ada-002'),
            created once when the class body runs and shared by all instances.
        NAMESPACES: Namespace names accepted by `query`.
    """
    embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    NAMESPACES = ["personal","experience","projects","thoughts"]

    def _initPinecone(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> Index:
        """Initialise the Pinecone client and return a handle to the index.

        Args:
            PINECONE_API_KEY (str): API key passed to `pinecone.init`.
            PINECONE_ENVIRONMENT (str): Environment passed to `pinecone.init`.
            INDEX_NAME (str): Name of the index to open.

        Returns:
            Index: The object returned by `pinecone.Index(INDEX_NAME)`.
        """
        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        index:Index = pinecone.Index(INDEX_NAME)
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """Connect to the Pinecone index and print its stats and the connection time.

        Args:
            PINECONE_API_KEY (str): Pinecone API key.
            PINECONE_ENVIRONMENT (str): Pinecone environment name.
            INDEX_NAME (str): Name of the index to query.
        """
        start_time = time()

        self.index:Index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        # Pin the shared class-level model on the instance.
        self.embedding_model:OpenAIEmbeddings = self.embedding_model
        # Copy, so changing one instance's list never mutates the class-level list.
        self.namespaces:list[str] = list(self.NAMESPACES)
        print(f'Successfully connected to Pinecone Index:\n{self.index.describe_index_stats()},took {time() - start_time} seconds')

    def _checkValidNamespace(self,namespace:str) -> bool:
        """Return True if `namespace` is one of `self.namespaces`.

        Raises:
            ValueError: If `namespace` is not in `self.namespaces`.
        """
        if namespace not in self.namespaces:
            raise ValueError(f"Namespace must be one of the following {self.namespaces}, got {namespace!r}")

        return True

    def query(self,query:str,namespace:Optional[str]=None,top_k:int = 3) -> list[dict]:
        """Embed `query` and return the closest matches from the index.

        Args:
            query (str): Text to search for.
            namespace (str, optional): Namespace to search; must be one of
                `self.namespaces`. None or "" leaves the namespace argument
                out of the request entirely.
            top_k (int): Number of matches requested from the index. Defaults to 3.

        Raises:
            ValueError: If a non-empty `namespace` is not in `self.namespaces`.

        Returns:
            list[dict]: The "matches" list of the query response converted to
            a dict. Each match looks like, for example:
                {'id': '4', 'score': 0.737784088, 'values': [],
                 'metadata': {...}}
        """
        # Validate before spending an embedding call on a request that cannot succeed.
        if namespace:
            self._checkValidNamespace(namespace)

        query_embedding = self.embedding_model.embed_query(query)

        query_kwargs = {"top_k": top_k, "include_metadata": True}
        if namespace:
            # Only send a namespace when one was requested, so the default
            # call is exactly the request that was made before.
            query_kwargs["namespace"] = namespace
        results = self.index.query(query_embedding,**query_kwargs)
        # NOTE(review): these two prints dump the full response on every call;
        # consider dropping them once debugging is done.
        print('results: ', results)

        results_dict = results.to_dict()
        print('results_dict: ', results_dict)
        matched_docs = results_dict["matches"]
        return matched_docs


# Build the query helper from the connection settings read from the environment
pineconeQuery = PineconeQuery(
    PINECONE_API_KEY=PINECONE_API_KEY,
    PINECONE_ENVIRONMENT=PINECONE_ENVIRONMENT,
    INDEX_NAME=INDEX_NAME,
)

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.00139,
 'namespaces': {'': {'vector_count': 139}},
 'total_vector_count': 139},took 0.5850510597229004 seconds


In [4]:
# Smoke-test query: ask the index for the five closest matches
docs = pineconeQuery.query(query="DSG,DSG,DSG,DSG", top_k=5)

results:  {'matches': [{'id': '3',
              'metadata': {'answers': ['1. <b> Caregiver support service: </b> '
                                       'provides services to support and '
                                       'empower caregivers such as supporting '
                                       'them in self-care, health and wellness '
                                       'activities, stress management, peer '
                                       'support, future planning and emotional '
                                       'support.\n'
                                       '2. <b> Dementia daycare: </b> New '
                                       'Horizon Centres - provides day care '
                                       'services for people with dementia.\n'
                                       '3. <b> Family of Wisdom programme: '
                                       '</b> a weekly 3-hour enrichment '
                                       'programme to p

In [5]:
def extractQuestionsFromDocs(matched_docs:list[dict]) -> list[list[str]]:
    """Collect the "questions" metadata entry of every matched document.

    The result keeps the order of `matched_docs`, so position i in the result
    belongs to match i.

    Args:
        matched_docs (list[dict]): Matched documents; each must have a
            "metadata" dict containing a "questions" entry
            (presumably a list of question strings; verify against the index).
    Returns:
        list[list[str]]: One "questions" entry per matched document.
    Raises:
        KeyError: If a document has no "metadata" key or its metadata has no
            "questions" key.
    """
    return [doc["metadata"]["questions"] for doc in matched_docs]

def _letterLabel(position:int) -> str:
    """Turn a 1-based position into a lowercase letter label: 1->a, 26->z, 27->aa, 28->ab."""
    label = ""
    while position > 0:
        # Shift to 0-based before dividing so that 26 maps to "z" and not "a?".
        position, offset = divmod(position - 1, 26)
        label = chr(ord("a") + offset) + label
    return label


def formatQuestionPrompt(questions_list:list[list[str]]) -> str:
    """Format lists of questions as a single text block for the LLM prompt.

    Every list gets a "---Question List N" header, followed by one line per
    question labelled "N.a)", "N.b)", ... Labels continue with "aa", "ab", ...
    after "z", so a list may hold more than 26 questions.

    Args:
        questions_list (list[list[str]]): One list of questions per matched document.
    Returns:
        str: The formatted questions; an empty string for an empty input.
    """
    parts = []
    for list_number, questions in enumerate(questions_list, start=1):
        parts.append(f"---Question List {list_number}\n")
        for position, question in enumerate(questions, start=1):
            # The trailing "\n " (newline plus space) is kept as-is so the
            # produced prompt text stays the same as before.
            parts.append(f"{list_number}.{_letterLabel(position)}) {question}\n ")

    return "".join(parts)
        

In [6]:
questions = extractQuestionsFromDocs(docs)
question_prompt = formatQuestionPrompt(questions)

# Save the prompt to a file for inspection. The encoding is pinned so the
# write does not depend on the platform's default text encoding.
with open("test.txt", "w", encoding="utf-8") as text_file:
    text_file.write(question_prompt)

In [7]:
class Orchestrator:
    """Tie a Pinecone query helper to an LLM chain.

    Retrieves the matches for a user question, formats their stored questions
    into a prompt and lets the chain pick the one that fits.
    """

    # Annotations are quoted so defining the class does not require the
    # annotated types to be in scope.
    def __init__(self,pineconeQuery:"PineconeQuery",chain:"LLMChain"):
        """Store the collaborators.

        Args:
            pineconeQuery: Object whose `query(question, top_k=...)` returns matched documents.
            chain: Object whose `run(questions=..., user_question=...)` returns the answer.
        """
        self.pineconeQuery = pineconeQuery
        self.chain = chain

    @staticmethod
    def extractQuestionsFromDocs(matched_docs:list[dict]) -> list[list[str]]:
        """Collect the "questions" metadata entry of every matched document.

        Args:
            matched_docs (list[dict]): Matched documents; each must have a
                "metadata" dict containing a "questions" entry.
        Returns:
            list[list[str]]: One "questions" entry per document, in input order.
        Raises:
            KeyError: If a document lacks "metadata" or its metadata lacks "questions".
        """
        return [doc["metadata"]["questions"] for doc in matched_docs]

    @staticmethod
    def formatQuestionPrompt(questions_list:list[list[str]]) -> str:
        """Format lists of questions as a single text block for the LLM prompt.

        Every list starts with a "---" separator line; questions are labelled
        "<list number>.<question number>)", both counted from 1.

        Args:
            questions_list (list[list[str]]): One list of questions per matched document.
        Returns:
            str: The formatted questions; an empty string for an empty input.
        """
        parts = []
        for list_number, questions in enumerate(questions_list, start=1):
            parts.append("---\n")
            for question_number, question in enumerate(questions, start=1):
                # The trailing "\n " (newline plus space) is kept as-is so the
                # produced prompt text stays the same as before.
                parts.append(f"{list_number}.{question_number}) {question}\n ")

        return "".join(parts)

    def findRelevantQuestion(self,question:str,top_k:int=5)->dict:
        """Ask the chain which stored question best matches the user's question.

        Args:
            question (str): The user's question.
            top_k (int): Number of documents to retrieve from Pinecone. Defaults to 5.

        Returns:
            Whatever `self.chain.run` returns. In the recorded run of this
            notebook that was a dict such as:
            {
                "isRelevantQuestion": True,
                "matched_question": "dementia Loss of identity",
                "question_list_index": 3
            }
            The exact shape is decided by the chain, not by this method.
        """
        docs = self.pineconeQuery.query(question,top_k=top_k)

        questions = Orchestrator.extractQuestionsFromDocs(docs)
        question_prompt = Orchestrator.formatQuestionPrompt(questions)

        return self.chain.run(questions=question_prompt,user_question=question)

# Wire the Pinecone query helper to the chain imported from llm.chains
orchestrator = Orchestrator(pineconeQuery=pineconeQuery, chain=output_chain)

In [8]:
# Run the retrieve-then-match flow on a sample user question
question_json = orchestrator.findRelevantQuestion(
    question="my lover have problem about memory what symoptom"
)

results:  {'matches': [{'id': '34',
              'metadata': {'answers': ['Persons with dementia often have '
                                       'hallucinations, become suspicious, and '
                                       'have paranoia. \n'
                                       'So, some tips for handling this: \n'
                                       '• Validate their feelings about the '
                                       'situation and then as naturally as '
                                       'possible change the subject to '
                                       'something more pleasant, or change '
                                       'what you are doing—for example, get up '
                                       'and take a walk with them. \n'
                                       "A person with Alzheimer's may become "
                                       'suspicious of those around them, even '
                                       'accusing others 

In [9]:
# Display the value returned by orchestrator.findRelevantQuestion
question_json

{'isRelevantQuestion': True,
 'matched_question': 'dementia Loss of identity',
 'question_list_index': 3}