# Installing Libraries

In [17]:
from dotenv import load_dotenv
import pinecone
import openai
import os
import json
from time import time 
# Typing
from typing import List, Dict, Any, Optional, Union, Tuple

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone.index import Index
# Embedding model
from InstructorEmbedding import INSTRUCTOR

## Loading .env variables

In [18]:
# Load variables from the .env file
vaild = load_dotenv()

# Access the variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT= os.getenv("PINECONE_ENVIRONMENT")

# openai.api_key = OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["AZURE_OPENAI_API_KEY"] = os.environ.get("API_KEY")
os.environ["AZURE_OPENAI_API_BASE"] = os.environ.get("API_BASE")
vaild


True

# Connecting to Pinecone Index

In [19]:
import pinecone

# initialize connection to pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # app.pinecone.io (console)
    environment=PINECONE_ENVIRONMENT  # next to API key in console
)
# connect to index
index = pinecone.Index(INDEX_NAME)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1},
                'experience': {'vector_count': 6},
                'personal': {'vector_count': 6}},
 'total_vector_count': 13}

# Chunking + procesing 

In [4]:
import tiktoken

tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Define the Text-Embedding Model

In [8]:
embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002')

def embedded_text_splitter(text:str)->list[float]:
    split_embedded_text = text_splitter.split_text(text)
    all = []
    for i,text in enumerate(split_embedded_text):
        print(text)
        vector_text = embedding_model.embed_query(text)
        print('vector_text: ', vector_text)

        id = str(i)

        all.append((id, vector_text,{"text":text,"isImage":False,"categories":"personal"}))

        print('to_upsert: ', all)

    index.upsert(vectors = all,namespace="experience")
    
    

test_text = """The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."""
test_text2 = """
EducationEducation
Singapore Polytechnic logo
Singapore PolytechnicSingapore Polytechnic
Diploma of Education, Applied AI and AnalyticsDiploma of Education, Applied AI and Analytics
Apr 2021 - Apr 2024Apr 2021 - Apr 2024
Activities and societies: SPAI, SP guitarist
SP Guitarist Vice-President ( March 2022-Current )Activities and societies: SPAI, SP guitarist SP Guitarist Vice-President ( March 2022-Current )
Achievement:
Polyfintech100 API Hackthon 2023 ( Champion )
Batey Hackathon 2022 ( Champion Gold )
Polyfintech100 API Hackthon 2023 ( 1st runner up )
Polyfintech 2022 ( 1st Runner-Up)
DSAC AI Makerspace Holiday Challenge 2021 ( Champion )
FPG FIT Hack 2021 ( Finalist )
SUTD WTH ( Finalist )

Activities:
NUS LifeHack 2021 ( Participant )
SUTD What The Hack: Environment 2021 ( Participant )
Appetizer Hackathon 2021 ( Participant )
SPAI Beginner Machine Learning Bootcamp 2021
SPAI Advance Machine Learning Workshop 2021
SEED Code League 2021 ( Participant )
NUS LifeHack 2022 ( Participant )
NTUtion 2022 Hackathon ( Participant )
NUS LifeHack 2023( Participant )Achievement: Polyfintech100 API Hackthon 2023 ( Champion ) Batey Hackathon 2022 ( Champion Gold ) Polyfintech100 API Hackthon 2023 ( 1st runner up ) Polyfintech 2022 ( 1st Runner-Up) DSAC AI Makerspace Holiday Challenge 2021 ( Champion ) FPG FIT Hack 2021 ( Finalist ) SUTD WTH ( Finalist ) Activities: NUS LifeHack 2021 ( Participant ) SUTD What The Hack: Environment 2021 ( Participant ) Appetizer Hackathon 2021 ( Participant ) SPAI Beginner Machine Learning Bootcamp 2021 SPAI Advance Machine Learning Workshop 2021 SEED Code League 2021 ( Participant ) NUS LifeHack 2022 ( Participant ) NTUtion 2022 Hackathon ( Participant ) NUS LifeHack 2023( Participant )
Skills: Natural Language Processing (NLP) · Data Engineering · Python (Programming Language)
"""
vector_text = embedded_text_splitter(test_text2)

# index.upsert(vectors = vector_text, ids=["test"], meta=meta)


EducationEducation
Singapore Polytechnic logo
Singapore PolytechnicSingapore Polytechnic
Diploma of Education, Applied AI and AnalyticsDiploma of Education, Applied AI and Analytics
Apr 2021 - Apr 2024Apr 2021 - Apr 2024
Activities and societies: SPAI, SP guitarist
SP Guitarist Vice-President ( March 2022-Current )Activities and societies: SPAI, SP guitarist SP Guitarist Vice-President ( March 2022-Current )
vector_text:  [-0.015419459408313027, -0.018333790237118893, 0.015141273364177596, -0.03478651382467682, -0.030203065799735824, 0.02678535047884619, -0.02277152167405347, 0.03301142050484495, -0.008550911868162784, -0.02073148944196358, 0.02644092877532371, -0.002523545560696266, -0.0009016210972514315, 0.001654213939942408, 0.012664092299390993, -0.016134796147790266, 0.02300996632255667, -0.010643931696439766, 0.0022569505241229327, 0.0015159488636170308, -0.03907852867360497, 0.015154520496055007, 0.00854428830222408, -0.017380009780461, 0.001453853747848898, 0.01655869877993211

# Pinecone UpSert

In [6]:

class PineconeUpsert:
    """Main class to upsert both text/sentence/images with pinecone"""

    def _initPinecone(PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> pinecone.index:
        """Init Pinecone stuff"""

        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        # connect to index
        index:pinecone.index = pinecone.Index(INDEX_NAME)
        print(f'Successfully connected to Pinecone Index:\n{index.describe_index_stats()}')
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """ Initialize the Pinecone Index from .env file pinecone variables"""
        # connect to index
        self.index:Index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        self.embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
        self.text_splitter:RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(
                chunk_size=10,
                chunk_overlap=1,
                length_function=tiktoken_len,
                separators=["\n\n", "\n", " ", ""]
        )

        # List of metadata options per key
        self.metadata_options = {
            "text": "",
            "isImage": [True, False],
            "categories": ["personal","experience","projects","thoughts"]
        }
        
    def create_and_index_embeddings_text(self,data:str):
        """Upsert text to Pinecone Index"""
        # split text
        split_text = self.text_splitter.split_text(data)
        # embed text
        embedded_text = [embedding_model.embed_query(text) for text in split_text]
        # upsert to pinecone
        self.index.upsert(items=embedded_text,ids=split_text)
        print(f'Successfully upserted text to Pinecone Index:\n{self.index.describe_index_stats()}')


    def insertImage(self,embedded_data:list[float], isImage:bool=True, image_descripton:str):
    
        """Upsert Image to Pinecone """



    

# Test the class
pinecone_upsert = PineconeUpsert(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

TypeError: PineconeUpsert._initPinecone() takes 3 positional arguments but 4 were given

# Pinecone Query

In [41]:
from langchain.vectorstores.pinecone import Pinecone

class PineconeQuery:
    """Main class to query both text/sentence/images with pinecone"""
    embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    NAMESPACES = ["personal","experience","projects","thoughts"]

    def _initPinecone(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> pinecone.index:
        """Init Pinecone stuff"""

        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        # connect to index
        index:Index = pinecone.Index(INDEX_NAME)
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """ Initialize the Pinecone Index from .env file pinecone variables"""
        start_time = time()
        
        # connect to index
        self.index:Index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        self.embedding_model:OpenAIEmbeddings = embedding_model
        self.namespaces:list[str]  = ["personal","experience","projects","thoughts"]
        # Init the namespaces docsearch
        # self.docsearch = Pinecone.from_existing_index(INDEX_NAME, embedding_model) # default namespace
        # self.docsearch_personal = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="personal")
        # self.docsearch_experience = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="experience")
        # self.docsearch_projects = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="projects")
        # self.docsearch_thoughts = Pinecone.from_existing_index(INDEX_NAME, embedding_model, namespace="thoughts")
        print(f'Successfully connected to Pinecone Index:\n{index.describe_index_stats()},took {time() - start_time} seconds')


    def _checkValidNamespace(self,namespace:str) -> bool:
        # Check if namespace is valid
        if namespace not in self.namespaces:
            raise ValueError(f"Namespace must be one of the following {self.namespaces}")
        return True

    def fetch(self,query:str,namespace:str,top_k:int = 3) -> list[dict]:
        """Select a query and fetch the results

        Raises ValueError if namespace is not one of the following:
            ValueError: Namespace must be one of the following ['personal', 'experience', 'projects', 'thoughts']

        Args:
            query (str): Query to search
            namespace (str): Namespace to search can be one of the following
                >personal experience, projects, thoughts

        Returns:
            list[dict]: List of matched documents, Top 3 relevant documents
            Example matched_docs[
                {'id': '4',
  'score': 0.737784088,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'WTH ( Finalist ) Activities: NUS LifeHack 2021 ( Participant ) SUTD What The Hack: Environment 2021 ( Participant ) Appetizer Hackathon 2021 ( Participant ) SPAI Beginner Machine Learning Bootcamp 2021 SPAI Advance Machine Learning Workshop 2021 SEED Code League 2021 ( Participant ) NUS LifeHack 2022 ( Participant ) NTUtion 2022 Hackathon ( Participant ) NUS LifeHack 2023( Participant )'}},
 {'id': '2',
  'score': 0.728767276,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'Activities:\nNUS LifeHack 2021 ( Participant )\nSUTD What The Hack: Environment 2021 ( Participant )\nAppetizer Hackathon 2021 ( Participant )\nSPAI Beginner Machine Learning Bootcamp 2021\nSPAI Advance Machine Learning Workshop 2021\nSEED Code League 2021 ( Participant )\nNUS LifeHack 2022 ( Participant )\nNTUtion 2022 Hackathon ( Participant )'}},
 {'id': '3',
  'score': 0.725435615,
  'values': [],
  'metadata': {'categories': 'personal',
   'isImage': False,
   'text': 'NUS LifeHack 2023( Participant )Achievement: Polyfintech100 API Hackthon 2023 ( Champion ) Batey Hackathon 2022 ( Champion Gold ) Polyfintech100 API Hackthon 2023 ( 1st runner up ) Polyfintech 2022 ( 1st Runner-Up) DSAC AI Makerspace Holiday Challenge 2021 ( Champion ) FPG FIT Hack 2021 ( Finalist ) SUTD WTH ( Finalist ) Activities:'}}
   ]
        """
        # Check if namespace is valid
        self._checkValidNamespace(namespace)

        query_embedding = embedding_model.embed_query(query)
        # Get the top 3 results
        results = self.index.query(query_embedding,top_k=top_k,namespace=namespace,include_metadata=True)
        results_dict = results.to_dict()
        matched_docs = results_dict["matches"]

        return matched_docs


# Test the class
pineconeQuery = PineconeQuery(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

pineconeQuery.fetch(query="education journey",namespace="experience")

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1},
                'experience': {'vector_count': 6},
                'personal': {'vector_count': 6}},
 'total_vector_count': 13},took 0.682253360748291 seconds


In [44]:
results = pineconeQuery.fetch(query="education journey",namespace="personal")
# Concat all the text into one string
text = ""
for result in results:
    text += result["metadata"]["text"] + " "
print(text)

EducationEducation
Singapore Polytechnic logo
Singapore PolytechnicSingapore Polytechnic
Diploma of Education, Applied AI and AnalyticsDiploma of Education, Applied AI and Analytics
Apr 2021 - Apr 2024Apr 2021 - Apr 2024
Activities and societies: SPAI, SP guitarist
SP Guitarist Vice-President ( March 2022-Current )Activities and societies: SPAI, SP guitarist SP Guitarist Vice-President ( March 2022-Current ) Skills: Natural Language Processing (NLP) · Data Engineering · Python (Programming Language) Activities:
NUS LifeHack 2021 ( Participant )
SUTD What The Hack: Environment 2021 ( Participant )
Appetizer Hackathon 2021 ( Participant )
SPAI Beginner Machine Learning Bootcamp 2021
SPAI Advance Machine Learning Workshop 2021
SEED Code League 2021 ( Participant )
NUS LifeHack 2022 ( Participant )
NTUtion 2022 Hackathon ( Participant ) 


# Testing raw pinecone query without the Lnagchain wrapper

In [20]:
from api.PineconeUtils.Query import PineconeQuery

pineconeQuery = PineconeQuery(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1},
                'experience': {'vector_count': 6},
                'personal': {'vector_count': 6}},
 'total_vector_count': 13},took 1.0480186939239502 seconds


In [21]:
pineconeQuery.fetch(query="education journey",namespace="experience")

AuthenticationError: Incorrect API key provided: sk-C8eg4***************************************6WG1. You can find your API key at https://platform.openai.com/account/api-keys.

In [38]:
query_input = "not sure what to do next"
# Embedd the query_input
query_embedding = embedding_model.embed_query(query_input)
results = index.query(query_embedding,top_k=3,namespace="ss",include_metadata=True)
results_dict = results.to_dict()
results_dict['matches']

[]