# Installing Libraries

In [2]:
from dotenv import load_dotenv
import pinecone
import openai
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embedding model
from InstructorEmbedding import INSTRUCTOR

  from tqdm.autonotebook import tqdm


## Loading .env variables

In [3]:
# Load variables from the .env file
load_dotenv('./env')

# Access the variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT= os.getenv("PINECONE_ENVIRONMENT")

openai.api_key = OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


# Connecting to Pinecone Index

In [4]:
import pinecone

# initialize connection to pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # app.pinecone.io (console)
    environment=PINECONE_ENVIRONMENT  # next to API key in console
)
# connect to index
index = pinecone.Index(INDEX_NAME)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

# Chunking + procesing 

In [5]:
import tiktoken

tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Define the Text-Embedding Model

In [6]:
# def create_and_index_embeddings(data, model, index):
#     print(data)
#     batch_size = 32
#     for i in range(0, len(data), batch_size):
#         text_batch = [item["text"] for item in data[i:i+batch_size]]
#         ids_batch = [str(n) for n in range(i, i+min(batch_size, len(data)-i))]
#         print('ids_batch: ', ids_batch)
#         res = openai.Embedding.create(input=text_batch, engine=model)
#         embeds = [record["embedding"] for record in res["data"]]
#         print('embeds: ', embeds)

#         # prep metadata and upsert batch
#         meta = [{'text': line} for line in text_batch]

#         to_upsert = zip(ids_batch, embeds, meta)
#         print('to_upsert list', list(to_upsert))
        
#         # index.upsert(vectors=list(to_upsert))

# import jsonlines

# def load_data(file_path):
#     data = []
#     with jsonlines.open(file_path) as f:
#         for item in f:
#             data.append(item)
#     return data
# train_data = load_data("train.jsonl")
# create_and_index_embeddings(train_data,"text-embedding-ada-002","dwasd")

In [7]:
# from InstructorEmbedding import INSTRUCTOR
# model = INSTRUCTOR('hkunlp/instructor-large')
# sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
# instruction = "Represent the Science title:"
# INSTRUCTION = f"""Represent the domain text_type for task_objective:"""

# embeddings = model.encode([[instruction,sentence]])
# print(embeddings)


In [18]:
embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002')

def embedded_text_splitter(text:str)->list[float]:
    split_embedded_text = text_splitter.split_text(text)
    all = []
    for i,text in enumerate(split_embedded_text):
        print(text)
        vector_text = embedding_model.embed_query(text)
        print('vector_text: ', vector_text)

        id = str(i)

        all.append((id, vector_text,{"text":text,"isImage":False,"categories":"personal"}))

        print('to_upsert: ', all)

    index.upsert(vectors = all,namespace="personal")
    
    

test_text = """The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."""
vector_text = embedded_text_splitter(test_text)

# index.upsert(vectors = vector_text, ids=["test"], meta=meta)


The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
vector_text:  [-0.0050512995325097095, 0.002053068512973587, -0.011899033722832998, -0.010309157101646624, -0.016549734623322293, 0.018552728249085666, -0.025563205007934887, -0.01791427308790947, -0.009977411614163238, -0.014972376840353794, 0.022959314598294124, 0.017826643083029507, 0.019666893674605657, -0.0029997958743133565, 2.9951991061292723e-05, -0.013269832537851703, 0.02638944087973406, 0.0008144985296473261, 0.004494216819749714, -0.013858211414833803, -0.01050945665048748, 0.015185195227412527, -0.00858157512270878, -0.005082596162393108, 0.00182460192738848, 0.014509184482905288, 0.014158660738095085, -0.0018230370726112451, 3.420639815014245e-05, -0.0028495712126827156, 0.009583071935590467, -0.002148523490231664, -0.03447652654944964, -0.007141924094475633, -0.006672472317918189, -0.017150633269844857, 0.0011814532496184917, -0.004234453

# Pinecone UpSert

In [None]:

class PineconeUpsert:
    """Main class to upsert both text/sentence/images with pinecone"""

    def _initPinecone(PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> pinecone.index:
        """Init Pinecone stuff"""

        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        # connect to index
        index:pinecone.index = pinecone.Index(INDEX_NAME)
        print(f'Successfully connected to Pinecone Index:\n{index.describe_index_stats()}')
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """ Initialize the Pinecone Index from .env file pinecone variables"""
        # connect to index
        self.index:pinecone.index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        self.embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
        self.text_splitter:RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(
                chunk_size=10,
                chunk_overlap=1,
                length_function=tiktoken_len,
                separators=["\n\n", "\n", " ", ""]
        )

        # List of metadata options per key
        self.metadata_options = {
            "text": "",
            "isImage": [True, False],
            "categories": ["personal","experience","projects","thoughts"]
        }
        

    
    def create_and_index_embeddings_text(self,data:str):
        """Upsert text to Pinecone Index"""
        # split text
        split_text = self.text_splitter.split_text(data)
        # embed text
        embedded_text = [embedding_model.embed_query(text) for text in split_text]
        # upsert to pinecone
        self.index.upsert(items=embedded_text,ids=split_text)
        print(f'Successfully upserted text to Pinecone Index:\n{self.index.describe_index_stats()}')



    

# Test the class
pinecone_upsert = PineconeUpsert(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

SyntaxError: invalid syntax (927392540.py, line 31)

# Pinecone Query

In [20]:
from langchain.vectorstores.pinecone import Pinecone

class PineconeQuery:
    """Main class to query both text/sentence/images with pinecone"""
    embedding_model:OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

    def _initPinecone(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str) -> pinecone.index:
        """Init Pinecone stuff"""

        pinecone.init(api_key=PINECONE_API_KEY,environment=PINECONE_ENVIRONMENT)
        # connect to index
        index:pinecone.index = pinecone.Index(INDEX_NAME)
        print(f'Successfully connected to Pinecone Index:\n{index.describe_index_stats()}')
        return index

    def __init__(self,PINECONE_API_KEY:str,PINECONE_ENVIRONMENT:str,INDEX_NAME:str):
        """ Initialize the Pinecone Index from .env file pinecone variables"""
        # connect to index
        self.index = self._initPinecone(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)
        self.embedding_model:OpenAIEmbeddings = embedding_model
        self.docsearch = Pinecone.from_existing_index(INDEX_NAME, embedding_model)

    
    def fetch(self,query:str):
        """Select a query and fetch the results"""
        retriever = self.docsearch.as_retriever(search_type="mmr",search_kwargs={"k": 3,"score_threshold": 0.6})
        matched_docs = retriever.get_relevant_documents(query)

        return matched_docs


# Test the class
pineconeQuery = PineconeQuery(PINECONE_API_KEY,PINECONE_ENVIRONMENT,INDEX_NAME)

pineconeQuery.fetch("i like to eat shit")

Successfully connected to Pinecone Index:
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}


[Document(page_content='The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.', metadata={'categories': 'personal', 'isImage': False})]