In [1]:
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment before
# any client is constructed; override=True lets the .env values replace
# variables that are already set in the environment.
load_dotenv(find_dotenv(), override = True)

True

In [2]:
import document_loader as dl

In [3]:
# Alternative kept for reference: load individual files one at a time.
# resume_data = dl.load_document('Data/Mamesa El Resume.pdf')
# cold_email = dl.load_document('Data/Cold Email Manuals.pdf')

# Load from the ./Data directory in a single call.
# NOTE(review): presumably returns a list of LangChain documents (it is later
# passed to chunk_data, which calls split_documents on it) - confirm in document_loader.
dir_data = dl.directory_pdf_loader("./Data")

In [4]:
# Chunking documents
def chunk_data(data, chunk_size = 256, chunk_overlap = 0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split; handed unchanged to the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 256.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 0, which matches the previously
            hard-coded value, so existing callers are unaffected.

    Returns:
        Whatever ``split_documents`` returns for ``data``.
    """
    # Imported lazily so the dependency is only needed when chunking is used.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
    )
    return text_splitter.split_documents(data)

def print_embedding_cost(texts, model = 'text-embedding-ada-002', price_per_1k_tokens = 0.0004):
    """Print the token count and estimated embedding cost for a set of chunks.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding. Defaults to the
            previously hard-coded ``'text-embedding-ada-002'``.
        price_per_1k_tokens: USD price per 1,000 tokens. Defaults to the
            previously hard-coded ``0.0004`` so existing calls print the same
            figures.
            NOTE(review): embedding prices differ per model and change over
            time - pass the current rate for the model actually used.

    Returns:
        None. The two result lines are written to stdout.
    """
    # Imported lazily so the dependency is only needed when estimating cost.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator instead of a temporary list: only the running total is needed.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:0.6f}')
    

In [5]:
def insert_or_fetch_embeddings(index_name, chunks, dimensions=1536):
    """Return a LangChain Pinecone vector store for ``index_name``.

    If an index with that name already exists it is loaded as-is and ``chunks``
    is not used. Otherwise the index is created and ``chunks`` are embedded and
    inserted into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed; only used when the index has to be created.
        dimensions: Embedding size. The same value is passed to the embedding
            model and to ``create_index`` so the two cannot drift apart.
            Defaults to 1536 (the previously hard-coded value); the original
            code noted that 512 works as well.
            NOTE(review): when the index already exists its dimension is not
            checked here - make sure it matches ``dimensions``.

    Returns:
        The vector store object produced by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.
    """
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    # No explicit credentials are passed.
    # NOTE(review): presumably the client reads its API key from the environment - confirm.
    pc = pinecone.Pinecone()

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimensions)

    # loading from existing index
    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
        return vector_store

    # creating the index and embedding the chunks into the index
    print(f'Creating index {index_name} and embeddings ...', end='')

    # creating a new index whose dimension matches the embedding model's output
    pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric='cosine',
        spec=PodSpec(
            environment='gcp-starter'
        )
    )

    # processing the input documents, generating embeddings using the provided `OpenAIEmbeddings` instance,
    # inserting the embeddings into the index and returning a new Pinecone vector store object.
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')

    return vector_store

def delete_pinecone_index(index_name = 'all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default value ``'all'``
            is treated as a sentinel: every index returned by
            ``list_indexes().names()`` is deleted.

    Returns:
        None. Progress messages are written to stdout.
    """
    import pinecone
    client = pinecone.Pinecone()

    # Single-index case first: delete just the named index and stop.
    if index_name != 'all':
        print(f"Deleting index {index_name} ...", end = '')
        client.delete_index(index_name)
        print('Ok')
        return

    # Sentinel case: snapshot the current index names, then delete each one.
    existing_names = client.list_indexes().names()
    print("Deleting all indexes ...")
    for name in existing_names:
        client.delete_index(name)
    print('Ok')

    

In [6]:
# Chunk the loaded documents with chunk_size=5000 instead of chunk_data's
# default of 256, then print the token count and estimated embedding cost.
data = chunk_data(dir_data, chunk_size = 5000)
print_embedding_cost(data)

Total Tokens: 4872
Embedding Cost in USD: 0.001949


In [7]:
# WARNING: called with the default index_name='all', this deletes EVERY index
# the Pinecone client lists, not just the one this notebook creates.
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ...
Ok


In [8]:
# Name of the Pinecone index to load or create.
index_name = 'personaldata'
# Loads the index if it already exists; otherwise creates it and embeds `data` into it.
vector_store = insert_or_fetch_embeddings(index_name, data)


Creating index personaldata and embeddings ...Ok


: 