In [1]:
from decouple import config
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from langchain_community.document_loaders import PyPDFDirectoryLoader

from langchain_pinecone import PineconeVectorStore
import time

In [2]:
# Service credentials are looked up through decouple's config() rather than
# being written into the notebook.
OPENAI_API_KEY = config('OPENAI_API_KEY')
PINECONE_API_KEY = config('PINECONE_API_KEY')

In [7]:
def load_documents(directory):
    """Load the PDF documents found under ``directory``.

    Args:
        directory: Path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call produces.
    """
    return PyPDFDirectoryLoader(directory).load()

def chunk_data(docs, chunk_size=800, overlap=100):
    """Split ``docs`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)


# Load the PDFs from the current directory and report how many documents
# the loader returned.
documents = load_documents(directory='./')
print(f"tamano de documentos: {len(documents)}")

# Split the loaded documents with the default chunk settings and report
# the resulting chunk count.
chunks = chunk_data(docs=documents)
print(f"tamano de chunks: {len(chunks)}")


tamano de documentos: 19
tamano de chunks: 89


In [3]:
# OpenAI embedding client used to embed the document chunks.
model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

In [11]:
def calculate_embeddings(chunks, embeddings):
    """Embed every chunk and pair each vector with the chunk's metadata.

    All chunk texts are sent to the embedding client in a single
    ``embed_documents`` call instead of one ``embed_query`` call per chunk.
    ``embed_documents`` is the document-side entry point of the embeddings
    interface, and batching avoids one round-trip per chunk.

    Args:
        chunks: Iterable of objects exposing ``page_content`` (the text to
            embed) and ``metadata`` (a dict).
        embeddings: Client exposing ``embed_documents(texts)``, expected to
            return one vector per input text, in input order.

    Returns:
        A list, in chunk order, of dicts ``{'embedding': vector,
        'metadata': metadata}``. ``metadata`` is a shallow copy of the
        chunk's metadata with the chunk text added under ``'text'``.

    Raises:
        ValueError: If the client returns a different number of vectors
            than texts it was given.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to embed; do not call the client with an empty batch.
        return []

    texts = [chunk.page_content for chunk in chunks]
    vectors = embeddings.embed_documents(texts)
    if len(vectors) != len(texts):
        raise ValueError(
            f"expected {len(texts)} embeddings, got {len(vectors)}"
        )

    chunk_embeddings = []
    for chunk, text, vector in zip(chunks, texts, vectors):
        metadata = chunk.metadata.copy()  # keep the chunk's own metadata untouched
        metadata['text'] = text  # store the source text next to the vector
        chunk_embeddings.append({
            'embedding': vector,
            'metadata': metadata
        })
    return chunk_embeddings

# Embed all chunks produced by the splitting step with the client configured above.
chunk_embeddings = calculate_embeddings(chunks=chunks, embeddings=embeddings)

In [12]:
import json

def save_embeddings_to_json(chunk_embeddings, filename):
    """Write ``chunk_embeddings`` to ``filename`` as JSON.

    The file is opened in ``'w'`` mode, so existing content is replaced.

    Args:
        chunk_embeddings: JSON-serializable object to store.
        filename: Path of the file to write.
    """
    with open(filename, mode='w') as out_file:
        json.dump(chunk_embeddings, fp=out_file)

# Persist the computed embeddings to a local JSON file.
save_embeddings_to_json(chunk_embeddings, filename='embeddings.json')

In [13]:
def load_embeddings_from_json(filename):
    """Read ``filename`` and return its parsed JSON content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file.
    """
    with open(filename, mode='r') as in_file:
        payload = json.load(fp=in_file)
    return payload

# Read the embeddings back from the JSON file written earlier.
loaded_embeddings = load_embeddings_from_json(filename='embeddings.json')

In [14]:
def save_to_pinecone(loaded_embeddings, index_name, namespace):
    """Upsert precomputed embeddings into a Pinecone serverless index.

    If ``index_name`` does not exist yet it is created (cosine metric,
    AWS ``us-east-1`` serverless spec) and the function polls until the
    index reports ready. Records are then upserted in batches of 100.
    Each vector id is the string form of the record's position in
    ``loaded_embeddings``.

    Args:
        loaded_embeddings: Sequence of dicts with an ``'embedding'`` vector
            and a ``'metadata'`` dict.
        index_name: Name of the Pinecone index to create or reuse.
        namespace: Namespace the vectors are written to.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    if index_name not in pc.list_indexes().names():
        # Size the new index from the data so it matches whatever embedding
        # model produced the vectors; 1536 is kept as the fallback when
        # there are no records to inspect.
        if loaded_embeddings:
            dimension = len(loaded_embeddings[0]['embedding'])
        else:
            dimension = 1536
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        # Wait for the index to be initialized.
        # NOTE(review): this poll has no timeout.
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # Connect to the index.
    index = pc.Index(index_name)
    # Short pause kept from the original flow; presumably gives the
    # connection a moment to settle -- TODO confirm it is still needed.
    time.sleep(1)

    batch_size = 100
    for start in range(0, len(loaded_embeddings), batch_size):
        batch = loaded_embeddings[start:start + batch_size]
        vectors = [
            (str(start + offset), item['embedding'], item['metadata'])
            for offset, item in enumerate(batch)
        ]
        index.upsert(vectors=vectors, namespace=namespace)

# Target index and namespace for the upload, followed by the upload itself.
index_name = "test-index"
namespace = "amazon"
save_to_pinecone(loaded_embeddings, index_name=index_name, namespace=namespace)

Test

In [5]:

# Same embedding model and client configuration as the indexing section,
# so the query vector is produced by the model that embedded the chunks.
model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

# Sample question, embedded into the vector used for the index query.
query = "Que es la SS?"
xq = embeddings.embed_query(query)

In [11]:
# Query the index with the embedded question (`xq`) and print the three
# closest matches together with their stored metadata.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "test-index"
namespace = "amazon"

# Connect to the index.
index = pc.Index(index_name)
time.sleep(1)

res = index.query(
    namespace=namespace,
    vector=xq,
    top_k=3,
    # Use the Python SDK's documented snake_case keyword instead of the
    # camelCase `includeMetadata` spelling.
    include_metadata=True,
)

print(res)

{'matches': [{'id': '13',
              'metadata': {'page': 2.0,
                           'source': 'historia.pdf',
                           'text': 'Holocaust Memorial Museum, Organization of '
                                   'SS and german police presented at \n'
                                   'Nuremberg trial, Diciembre 20, 1945 y '
                                   'Enero 2, 1946)  \n'
                                   '• La SS:  \n'
                                   'La Schutzstaffel  o escuadras de protecc '
                                   'ión, de acuerdo con United States \n'
                                   'Holocaust Memorial Museum: “Se hicieron '
                                   'cargo de la seguridad, la identificación '
                                   'del origen étnico, la política de '
                                   'establecimiento demográfico, y la '
                                   'recopilación y el \n'
                                