In [1]:
import os
import time

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### HUGGING FACE EMBEDDINGS
# Embedding model shared by the rest of the notebook.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Measure the vector length by embedding a throwaway query rather than
# hard-coding it, so the value always follows the model selected above.
DIMENSION = len(embed_model.embed_query("hola"))

In [3]:
### LOAD PDFS
def read_doc(directory):
    """Load the PDF documents found in ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of ``PyPDFDirectoryLoader(directory).load()``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist or is not a
            directory.
    """
    # Fail loudly on a wrong path: the loader only globs the folder, so a
    # missing directory would otherwise look like "no PDFs" and the caller
    # would carry on with an empty document list.
    if not os.path.isdir(directory):
        raise FileNotFoundError(
            f"PDF directory not found or not a directory: {directory}"
        )

    loader = PyPDFDirectoryLoader(directory)
    return loader.load()

In [4]:
# One entry per person; each name is iterated over by the ingestion loop.
people = [
    "beatriz",
    "carlos",
    "kevin",
]

In [5]:
### PINECONE INIT
# The API key is read from the environment so it never lives in the notebook.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Serverless target: an unset *or empty* variable falls back to the default,
# which is why `or` is used instead of a `get` default argument.
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"
spec = ServerlessSpec(region=region, cloud=cloud)

In [6]:
### CREATE INDEX
index_name = "cv-index"

# Start from a clean slate: an index that already has this name is deleted
# first, which discards every vector it contained.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("Index borrado:", index_name)

pc.create_index(
    name=index_name,
    dimension=DIMENSION,
    metric="cosine",
    spec=spec
)

# Wait until Pinecone reports the new index as ready, so that upserts issued
# by later cells do not hit an index that is still initializing. The deadline
# keeps the cell from hanging forever if the index never becomes ready.
ready_deadline = time.monotonic() + 120  # seconds
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(f"Pinecone index '{index_name}' was not ready after 120 s")
    time.sleep(1)
print("Index creado:", index_name)

Index borrado: cv-index
Index creado: cv-index


In [7]:
# Root folder that holds one sub-folder of PDFs per person. It can be
# overridden with the CV_DOCS_DIR environment variable; when that is unset or
# empty the original local path is used.
docs_root = os.getenv("CV_DOCS_DIR") or r"C:\Users\Kevin\Desktop\repos\propios\nlp2\tp3\docs"

# The splitter settings do not depend on the person, so build it once.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=50
)

for person_name in people:
    # `person_dir` rather than `dir`, to avoid shadowing the builtin.
    person_dir = os.path.join(docs_root, person_name)
    docs = read_doc(person_dir)
    documents = splitter.split_documents(docs)

    # Nothing to embed: say so instead of reporting a successful insert.
    if not documents:
        print(f"Sin documentos para {person_name}; no se insertan vectores")
        continue

    ### UPSERT
    # Each person's chunks go into their own namespace of the shared index.
    docsearch = PineconeVectorStore.from_documents(
        documents=documents,
        index_name=index_name,
        embedding=embed_model,
        namespace=person_name
    )
    print(f"Vectores insertados para {person_name}")

    # NOTE(review): presumably a short pause to pace requests between
    # people — confirm whether it is still needed.
    time.sleep(1)

Vectores insertados para beatriz
Vectores insertados para carlos
Vectores insertados para kevin
