In [None]:
import os
import time

from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_classic.chains.retrieval import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone, ServerlessSpec

In [None]:
### GROQ LLM
# The API key comes from the environment so it is never written into the notebook.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Chat client shared by the question-answering chain further down.
chat = ChatGroq(
    model_name="llama-3.1-8b-instant",
    groq_api_key=GROQ_API_KEY,
    streaming=True,
    temperature=0,
)

In [None]:
### HUGGINGFACE EMBEDDINGS
# Embedding model handed to the Pinecone vector store in the later cells.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed a throwaway string and report the vector length as a sanity check.
sample_vector = embed_model.embed_query("hola")
print("embedding dim:", len(sample_vector))

In [None]:
### LOAD PDFS
def read_doc(directory):
    """Run ``PyPDFDirectoryLoader`` over ``directory`` and return the result of its ``load()``.

    Args:
        directory: Path of the folder handed to the loader.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` produces.
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Folder holding the CV PDFs. Set the CV_DOCS_DIR environment variable to point
# somewhere else; when it is unset or empty the original local path is used.
# Named DOCS_DIR rather than `dir` so the built-in dir() is not shadowed.
DOCS_DIR = os.getenv("CV_DOCS_DIR") or r"C:\Users\Kevin\Desktop\repos\propios\nlp2\tp2\docs\cv"

# Fail early with a clear message if the folder is missing.
if not os.path.isdir(DOCS_DIR):
    raise FileNotFoundError(f"PDF directory not found: {DOCS_DIR}")

docs = read_doc(DOCS_DIR)

In [None]:
### SPLITS
# Cut the loaded documents into chunks of up to 3000 characters with a 50-character overlap.
# NOTE(review): the all-MiniLM-L6-v2 model card says input beyond 256 word pieces is
# truncated, so chunks this large are probably only partially embedded — confirm chunk_size.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=3000)
documents = splitter.split_documents(docs)

In [None]:
### PINECONE INIT
# Index coordinates shared by the cells below.
index_name = "kajachuan"
namespace = "nlp2"
DIMENSION = 384  # HuggingFace MiniLM

# Client built with the API key taken from the environment.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Serverless placement: environment values win; unset or empty ones fall back
# to aws / us-east-1.
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"
spec = ServerlessSpec(region=region, cloud=cloud)

In [None]:
### CREATE INDEX
# Start from a clean slate: drop any index that already has this name, then
# create it again with the configured dimension and cosine metric.
existing_indexes = pc.list_indexes().names()
if index_name in existing_indexes:
    pc.delete_index(index_name)
    print("Index borrado:", index_name)

pc.create_index(name=index_name, dimension=DIMENSION, metric="cosine", spec=spec)
print("Index creado:", index_name)

In [None]:
### UPSERT
# Embed every chunk and write the vectors into the target namespace.
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    index_name=index_name,
    embedding=embed_model,
    namespace=namespace
)
print("Vectores insertados")

# Pinecone is eventually consistent, so a fixed one-second sleep may not be
# enough for the new vectors to become queryable. Poll the namespace's vector
# count instead, with an upper bound so the cell can never hang forever.
expected_vectors = len(documents)
wait_deadline = time.monotonic() + 60  # seconds
stats_index = pc.Index(index_name)
while time.monotonic() < wait_deadline:
    stats = stats_index.describe_index_stats()
    ns_summary = (stats.namespaces or {}).get(namespace)
    if ns_summary is not None and ns_summary.vector_count >= expected_vectors:
        break
    time.sleep(1)

In [None]:
### RETRIEVER + QA
# Attach to the existing index by name so this cell does not depend on the
# object returned by the upsert cell.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embed_model,
    namespace=namespace
)

# System instructions; {context} is the placeholder the documents chain fills
# with the retrieved chunks.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
# Chain wiring: retriever -> stuff the documents into the prompt -> LLM.
combine_docs_chain = create_stuff_documents_chain(llm=chat, prompt=prompt)
retrieval_chain = create_retrieval_chain(vectorstore.as_retriever(), combine_docs_chain)

# The original literal was mojibake ("Cu√°l"); restored to the intended "á".
query = "Cuál es la experiencia de Kevin?"
result = retrieval_chain.invoke({"input": query})

print(result["answer"])
