In [None]:
# %pip install python-decouple langchain_community pypdf langchain_pinecone langchain_openai pinecone langchain

In [1]:
from decouple import config
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from langchain_community.document_loaders import PyPDFDirectoryLoader

from langchain_pinecone import PineconeVectorStore
import time

In [2]:
# Credentials are looked up by name through python-decouple's `config`
# so that no secret value is written into the notebook itself.
OPENAI_API_KEY = config("OPENAI_API_KEY")
PINECONE_API_KEY = config("PINECONE_API_KEY")

In [3]:
def load_documents(directory):
    """Load the PDF files found under ``directory``.

    Args:
        directory: Folder path handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of ``PyPDFDirectoryLoader(directory).load()``, i.e. the
        loaded documents.
    """
    return PyPDFDirectoryLoader(directory).load()

def chunk_data(docs, chunk_size=800, overlap=100):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split (as returned by ``load_documents``).
        chunk_size: Maximum chunk size passed to the splitter
            (presumably measured in characters -- confirm against the
            splitter's length function).
        overlap: Amount of content shared between consecutive chunks,
            forwarded as ``chunk_overlap``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(docs)


# Load the PDFs located next to the notebook and report how many
# documents the loader produced.
documents = load_documents(directory='./')
print(f"tamano de documentos: {len(documents)}")

# Split with the default chunk size / overlap and report the chunk count.
chunks = chunk_data(docs=documents)
print(f"tamano de chunks: {len(chunks)}")


tamano de documentos: 10
tamano de chunks: 41


In [4]:
# Embedding model shared by indexing and querying. The Pinecone index in
# this notebook is created with dimension=1536, which has to match the
# vector size this model returns.
model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [5]:
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "test-index"

# Create the serverless index only when it does not exist yet, so the cell
# can be re-executed without failing on an "already exists" error.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the size of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# `create_index` returns before the index can serve requests; poll until
# Pinecone reports it as ready so the following upsert does not hit an
# index that is still initializing.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

In [7]:
# Namespace under which this document set is stored inside the index.
namespace = "empresa-1"

# Embed every chunk and store the vectors in the Pinecone index.
# NOTE(review): re-running this cell presumably upserts the same chunks
# again under new ids, duplicating vectors -- confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(
    index_name=index_name,
    namespace=namespace,
    embedding=embeddings,
    documents=chunks,
)

# Short pause before the index statistics are queried.
time.sleep(1)

In [9]:
# Sanity check: show the per-namespace vector counts of the index.
index = pc.Index(index_name)
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'empresa-1': {'vector_count': 41}},
 'total_vector_count': 41}


In [10]:
# Llama-2 chat delimiters: instruction wrapper and system-prompt wrapper.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


def get_prompt(instruction, new_system_prompt):
    """Assemble a single prompt string from a system prompt and an instruction.

    Args:
        instruction: User-turn text (may contain template placeholders).
        new_system_prompt: System message to embed between the system tags.

    Returns:
        ``B_INST + B_SYS + new_system_prompt + E_SYS + instruction + E_INST``.
    """
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)

# System message: restricts the model to answering from the retrieved context.
sys_prompt = """You are a helpful, respectful, and honest assistant. Always provide the most helpful and accurate answer using only the contextual text provided. Do not add any information that is not in the context.

Guidelines:

1. **Relevance**: Only answer questions based on the provided context. Do not use outside knowledge.
2. **Honesty**: If a question does not make sense or is factually incoherent, explain why instead of providing incorrect information.
3. **Integrity**: If you don't know the answer based on the context, do not share false information.
4. **Scope**: If the question is outside the context, inform the user politely that it cannot be answered accurately.
5. **Clarity**: Ensure your answers are clear and concise, avoiding ambiguity or vagueness.
6. **Finality**: Answer the question directly and do not include any additional text after the answer.

Ensure that your answers are clear and concise, avoiding ambiguity or vague responses."""

# User-turn template. The original text contained the literal characters
# "/n" (a typo for the newline escape), which were sent to the model as-is;
# real line breaks are used here instead.
instruction = """CONTEXT:

{context}

Question: {question}"""

# Combine system prompt and instruction, then expose the two placeholders
# that the "stuff" chain fills in: the retrieved context and the question.
prompt_template = get_prompt(instruction, sys_prompt)
llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": llama_prompt}

In [11]:
# Chat model that writes the final answer; temperature 0.0 is requested.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Retrieve the 3 most similar chunks from the vector store for each query.
retriever = docsearch.as_retriever(search_kwargs={"k": 3})

# "stuff" chain with the custom prompt; the source documents are returned
# alongside the answer so they can be listed to the user.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [12]:
def process_llm_response(llm_response):
    """Print the answer of a QA chain followed by the sources it used.

    Args:
        llm_response: Mapping with a ``'result'`` entry (the answer text)
            and, optionally, a ``'source_documents'`` entry holding objects
            that expose a ``metadata`` dict.

    Page numbers that come back as integral floats (e.g. ``1.0``) are
    printed as integers. Missing ``page`` / ``source`` metadata is shown as
    ``desconocida`` instead of raising ``KeyError``.
    """
    print(llm_response['result'])
    print('\n\nFuentes:')
    for source in llm_response.get("source_documents", []):
        metadata = source.metadata
        # Read the page from the metadata; the vector store may hand numeric
        # metadata back as float, so normalise 0.0 -> 0 for display.
        page_number = metadata.get('page', 'desconocida')
        if isinstance(page_number, float) and page_number.is_integer():
            page_number = int(page_number)
        print(f"Fuente: {metadata.get('source', 'desconocida')}, Pagina: {page_number}")

In [14]:
# Example question answered from the indexed PDFs.
query1 = "Que es la primera guerra mundial?"

# The question is passed under the chain's "query" input key.
respoonse = qa_chain.invoke({"query": query1})
process_llm_response(respoonse)

La Primera Guerra Mundial fue un conflicto que supuso un cambio en la forma de combatir y en el pensamiento del mundo moderno. Se caracterizó por el uso de nuevas tecnologías como bombas aéreas, sistemas de detección y destrucción, y gases venenosos, entre otros. Alemania fue uno de los países involucrados y resultó devastada al finalizar la guerra, siendo obligada a aceptar las condiciones impuestas por los vencedores en el tratado de Versalles.


Fuentes:
Fuente: historia.pdf, Pagina: 0.0
Fuente: historia.pdf, Pagina: 0.0
Fuente: historia.pdf, Pagina: 1.0
