In [4]:
import os
import openai
from dotenv import load_dotenv

# Pull settings from a local .env file (if any) into the process environment,
# then hand the key to the openai client. A missing OPENAI_API_KEY raises
# KeyError here, so the failure shows up before any API call is attempted.
load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import (List)

# Embedding model and text splitter, both with library defaults.
embedding = OpenAIEmbeddings()
splitter = RecursiveCharacterTextSplitter()

# One loader per source PDF; paths are relative to the notebook's directory.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in ("../data/autogpt.pdf", "../data/lora.pdf")
]

# Gather everything each loader returns into one flat list, in loader order.
documents = []
for loader in loaders:
    documents += loader.load()

# Split the loaded documents into chunks for embedding/indexing.
chunks = splitter.split_documents(documents)


In [6]:
# Open (or create) the Chroma store that lives in `persist_directory`.
persist_directory = '../docs/chroma/02'
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory
)

# Index the chunks only when the store is still empty, so re-running this
# cell does not insert the same chunks a second time.
if not vectordb.get()['ids']:
    print('persist...')
    # Add to *this* instance. The previous code called the classmethod
    # `from_documents` through the instance and discarded the new store it
    # returned, so `vectordb` itself was never the object being populated.
    vectordb.add_documents(documents=chunks)



In [7]:
# Working with metadata
# Run the same question twice: once over the whole store and once restricted
# to chunks whose `source` metadata is the Auto-GPT PDF, then print the
# metadata of the results side by side.
question = "what about the persistence boost"
docs = vectordb.similarity_search(question, k=3)
auto_gpt_docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source": "../data/autogpt.pdf"},
)

# zip() stops at the shorter result list, so a search that returns fewer than
# k hits no longer raises IndexError (the old loop indexed 0..2 blindly).
for unfiltered_doc, filtered_doc in zip(docs, auto_gpt_docs):
    print(unfiltered_doc.metadata)
    print('++++')
    print(filtered_doc.metadata)
    print('\n')
    

{'page': 5, 'source': '../data/autogpt.pdf'}
++++
{'page': 5, 'source': '../data/autogpt.pdf'}


{'page': 2, 'source': '../data/lora.pdf'}
++++
{'page': 4, 'source': '../data/autogpt.pdf'}


{'page': 4, 'source': '../data/lora.pdf'}
++++
{'page': 2, 'source': '../data/autogpt.pdf'}




In [32]:
# Working with metadata using selfquery
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.llms import OpenAI

# Describe the metadata fields the query-constructing LLM is allowed to
# filter on. The descriptions are part of the prompt, so they are kept verbatim.
source_field = AttributeInfo(
    name="source",
    description="The paper the chunk is from, should be one of `../data/autogpt.pdf`, `../data/lora.pdf`",
    type="string",
)
page_field = AttributeInfo(
    name="page",
    description="The page from the paper",
    type="integer",
)
metadata_field_info = [source_field, page_field]

# temperature=0 is requested so query construction is as repeatable as the
# model allows.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Short description of what the stored documents contain.
document_content_description = "LLM Papers"

# Retriever that asks the LLM to turn a natural-language question into a
# search string plus a metadata filter over the fields declared above.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)


In [36]:

# Known-bad variant, kept for reference only (noted by the author as causing
# an error when a page constraint is included):
#   'What did they say about methodology in the autogpt paper page 1 ?'
# It used to be assigned to `query` and then immediately overwritten by the
# line below, i.e. a dead store that never reached the retriever.
query = 'What did they say about methodology in the autogpt paper ?'

# Show a short preview (first 20 characters) and the metadata of each hit so
# the effect of the LLM-generated filter is visible.
docs = retriever.get_relevant_documents(query)
for d in docs:
    print(d.page_content[:20])
    print(d.metadata)




3.4 Additional opini
{'page': 3, 'source': '../data/autogpt.pdf'}
such as self-consist
{'page': 0, 'source': '../data/autogpt.pdf'}
to enhance its decis
{'page': 5, 'source': '../data/autogpt.pdf'}
and ALFWorld experim
{'page': 5, 'source': '../data/autogpt.pdf'}


In [37]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Args:
        docs: iterable of objects exposing a string `page_content` attribute.

    Each entry is rendered as "Document N:" (numbered from 1), a blank line,
    then the text. Entries are separated by a line of 100 dashes, and the
    whole thing is emitted with a single print() call.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

# LLM-backed compressor, built from the same `llm` used for the self-query
# retriever.
compressor = LLMChainExtractor.from_llm(llm=llm)

# Wrap the plain vector-store retriever so that whatever it returns is passed
# through the compressor before being handed back.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [38]:
# Same question as the self-query cell, this time through the compression
# retriever; print the results with the numbered-document helper.
query = "What did they say about methodology in the autogpt paper ?"
compressed_docs = compression_retriever.get_relevant_documents(query)
pretty_print_docs(compressed_docs)

Document 1:

"Our experimental results highlight the successful adaptation
of the Auto-GPT styled agent to complex online decision-
making tasks through straightforward prompt design, sur-
passing IL-based baseline models specifically designed for
these tasks. Among the foundational LLMs powering Auto-
GPT, GPT-4 demonstrates superior performance. Addition-
ally, we introduce an innovative strategy of incorporat-
ing additional opinions from external expert models, fur-
ther enhancing the decision-making capabilities of Auto-
GPT styled agents, particularly benefiting GPT-4. Our Addi-
tional Opinions algorithm provides a lightweight supervised
training approach for Auto-GPT styled agents, enabling
improved performance without requiring extensive fine-
tuning of the LLMs."
----------------------------------------------------------------------------------------------------
Document 2:

- "2 METHODOLOGY"
- "2.1 Tasks and baseline models"
- "2.1.1 WebShop. Webshop [ 24] is a simulated envi