In [4]:
from IPython.display import Markdown, display

## Loading Documents

In [5]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [6]:
# Point a text loader at the sample file (decoded as UTF-8) and read it
# into `documents`.
loader = TextLoader(
    "data/sample.txt",
    encoding='utf-8',
)
documents = loader.load()

In [7]:
# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=0);
# the resulting `docs` list is what gets embedded and indexed later on.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents)

## Embeddings

In [8]:
from langchain.embeddings import GooglePalmEmbeddings

In [9]:
import os

# Read the PaLM API key from the environment; the value is None when the
# PALM_API_KEY variable is not set.
palm_api_key = os.environ.get("PALM_API_KEY")

In [10]:
# Embedding model handed to the vector store below; it authenticates with the
# key read from PALM_API_KEY.
embeddings = GooglePalmEmbeddings(
    google_api_key=palm_api_key,
)

## Vector Stores
Vector stores let us store our text embeddings so that we can query them later. In this notebook we will be using Pinecone (`pinecone`).

In [11]:
from langchain.vectorstores.pinecone import Pinecone

### Connect to Pinecone

In [12]:
import os
import pinecone

# Initialise the Pinecone client; both settings are taken from the environment.
pinecone.init(
    environment=os.environ.get("PINECONE_ENV"),  # shown next to the API key in the console
    api_key=os.environ.get("PINECONE_API_KEY_02"),  # available at app.pinecone.io
)


In [13]:

index_name = "langchain-demo"

# The index dimension must equal the length of the vectors produced by
# `embeddings`. This notebook's index is 768-dimensional (confirmed by the
# `describe_index_stats()` output further down). For comparison, OpenAI's
# `text-embedding-ada-002` model uses 1536 dimensions.
embedding_dimension = 768

# First, check if our index already exists. If it doesn't, we create it.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric="cosine",
        dimension=embedding_dimension,
    )

# Embed the document chunks and store them in the index.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

# if you already have an index, you can load it like this
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

query = "What is the point of this text?"
# Keep the search hits under their own name: rebinding `docs` here would
# replace the chunk list uploaded above, so re-running this cell would index
# the hits instead of the original chunks.
similar_docs = docsearch.similarity_search(query)

In [14]:
# Inspect the index the documents were uploaded to. Reusing `index_name`
# (instead of repeating the literal) keeps this cell pointed at the same index
# if the name is ever changed.
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.00074,
 'namespaces': {'': {'vector_count': 74}},
 'total_vector_count': 74}

## LLM

In [15]:
from langchain.llms import GooglePalm

In [16]:
# LLM passed to the RetrievalQA chains below; it uses the same API key as the
# embedding model.
llm = GooglePalm(
    temperature=0.7,
    google_api_key=palm_api_key,
)

## RetrievalQA

In [17]:
from langchain.chains import RetrievalQA

In [None]:
# Question put to each RetrievalQA chain in the sections below.
query = "What is the procedure of annotating a vehicle?"

In [18]:
# Retriever view of the vector store; it is passed to every RetrievalQA chain below.
retriever = docsearch.as_retriever()

### Stuff Method
In this section we explore retrieval using the `stuff` method.

In [24]:
# Question-answering chain using the "stuff" chain type, built on the shared
# LLM and retriever. verbose=True logs when the chain starts and finishes.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, verbose=True
)

In [25]:
# Ask the question, then render the chain's answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(data=response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The procedure of annotating a vehicle is as follows:

1. Create a 3D bounding box for the vehicle, based on available LiDAR measurements, in combination with vision based information from the image. If there is no LiDAR data, the 3D bounding box should still be annotated and the vehicle size and position in the world should be estimated according to the table below.
2. Each task will contain around 300 frames and each object should be tracked with a single ID.
3. Each bounding box should be drawn as tightly as possible to the object.
4. Please take the following steps to avoid making errors and to ensure highest quality:
    * Use the most specific label possible for each object.
    * Make sure that the object is completely inside the bounding box.
    * Do not overlap bounding boxes.
    * Do not draw bounding boxes around parts of objects.
5. If the cargo is a vehicle, it should be annotated as a NondescriptVehicle object. Cargo should otherwise not be annotated.
6. For release validation, the cargo should not be annotated, even if the cargo is a vehicle.

## Other Retrieval Methods

### Map Reduce
Gets a response for each (chunk, question) pair, then uses another LLM call to summarize those responses.
Good for recursive document summarization.

- Requires more calls
- Assumes documents (chunks) are independent

In [None]:
# Same chain construction as before, but with the "map_reduce" chain type.
# NOTE: this rebinds `qa_stuff`, replacing the "stuff" chain built earlier;
# the name is kept because the next cell calls `qa_stuff.run(...)`.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, chain_type="map_reduce", retriever=retriever, verbose=True
)

In [None]:
# Ask the question with the chain currently bound to `qa_stuff`, then render
# the answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(data=response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The procedure of annotating a vehicle is as follows:

1. Create a 3D bounding box for the vehicle, based on available LiDAR measurements, in combination with vision based information from the image. If there is no LiDAR data, the 3D bounding box should still be annotated and the vehicle size and position in the world should be estimated according to the table below.
2. Each task will contain around 300 frames and each object should be tracked with a single ID.
3. Each bounding box should be drawn as tightly as possible to the object.
4. Please take the following steps to avoid making errors and to ensure highest quality:
    * Use the most specific label possible for each object.
    * Make sure that the object is completely inside the bounding box.
    * Do not overlap bounding boxes.
    * Do not draw bounding boxes around parts of objects.
5. If the cargo is a vehicle, it should be annotated as a NondescriptVehicle object. Cargo should otherwise not be annotated.
6. For release validation, the cargo should not be annotated, even if the cargo is a vehicle.

### Refine
Iteratively builds the answer from past calls (like a cascade).

- Longer answer
- Takes longer (slower)
- At least as many calls as map reduce.

In [None]:
# Same chain construction as before, but with the "refine" chain type.
# NOTE: this rebinds `qa_stuff` once more; the name is kept because the next
# cell calls `qa_stuff.run(...)`.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, chain_type="refine", retriever=retriever, verbose=True
)

In [None]:
# Ask the question with the chain currently bound to `qa_stuff`, then render
# the answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(data=response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The procedure of annotating a vehicle is as follows:

1. Create a 3D bounding box for the vehicle, based on available LiDAR measurements, in combination with vision based information from the image. If there is no LiDAR data, the 3D bounding box should still be annotated and the vehicle size and position in the world should be estimated according to the table below.
2. Each task will contain around 300 frames and each object should be tracked with a single ID.
3. Each bounding box should be drawn as tightly as possible to the object.
4. Please take the following steps to avoid making errors and to ensure highest quality:
    * Use the most specific label possible for each object.
    * Make sure that the object is completely inside the bounding box.
    * Do not overlap bounding boxes.
    * Do not draw bounding boxes around parts of objects.
5. If the cargo is a vehicle, it should be annotated as a NondescriptVehicle object. Cargo should otherwise not be annotated.
6. For release validation, the cargo should not be annotated, even if the cargo is a vehicle.

### Map Rerank (Experimental)
Makes a single call for each document; each call returns a score, and the answer with the highest score is selected.
- Faster