In [1]:
from IPython.display import Markdown, display

from chunker import get_chunks

## Loading Documents

In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter

In [4]:
# Read the World Bank articles text file (UTF-8) through LangChain's TextLoader.
loader = TextLoader(
    "data/world_bank_articles.txt",
    encoding="utf-8",
)
documents = loader.load()

In [16]:

# Split the loaded documents on newlines, requesting chunks of 2000 characters
# with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

Created a chunk of size 2397, which is longer than the specified 2000
Created a chunk of size 2804, which is longer than the specified 2000


### Patch
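The `CharacterTextSplitter` only splits on the separator we give it, so any stretch of text between two separators that is longer than `chunk_size` comes out as an oversized chunk (see the warnings above). Until this can be solved, we will resort to our tried and tested lower-level custom code.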

In [35]:
# Read the whole file into memory as one string.
# NOTE(review): the path ends in "_txt" rather than ".txt" -- confirm the file name.
# NOTE: this rebinds `documents`, which previously held the TextLoader output, to a plain str.
with open("data/imf_article_txt", encoding='utf-8') as f:
    documents = f.read()

In [36]:
# Chunk the raw text with the project's own `get_chunks` helper.
# `tokens=1000` is presumably the per-chunk token budget -- confirm in `chunker`.
docs = get_chunks(
    documents,
    tokens=1000,
)

## Embeddings

In [7]:
from langchain.embeddings import GooglePalmEmbeddings

In [8]:
import os
# Take the PaLM API key from the environment so that it is never written into
# the notebook; the value is None when PALM_API_KEY is not set.
palm_api_key = os.environ.get("PALM_API_KEY")

In [9]:
# Embedding client, given the PaLM API key read from the environment.
embeddings = GooglePalmEmbeddings(
    google_api_key=palm_api_key,
)

## Vector Stores
These allow us to store our text embeddings for querying later. In this notebook we will be using `pinecone`.

In [10]:
from langchain.vectorstores.pinecone import Pinecone

### Connect to Pinecone

In [11]:
import os
import pinecone

# Initialise the Pinecone client. Both settings come from environment
# variables, so no credentials are stored in the notebook.
pinecone.init(
    # API key: find it at app.pinecone.io
    api_key=os.environ.get("PINECONE_API_KEY_03"),
    # Environment name: shown next to the API key in the console
    environment=os.environ.get("PINECONE_ENV"),
)


In [17]:

index_name = "econwiki"

# Create the index only if it does not exist yet. The index dimension must
# equal the size of the vectors the embedding model produces: 768 for the PaLM
# embeddings used in this notebook (OpenAI's `text-embedding-ada-002` would
# need 1536 instead).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, metric="cosine", dimension=768)

# Embed every chunk in `docs` and store it in the index.
# NOTE(review): this runs on every execution of the cell, so re-running it
# presumably stores the same chunks again -- confirm, and switch to the
# `from_existing_index` line below once the index is populated.
# NOTE(review): `from_documents` expects Document objects; if `get_chunks`
# returns plain strings, `Pinecone.from_texts` is the matching call -- confirm.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

# if you already have an index, you can load it like this
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Smoke-test retrieval. The hits get their own name instead of overwriting
# `docs`, so that re-running this cell still indexes the source chunks rather
# than the previous search results.
query = "What is the point of this text?"
matching_docs = docsearch.similarity_search(query)

## LLM

In [None]:
from langchain.llms import GooglePalm

In [None]:
# PaLM text model that the retrieval chain uses to write its answers.
llm = GooglePalm(
    google_api_key=palm_api_key,
    temperature=0.7,
)

## RetrievalQA
In this section we explore retrieval using the `stuff` method.

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Wrap the Pinecone vector store as a retriever, the form RetrievalQA is given below.
retriever = docsearch.as_retriever()

In [None]:
# Question-answering chain: fetches context through `retriever` and answers
# with `llm`, combining the retrieved chunks with the "stuff" chain type.
# verbose=True makes the chain log when it starts and finishes.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Question put to the RetrievalQA chain in the next cell.
query = "What is the point of this text?"

In [None]:
# Run the chain on the question and render the returned answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


This document provides instructions on how to annotate vehicles, pedestrians, and two-wheelers for validation.

## Other Retrieval Methods

### Map Reduce
Gets a response for each (chunk, question) pair, then uses another LLM call to summarize the responses.
Good for recursive document summarization.

- Requires more calls
- Assumes documents (chunks) are independent

### Refine
Iteratively builds the answer from past calls (like a cascade).

- Longer answer
- Takes longer (slower)
- At least as many calls as map reduce.

### Map Rerank (Experimental)
Makes a single call per document, each returning an answer together with a score; the answer with the highest score is selected.
- Faster