In [4]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-3.9.0-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.5/249.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.9.0


In [5]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [1]:
from dotenv import load_dotenv,find_dotenv

load_dotenv(find_dotenv())

True

In [6]:
loader = PyPDFLoader("TheWayOfKings.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [7]:
data = loader.load()

In [10]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1687 document(s) in your data
There are 0 characters in your document


In [11]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [12]:
print (f'Now you have {len(texts)} documents')

Now you have 3128 documents


In [13]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [18]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY_CONTATOGZ')
PINECONE_API_ENV = os.getenv('PINECONE_ENV_CONTATOGZ')

In [19]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

In [20]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "the-way-of-kings"

In [21]:
pinecone.create_index(name=index_name, metric='euclidean', dimension=768)
index = pinecone.Index(index_name)

In [22]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [25]:
query = "What is the way of kings?"
docs = docsearch.similarity_search(query)

In [28]:
# Here's an example of the first document that was returned
print(docs)

[Document(page_content='THE WAY OF KINGS', metadata={}), Document(page_content='THE WAY OF KINGS', metadata={}), Document(page_content='This is a work of fiction. All of the characters,\norganizations, and events portrayed in this novel are\neither products of the author’s imagination or are\nused fictitiously.\nTHE WAY OF KINGS\nCopyright © 2010 by Dragonsteel Entertainment,\nLLC\nAll rights reserved.\nInterior illustrations by Isaac Stewart, Ben\nMcSweeney, and Greg Call Edited by Moshe Feder\nA Tor Book\nPublished by Tom Doherty Associates, LLC\n175 Fifth Avenue\nNew York, NY 10010\nwww.tor-forge.com\nTor\n®\n is a registered trademark of Tom Doherty\nAssociates, LLC.', metadata={}), Document(page_content='THE WAY OF KINGS\nBook One of\nTHE STORMLIGHT ARCHIVE\nA TOM DOHERTY ASSOCIATES BOOK • NEW YORK', metadata={})]


In [29]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [33]:

llm = OpenAI(temperature=0.5, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [42]:
query = "What is a Gemheart and how it can it be obtained?"
docs = docsearch.similarity_search(query, k=20)

In [44]:
docs

[Document(page_content='Most gemhearts were harvested quite differently\nthan the one had been today. Sometime during the\nstrange life cycle of the chasmfiends, they sought the\nwestern side of the Plains, where the plateaus were\nwider. They climbed up onto the tops and made a\nrocky chrysalis, waiting for the coming of a\nhighstorm.\nDuring that time, they were vulnerable. You just\nhad to get to the plateau where it rested, break into\nits chrysalis with some mallets or a Shardblade, then\ncut out the gemheart. Easy work for a fortune. And\nthe beasts came frequently, often several times a\nweek, so long as the weather didn’t get too cold.\nDalinar looked up at the hulking carcass. Tiny,\nnear-invisible spren were floating out of the beast’s\nbody, vanishing into the air. They looked like the\ntongues of smoke that might come off a candle after\nbeing snuff ed. Nobody knew what kind of spren they\nwere; you only saw them around the freshly killed\nbodies of greatshells.\nHe shook h

In [None]:
chain.run(input_documents=docs, question=query)

In [40]:
query = "Tell me all the information you can about Kaladin"
docs = docsearch.similarity_search(query)

In [41]:
chain.run(input_documents=docs, question=query)

" Kaladin is a man with a bushy black beard and a glyphward tattoo on each hand. He was once a soldier and was called Stormblessed, but he has since come to believe he has bad luck. He is familiar with Brightlord Dalinar, a highprince and the most honorable Shardbearer in the king's army. He is also learning the ways of the lighteyes and eventually plans to return to his hometown to prove to Roshone, Rillir, and Laral that they were wrong to dismiss him."

## One thing I've noticed is that, yes the prompt keeps getting bigger. And that is concernable, like, financially and performancely speaking. With this is mind to, this may indicate that breaking the content in smaller chunks can be even better and more performant than the current approach. I'm not sure if this is the case, but it's worth a try. I'll try to do this in the next iteration.