In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-3.9.0-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.5/249.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.9.0


In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [2]:
from dotenv import load_dotenv,find_dotenv

load_dotenv(find_dotenv())

True

In [2]:
loader = PyPDFLoader("TheWayOfKings.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [3]:
data = loader.load()

In [4]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1687 document(s) in your data
There are 0 characters in your document


In [8]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=95, chunk_overlap=20)
texts = text_splitter.split_documents(data)

Tryied with bigger chunks and smaller chunks, different opinions. Not conclusive results, people argue that between 500 and 1000 is the best choice. Using windows for content is a better approach.

In [19]:
print (f'Now you have {len(texts)} documents')

Now you have 34733 documents


In [18]:
texts

[Document(page_content='THE WAY OF KINGS', metadata={'source': 'TheWayOfKings.pdf', 'page': 1}),
 Document(page_content='TOR BOOKS BY BRANDON SANDERSON\nWarbreaker\nThe Mistborn Trilogy\nMistborn\nThe Well of Ascension', metadata={'source': 'TheWayOfKings.pdf', 'page': 2}),
 Document(page_content='The Hero of Ages\nElantris', metadata={'source': 'TheWayOfKings.pdf', 'page': 2}),
 Document(page_content='BRANDON SANDERSON', metadata={'source': 'TheWayOfKings.pdf', 'page': 3}),
 Document(page_content='THE WAY OF KINGS\nBook One of\nTHE STORMLIGHT ARCHIVE\nA TOM DOHERTY ASSOCIATES BOOK • NEW YORK', metadata={'source': 'TheWayOfKings.pdf', 'page': 4}),
 Document(page_content='For Emily,\n     Who is too patient\n             Too kindly\n                 And too wonderful', metadata={'source': 'TheWayOfKings.pdf', 'page': 5}),
 Document(page_content='For words.\n         But I try anyway.', metadata={'source': 'TheWayOfKings.pdf', 'page': 5}),
 Document(page_content='ACKNOWLEDGMENTS\nI finis

In [11]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [12]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY_CONTATOGZ')
PINECONE_API_ENV = os.getenv('PINECONE_ENV_CONTATOGZ')

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

In [14]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY, 
    environment=PINECONE_API_ENV  
)
index_name = "the-way-of-kings"

In [17]:
pinecone.create_index(name=index_name, metric='euclidean', dimension=768)
index = pinecone.Index(index_name)

In [20]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [23]:
query = "What is the way of kings?"
docs = docsearch.similarity_search(query, k=75)

In [24]:
docs

[Document(page_content='THE WAY OF KINGS', metadata={}),
 Document(page_content='THE WAY OF KINGS', metadata={}),
 Document(page_content='The Way of\nKings\n. But I didn’t understand something. Nohadon', metadata={}),
 Document(page_content='The Way of Kings,\n and that is only because of the\nVanrial’s efforts.”', metadata={}),
 Document(page_content='The Way of Kings\n, “‘I made the trip and—as the', metadata={}),
 Document(page_content='THE WAY OF KINGS\nCopyright © 2010 by Dragonsteel Entertainment,\nLLC\nAll rights reserved.', metadata={}),
 Document(page_content='The Way of\nKings\n. It would disagree strongly with the things\nSadeas was implying.', metadata={}),
 Document(page_content='make the point that kings should consider the\nconsequences of their commands?”', metadata={}),
 Document(page_content='Kill, destroy, and cut your way to the king. Be\nseen doing it. Leave witnesses. Wounded but', metadata={}),
 Document(page_content='book called \nThe Way of Kings\n. Gavilar fav

In [25]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [26]:

llm = OpenAI(temperature=0.5, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [45]:
query = "Define and detail what is a highstorm?"
docs = docsearch.similarity_search(query, k=80)

In [47]:
chain.run(input_documents=docs, question=query)

' A highstorm is a powerful storm that typically occurs in the world of Roshar. It is characterized by strong winds, heavy rains, and large amounts of Stormlight. Highstorms usually move from east to west, and can cause flooding and destruction. They are predicted by stormwardens and can be temporarily shielded from by nearby rock formations.'

In [40]:
query = "Tell me all the information you can about Kaladin"
docs = docsearch.similarity_search(query)

In [41]:
chain.run(input_documents=docs, question=query)

" Kaladin is a man with a bushy black beard and a glyphward tattoo on each hand. He was once a soldier and was called Stormblessed, but he has since come to believe he has bad luck. He is familiar with Brightlord Dalinar, a highprince and the most honorable Shardbearer in the king's army. He is also learning the ways of the lighteyes and eventually plans to return to his hometown to prove to Roshone, Rillir, and Laral that they were wrong to dismiss him."

## One thing I've noticed is that, yes the prompt keeps getting bigger. And that is concernable, financially and performancely speaking. With this is mind too, this may indicate that breaking the content in smaller chunks can be even better and more performant than the current approach. I'm not sure if this is the case, but it's worth a try. I'll try to do this in the next iteration.

[Question: Why ChatGPT Plus (GPT-4) answers better than local Langchain + Pinecone Tests?](https://www.reddit.com/r/LangChain/comments/13o5m15/question_why_chatgpt_plus_gpt4_answers_better/)