In [None]:
%pip install langchain
%pip install tiktoken
%pip install openai
%pip install pypdf
%pip install pinecone-client

In [2]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

Could not import azure.core python package.


### Load your data

In [3]:
# Point a PyPDFLoader at the PDF that will be indexed (relative path).
loader = PyPDFLoader("../data/field-guide-to-data-science.pdf")

## Other options for loaders
# NOTE: these classes are not imported in this notebook — import them first
# (presumably from langchain.document_loaders, like PyPDFLoader; verify).
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [4]:
# Run the loader; the result is an indexable collection of documents
# (one per page when PyPDFLoader is used).
data = loader.load()

In [5]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')

# Show the size of one sample document. Index 30 is used when the PDF is long
# enough; shorter PDFs fall back to their last document instead of raising
# IndexError, and an empty load skips the sample entirely.
if data:
    print (f'There are {len(data[min(30, len(data) - 1)].page_content)} characters in your document')

You have 126 document(s) in your data
There are 2812 characters in your document


### Chunk your data up into smaller documents

In [6]:
# PyPDFLoader has already split the PDF by page, so this is a second, finer split.
# It is optional — try it with and without on your own data.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=10,
)

# Break every loaded document into smaller chunk documents.
texts = text_splitter.split_documents(data)

In [7]:
# Report how many chunk documents the splitter produced.
print (f'Now you have {len(texts)} documents')

Now you have 1222 documents


### Create embeddings of your documents to get ready for semantic search

In [8]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [9]:
# Read credentials from environment variables; if one is not set, prompt for it
# (input is not echoed). Never paste real keys into the notebook: anything
# committed here is exposed to every reader. Any key that was ever stored in
# this file should be revoked and rotated.
from getpass import getpass

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
# The environment name is not a secret, so a default value is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-west4-gcp-free') # You may need to switch with your env

In [10]:
# OpenAI embeddings wrapper, authenticated with the OpenAI key; it is handed to
# Pinecone below when the index is populated.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [11]:
# Connect the Pinecone client. Both values come from the console at app.pinecone.io
# (the environment is shown next to the API key).
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use — replace with your own index name.
index_name = "langchain"

In [12]:
# Embed every chunk and store it in the Pinecone index.
# from_documents is used instead of from_texts([t.page_content ...]) so that each
# chunk's metadata is stored alongside its text rather than discarded
# (for PyPDFLoader this is presumably the source file and page number — verify).
# NOTE(review): re-running this cell likely inserts the same chunks again; once the
# index is populated, prefer the from_existing_index line below — confirm.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

# if you already have an index, you can load it like this
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [40]:
# Ask a natural-language question and run a similarity search for it against the index.
query = "What are some top quotes you can list relating to data science?"
docs = docsearch.similarity_search(query)

In [41]:
# Show how many documents the similarity search returned
print(f"Number of docs retrieved: {len(docs)}")

Number of docs retrieved: 4


### Query those docs to get your answer back

In [20]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback

In [43]:

# Imported here so this cell works on a fresh kernel (Restart & Run All).
# Without it, ChatOpenAI is only defined by a cell further down the notebook and
# this cell raises NameError when the notebook is run top to bottom.
from langchain.chat_models import ChatOpenAI

# The callback object records usage for the OpenAI calls made inside the block;
# printing it afterwards reports tokens used and cost.
with get_openai_callback() as cb:
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    chain = load_qa_chain(llm, chain_type="stuff")
    # Re-run the search so this cell does not depend on `docs` from an earlier cell.
    docs = docsearch.similarity_search(query)

    print(chain.run(input_documents=docs, question=query))

    print(cb)


Here are some quotes related to data science from the given context:

- "In the jungle of data, don't miss the forest for the trees, or the trees for the forest." - Paul Yacci
- "I treat Data Science like I do rock." - Stephanie Rivera
- "Data Science is about formally analyzing everything around you and becoming data-driven." - @ekohlwey
- "The best way to predict the future is to have your data tell you what it is." - @fchollet
Tokens Used: 329
	Prompt Tokens: 223
	Completion Tokens: 106
Successful Requests: 1
Total Cost (USD): $0.000658


### Using Conversational Chain

In [39]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

# Conversation state. Note that `memory` is created but not passed to the chain
# below; the running history is tracked manually in `chat_history` as
# (question, answer) tuples.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chat_history = []

with get_openai_callback() as cb:
    qa = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY),
        docsearch.as_retriever(),
    )

    # Ask the questions in order. Each answer is appended to chat_history before
    # the next question is sent, so the follow-up can refer to the previous answer.
    for query in (
        "What are some top quotes you can list relating to data science?",
        "What is your top quote from from the list and why?",
    ):
        result = qa({"question": query, "chat_history": chat_history})
        chat_history.append((query, result["answer"]))
        print(result["answer"])

    # Usage summary for all OpenAI calls made inside this block.
    print(cb)

Here are some top quotes related to data science:

1. "In God we trust. All others must bring data." - W. Edwards Deming
2. "Data is the new oil." - Clive Humby
3. "Without big data analytics, companies are blind and deaf, wandering out onto the web like deer on a freeway." - Geoffrey Moore
4. "Data science is not a magic wand that can wave away all complexity, but it is a powerful tool for gaining insights from data." - Cathy O'Neil
5. "Data is a precious thing and will last longer than the systems themselves." - Tim Berners-Lee
6. "Data science is about using data to create as much impact as possible for your company." - Hilary Mason
7. "The goal is to turn data into information, and information into insight." - Carly Fiorina
8. "Data scientists are like artists, but instead of painting or sculpting, we create models and algorithms." - Jake Porway
9. "Data science is the sexiest job of the 21st century." - Hal Varian
10. "Data is the new science. Big data holds the answers." - Pat Ge