In [73]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import openai
import os
from langchain.vectorstores import Chroma


## Load PDF data
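Loads the PDF and splits it into chunks.
Each chunk is one LangChain document holding a piece of a single PDF page (at most 1000 characters, with up to 100 characters of overlap between neighbouring chunks). A page can therefore produce several chunks, and each chunk's metadata records the source file and page number.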

In [47]:
# Reader for the source PDF.
loader = PyPDFLoader("data/document.pdf")

# Splitter settings: chunk_size=1000 with chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)

# Load the PDF and cut it into chunks with the splitter above.
chunks = loader.load_and_split(text_splitter=text_splitter)

In [56]:
# Show the first chunk, then the length of its text.
first_chunk = chunks[0]
print(first_chunk)
print(f"The chunk contains {len(first_chunk.page_content)} characters")

page_content="Can Machines Really Think,Learn, and Act Intelligently?In this post, we're going to define whatmachine learning is and how computersthink and learn. We're also going to look atsome history relevant to the developmentof the intelligent machine.jeffSep 18, 2023\nGenerated using Microsoft Designer\n😎\nThere are so many introductory posts about AI and ML, andyet I decided to write this one. Do you know why? Becausethey're all boring. Not this one though, this one is cool. It alsoincludes an awesome optional quiz to test your AIsuperpowers. But here's the deal, you promise to subscribeif you do the quiz. Okay?" metadata={'source': 'data/document.pdf', 'page': 0}
The chunk contains 609 characters


## Setup models

In [57]:

# Read the .env file so its variables are available via the environment.
load_dotenv()
# Pass the OPENAI_API_KEY value (None if unset) to the openai module.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Embeddings model, default settings.
embeddings = OpenAIEmbeddings()

# Chat model used to generate answers.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.8,
)

In [79]:
# Directory and collection name of the persisted vector store. Both branches
# must use the same values: the load branch has to open the collection that
# the build branch wrote.
CHROMA_DIR = "chroma"
CHROMA_COLLECTION = "lc_chroma_demo"

if os.path.exists(CHROMA_DIR):
    print("Loading Chroma from disk.")
    # Bind the loaded store to chroma_db. The search and retriever cells below
    # use that name, and without this assignment they raise NameError on a
    # fresh kernel whenever the directory already exists.
    chroma_db = Chroma(persist_directory=CHROMA_DIR,
                       embedding_function=embeddings,
                       collection_name=CHROMA_COLLECTION)
else:
    # No store on disk yet: embed the chunks and build the index with
    # persist_directory set so that later runs can take the load branch.
    chroma_db = Chroma.from_documents(documents=chunks,
                                      embedding=embeddings,
                                      persist_directory=CHROMA_DIR,
                                      collection_name=CHROMA_COLLECTION)

Loading Chroma from disk.


In [65]:
# Question reused by the similarity searches and the QA chain below.
query = "What is this document about?"

In [69]:
# Look up the stored chunks closest to the query and show them.
result = chroma_db.similarity_search(query=query)
print(result)

[Document(page_content='📚', metadata={'page': 3, 'source': 'data/document.pdf'}), Document(page_content="sense? Great. If not, have no fear as we're going todive deeper into this topic below, so keep on reading!", metadata={'page': 3, 'source': 'data/document.pdf'}), Document(page_content="Can Machines Really Think,Learn, and Act Intelligently?In this post, we're going to define whatmachine learning is and how computersthink and learn. We're also going to look atsome history relevant to the developmentof the intelligent machine.jeffSep 18, 2023\nGenerated using Microsoft Designer\n😎\nThere are so many introductory posts about AI and ML, andyet I decided to write this one. Do you know why? Becausethey're all boring. Not this one though, this one is cool. It alsoincludes an awesome optional quiz to test your AIsuperpowers. But here's the deal, you promise to subscribeif you do the quiz. Okay?", metadata={'page': 0, 'source': 'data/document.pdf'}), Document(page_content='Do it.Become a me

In [72]:
# Same lookup, but each hit is returned as a (document, score) pair.
result_with_scores = chroma_db.similarity_search_with_score(query=query)
print(result_with_scores)

[(Document(page_content='📚', metadata={'page': 3, 'source': 'data/document.pdf'}), 0.45090146004976395), (Document(page_content="sense? Great. If not, have no fear as we're going todive deeper into this topic below, so keep on reading!", metadata={'page': 3, 'source': 'data/document.pdf'}), 0.47940380145106226), (Document(page_content="Can Machines Really Think,Learn, and Act Intelligently?In this post, we're going to define whatmachine learning is and how computersthink and learn. We're also going to look atsome history relevant to the developmentof the intelligent machine.jeffSep 18, 2023\nGenerated using Microsoft Designer\n😎\nThere are so many introductory posts about AI and ML, andyet I decided to write this one. Do you know why? Becausethey're all boring. Not this one though, this one is cool. It alsoincludes an awesome optional quiz to test your AIsuperpowers. But here's the deal, you promise to subscribeif you do the quiz. Okay?", metadata={'page': 0, 'source': 'data/document.p

In [74]:
# Expose the vector store as a retriever, then wire it to the chat model in a
# "stuff"-type RetrievalQA chain.
retriever = chroma_db.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [77]:
# Run the QA chain on the query and print the returned dict.
response = chain.invoke(input=query)
print(response)

{'query': 'What is this document about?', 'result': 'This document is about machine learning, artificial intelligence, and the development of intelligent machines. It discusses how machines think, learn, and act intelligently, as well as the history of artificial intelligence.'}
