# Chat with your PDF files
Without including the source files
___

Installing required libraries

In [1]:
#!pip install openai langchain pypdf chromadb docarray

In [2]:
import pandas as pd

I saved my OpenAI API key in a CSV file (`keys.csv`, in a column named `key`) so that it is not hardcoded in the notebook.

In [3]:
# Read the key file; a later cell takes the API key from row 0 of its "key" column.
k = pd.read_csv(filepath_or_buffer="keys.csv")

In [4]:
import os
import openai

# Export the key (row 0 of the "key" column) through the environment,
# then hand the same value to the openai client module.
os.environ["OPENAI_API_KEY"] = k["key"][0]  ## <- REPLACE WITH YOUR OWN OPEN AI KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

Document Loading

In [5]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF into a list of documents; each one carries `source` and `page` metadata.
loader = PyPDFLoader(file_path="Reading from SQL Databases.pdf")  ##<-REPLACE WITH YOUR PDF FILE
pages = loader.load()

# Report how many documents were loaded, then show the first one's metadata.
print(len(pages))
pages[0].metadata

13


{'source': 'Reading from SQL Databases.pdf', 'page': 0}

Split Documents

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded pages into small overlapping chunks
# (chunk size 250, overlap 25) so each chunk can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=25)
splits = text_splitter.split_documents(pages)

# Report the number of chunks, then peek at the text of the second one.
print(len(splits))
splits[1].page_content

27


'S\nQL (Structured Query Language)\n▶Pronounced Ess Queue Ell orSequel\n▶The package RODBC is used to read SQL databases (and other database\nformats).\n▶Load required package\n> library(RODBC)\n▶Get an overview of the package: library(help=RODBC)'

In [7]:
#!pip install tiktoken

Vector Store

In [8]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding = OpenAIEmbeddings()
persist_directory = 'sagemaker-studiolab-notebooks/GenAIDemos/ChromaPDF/'

# Build the vector store only once. Calling Chroma.from_documents on every run
# re-embeds all chunks (paid API calls) and appends them again to the same
# persisted collection, so re-running the cell would duplicate every chunk.
# NOTE: if the PDF or the splitter settings change, delete `persist_directory`
# so the store is rebuilt from the new chunks.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A persisted store already exists: reopen it instead of re-embedding.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
else:
    # First run: embed the chunks and write the store to disk.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory
    )
    vectordb.persist()

# Sanity check: number of chunks currently held by the store.
print(vectordb._collection.count())

27


Chat Chain

In [9]:
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

# Conversation memory: earlier turns are stored under "chat_history"
# as message objects and handed back to the chain on each call.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Chat model used to answer; temperature 0 is set explicitly.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Retriever over the vector store built in the previous section.
retriever = vectordb.as_retriever()

# Tie the model, the retriever and the memory together into one chat chain.
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

Chat With PDF

In [10]:
# First question: call the chain with a {"question": ...} mapping and print only the answer text.
question = "How can I get all the rows from a Table using R?"
result = qa(dict(question=question))
print(result["answer"])

To get all the rows from a table using R, you can use the `sqlFetch()` function. Here's an example:

```R
# Assuming you have a database connection named 'conn'
# and you want to get all the rows from the 'manufacturer' table in the 'bi' schema

mf <- sqlFetch(conn, "bi.manufacturer")
```

This will fetch all the rows from the 'manufacturer' table and store them in the 'mf' variable. You can then use the 'mf' variable to work with the data.


In [11]:
# Follow-up question: the chain was built with conversation memory, so the previous turn is part of its context.
question = "How many data types are supported by RODBC library in R language?"
result = qa(dict(question=question))
print(result["answer"])

The RODBC library in the R language supports several data types, including smallint, integer, bigint, character, numeric, date, factor, varchar, and datetime.


In [12]:
# Show the whole result dict from the last call: the question, the accumulated
# chat_history messages, and the answer.
result

{'question': 'How many data types are supported by RODBC library in R language?',
 'chat_history': [HumanMessage(content='How can I get all the rows from a Table using R?', additional_kwargs={}, example=False),
  AIMessage(content='To get all the rows from a table using R, you can use the `sqlFetch()` function. Here\'s an example:\n\n```R\n# Assuming you have a database connection named \'conn\'\n# and you want to get all the rows from the \'manufacturer\' table in the \'bi\' schema\n\nmf <- sqlFetch(conn, "bi.manufacturer")\n```\n\nThis will fetch all the rows from the \'manufacturer\' table and store them in the \'mf\' variable. You can then use the \'mf\' variable to work with the data.', additional_kwargs={}, example=False),
  HumanMessage(content='How many data types are supported by RODBC library in R language?', additional_kwargs={}, example=False),
  AIMessage(content='The RODBC library in the R language supports several data types, including smallint, integer, bigint, characte

In [13]:
# Ask the same question again through `run`, which returns just the answer string.
# The chain was built with conversation memory, so the earlier exchanges are part
# of its chat history for this call.
qa.run("How many data types are supported by RODBC library in R language?")

'The RODBC library in R supports the following data types:\n\n- smallint: integer\n- integer: integer\n- bigint: integer\n- character: character\n- numeric: numeric\n- date: factor\n- varchar: integer or factor\n- datetime: character\n- POSIXct: character'