# PDF Q&A using LLMs

# API keys

In [33]:
# Expose the OpenAI key to client libraries through the process environment.
# `os` is imported in this cell so it also works when the notebook is run
# top-to-bottom (the notebook's other `import os` sits in a later cell).
# NOTE(review): `openai_key` is not defined in any visible cell - it must be
# assigned before this line runs; confirm where it comes from.
import os

os.environ['OPENAI_API_KEY'] = openai_key

# Simple PDF reader

In [2]:
# PyPDF2 supplies the reader used for a first look at the raw PDF.
from PyPDF2 import PdfReader

# Folder holding the PDFs and the file to analyse. `fn` is assigned several
# times: only the last assignment takes effect, so reorder the lines to
# switch document.
pdf_folder_path = 'pdf_folder'
fn = 'test_file.pdf'
# Algebris prospectus: p199-223 for fund focus (Algebris Financial Credit
# Fund) // 0-135 for main section.
fn = 'Algebris_UCITS.pdf'
fn = 'FOMCpresconf20230614.pdf'

# Open the selected file and report how many pages it has.
reader = PdfReader(f'{pdf_folder_path}/{fn}')
page_total = len(reader.pages)
print(f'*** {page_total} pages found ***')

*** 5 pages found ***


In [3]:
# Concatenate the extracted text of every page. str.join assembles the result
# in a single pass instead of growing a string with `+=` inside a loop.
total_text = ''.join(page.extract_text() for page in reader.pages)

# Whitespace-delimited word count of the whole document.
word_count = len(total_text.split())
print("*** Total words:", word_count,'***')

*** Total words: 1457 ***


In [4]:
# Rough cost estimate for the document: a flat rate of 0.002 per 1000 units,
# applied to the word count computed earlier.
# NOTE(review): neither the currency nor the model this rate belongs to is
# stated here, and API pricing is usually quoted per token rather than per
# word - confirm before relying on the figure.
vec_price = 0.002 / 1000
doc_price = vec_price * word_count
doc_price  # last expression of the cell, so the notebook displays it

0.002914

# LangChain

In [5]:
import os

## Load & Parse PDF

In [6]:
# Load the raw PDF with LangChain; the loaded documents get split into smaller
# chunks before being passed to the vectorizer and the LLM.
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader

loader = PyPDFLoader(f'{pdf_folder_path}/{fn}')
documents = loader.load()

In [7]:
# Number of documents returned by the loader (displayed as the cell output).
len(documents)

5

### Split in chunks

In [8]:
# Break the loaded documents into overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

In [9]:
# Number of chunks produced by the text splitter (displayed as the cell output).
len(texts)

9

### Embeddings

In [10]:
# Publish both provider credentials through the process environment, in the
# same order as before: Hugging Face token first, then the OpenAI key.
os.environ.update(
    {
        "HUGGINGFACEHUB_API_TOKEN": huggingface_key,
        "OPENAI_API_KEY": openai_key,
    }
)

In [30]:
# Embedding back-ends: a Hugging Face sentence-transformer model and OpenAI.
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.embeddings import OpenAIEmbeddings

# Default embedding model from the Hugging Face Hub.
hugg_ef = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

# The API key is handed to the constructor: the documented ways to supply it
# are the `openai_api_key` argument or the OPENAI_API_KEY environment
# variable, not an attribute assigned on the OpenAIEmbeddings class.
open_ai_ef = OpenAIEmbeddings(
    model='text-embedding-ada-002',  # default model - performing well and much cheaper
    openai_api_key=openai_key,
)

### Vector DB - Chroma

In [31]:
# Build the Chroma vector store from the chunks, embedding them with the
# OpenAI embedding function and storing them in the 'fomc' collection.
from langchain.vectorstores import Chroma

store = Chroma.from_documents(
    texts,
    embedding=open_ai_ef,
    collection_name='fomc',
)


AuthenticationError: <empty message>

### Select LLM model

In [23]:
# Instantiate the candidate LLMs (one Hugging Face Hub model, two OpenAI ones).
from langchain import HuggingFaceHub
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Hugging Face model
hugg_face_llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 1})

# OpenAI completion models
# "text-davinci-003, text-davinci-002, text-curie-001, text-babbage-001, text-ada-001"
# The API key is passed to each constructor: the documented ways to supply it
# are the `openai_api_key` argument or the OPENAI_API_KEY environment
# variable, not an attribute assigned on the OpenAI / ChatOpenAI classes.
open_ai_llm = OpenAI(
    model='text-davinci-003',  # most advanced / more expensive
    temperature=0.2,
    openai_api_key=openai_key,
)

# Chat models
# "gpt-4, gpt-4-0613, gpt-4-32k, gpt-4-32k-0613, gpt-3.5-turbo, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k, gpt-3.5-turbo-16k-0613"
chat_open_ai_llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0.2,  # temperature argument (controls creativity)
    openai_api_key=openai_key,
)


### RetrievalQA - pass only the most similar chunks to the LLM

In [24]:
# How many chunks the similarity search hands over to the LLM per question.
k = 8
retriever = store.as_retriever(
    search_type='similarity',  # alternative: 'mmr'
    search_kwargs=dict(k=k),
)

In [25]:
# Question-answering chain that feeds only the retrieved chunks to the LLM.
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=chat_open_ai_llm,
    # 'refine' or 'stuff' (no refine for gpt 3.5 turbo)
    chain_type='stuff',
    # keep the source documents in the result alongside the answer
    return_source_documents=True,
)

### Prompts

In [26]:
# FOMC prompts. The last entry is the one selected, which matches the effect
# of assigning `prompt` once per question.
fomc_prompts = (
    "what has been decided? rate hike or cut? by how much?",
    "Growth outlook? Output as bullet points",
    "inflation outlook? Output as bullet points",
    "Details the votes for rate hike or cut. output as bullet points",
    "any guidance on future rates hike, pause or cut?",
)
prompt = fomc_prompts[-1]

In [997]:
# K-Means paper prompts. The last entry is the one selected, which matches
# the effect of assigning `prompt` once per question.
kmeans_prompts = (
    "what is this paper about?",
    "who's the author of this paper?",
    "can you provide summary of findings?",
)
prompt = kmeans_prompts[-1]

In [998]:
# Algebris prospectus prompts. The last entry is the one selected, which
# matches the effect of assigning `prompt` once per question.
# A stray non-ASCII character inside the word "Output" of the cut-off-time
# prompt has been removed so the instruction reaches the model intact.
algebris_prompts = (
    "when can I buy or sell shares?",
    "Detail what is a dealing day for the fund and specify the cut-off time?",
    "what is the cut-off time.Output as bullets??",
    "what is the management fee for each share class type. Output as list?",
    "what is the performance fee for each share class type. Output as list?",
    "How much is performance fee? what are the terms / conditions.Output as bullets?",
    "Can the fund invest in other funds ?",
    "What's the fund strategy or objective?",
    "Can the fund be levered / apply leverage? What would be maximum amount if any?",
    'List all the instruments and asset classes the fund can invest in',
)
prompt = algebris_prompts[-1]

In [1031]:
# Redemption-related prompts. The last entry is the one selected, which
# matches the effect of assigning `prompt` once per question.
redemption_prompts = (
    "When selling / redeeming shares, is there any risk of gate? Output as bullets",
    "is there any lock-up / penalty if redeeming early from the fund? Output as bullets",
)
prompt = redemption_prompts[-1]

In [1057]:
# Prompt used for the Q&A run below (same question as the last FOMC prompt).
prompt = "any guidance on future rates hike, pause or cut?"

### Run the model Q&A

In [27]:
# Run the RetrievalQA chain on the selected prompt. The following cells read
# answer['result'] and answer['source_documents'] from the returned value.
answer = qa(prompt)

AuthenticationError: <empty message>

### Print answers and refs

In [None]:
# Show the answer text stored under the 'result' key.
print(answer['result'])

In [22]:
# Print every source document attached to the answer under a page banner.
# A plain loop is used because print() is a side effect: a list comprehension
# would also build a list of None values, which the notebook then displays.
for doc in answer['source_documents']:
    print(f'\n---------------------------\n Page:{doc.metadata["page"]}\n---------------------------\n{doc.page_content} {doc.metadata["page"]}')

NameError: name 'answer' is not defined