In [1]:
# Common imports
import os
from dotenv import load_dotenv
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Importing Dependencies for RAG
from langchain.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


#Import the key CrewAI classes
# from crewai import Agent, Task, Crew

import json
import lolviz


In [3]:
# Load the API key from the local .env file and build the OpenAI client
load_dotenv('.env')
API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

# Show which model name (if any) is configured in the environment.
# The lookup is done outside an f-string on purpose: reusing the enclosing
# quote character inside an f-string expression is a SyntaxError on Python
# versions before 3.12. The printed text is unchanged.
print('OPENAI_MODEL_NAME = "{}"'.format(os.getenv('OPENAI_MODEL_NAME')))

# Model name constant for this notebook
OPENAI_MODEL_NAME = "gpt-4o-mini"


In [5]:
# Note that this function takes the chat "messages" list directly as its parameter.
def get_completion(messages, model="gpt-4o-mini", temperature=0, top_p=1.0, max_tokens=1024, n=1):
    """Request chat completion(s) for `messages` and return the reply text.

    Args:
        messages: Passed unchanged as the `messages` argument of
            `client.chat.completions.create`.
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        n: Number of completions to request.

    Returns:
        The content of the single completion (str) when `n == 1`, which is
        the default; otherwise a list with the content of every returned
        completion, in the order the API returned them.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        n=n,  # was hard-coded to 1, so the caller's `n` had no effect
    )
    contents = [choice.message.content for choice in response.choices]
    # Keep the original str return for the default single-completion case.
    return contents[0] if n == 1 else contents

In [7]:
# Helper functions for counting the tokens in a text string or in a list of chat "messages"

import tiktoken

def count_tokens(text, model='gpt-4o-mini'):
    """Return how many tokens `text` encodes to with `model`'s tokenizer.

    Args:
        text: The string to tokenize.
        model: Model name passed to `tiktoken.encoding_for_model`; defaults
            to the model this function was previously hard-coded to.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def count_tokens_from_message_rough(messages, model='gpt-4o-mini'):
    """Roughly count the tokens in the `content` fields of `messages`.

    The contents are joined with single spaces and tokenized as one string,
    so the figure is an approximation and ignores every other message field.

    Args:
        messages: Iterable of dicts; each dict's 'content' value is used.
        model: Model name passed to `tiktoken.encoding_for_model`.

    Returns:
        The token count of the joined contents (0 for an empty list when the
        tokenizer encodes the empty string to no tokens).
    """
    encoding = tiktoken.encoding_for_model(model)
    # A message whose 'content' is missing or None is treated as empty text;
    # passing None to str.join would raise TypeError.
    value = ' '.join((message.get('content') or '') for message in messages)
    return len(encoding.encode(value))

In [9]:
# Embedding model used for this session
embeddings_model = OpenAIEmbeddings(
    model='text-embedding-3-small',
)

# Chat model used by the RAG pipelines in this notebook
llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0,
    seed=42,
)

In [11]:
# Get the names of the files in the folder

# folder path
dir_path = r'./News Releases'

# Keep regular files only (sub-directories are skipped). os.listdir() returns
# entries in arbitrary order, so the names are sorted to give the same
# document order on every platform and every run.
filename_list = sorted(
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)

# Print the list of filenames and the count
print(filename_list)
print()
print(len(filename_list))

['CPF interest rates from 1 April 2024 to 30 June 2024.pdf', 'CPF interest rates from 1 January 2024 to 31 March 2024 and Basic Healthcare Sum for 2024.pdf', 'CPF interest rates from 1 July 2024 to 30 September 2024.pdf', 'Extends 4 percent interest rate floor until 31 Dec 25.pdf', 'Joint advisory on government official impersonation scam.pdf', 'Lowering maximum daily withdrawal limit.pdf', 'MoneySense marks 20th anniversary.pdf', 'National retirement planning.pdf', 'New default daily limit.pdf']

9


In [13]:
from langchain.document_loaders import PyPDFLoader

# Load every PDF found in the folder into one flat list of Document objects
list_of_documents_loaded = []
for filename in filename_list:
    # Case-insensitive check so a file named e.g. "report.PDF" is not skipped
    if not filename.lower().endswith('.pdf'):
        continue
    try:
        # Build the path from dir_path (the folder that was actually listed)
        # rather than repeating the folder name. normpath drops the leading
        # "./", so the path text matches what joining the bare folder name
        # with the filename would give.
        pdf_path = os.path.normpath(os.path.join(dir_path, filename))
        loader = PyPDFLoader(pdf_path)

        # load() returns a list of Document objects
        data = loader.load()
        # use extend() so the pages are added individually, not as a nested list
        list_of_documents_loaded.extend(data)
        print(f"Loaded {filename}")
    except Exception as e:
        # Best effort: report the file that failed and move on to the next one
        print(f"Error loading {filename}: {e}")
print()
print("Total documents loaded:", len(list_of_documents_loaded))

Loaded CPF interest rates from 1 April 2024 to 30 June 2024.pdf
Loaded CPF interest rates from 1 January 2024 to 31 March 2024 and Basic Healthcare Sum for 2024.pdf
Loaded CPF interest rates from 1 July 2024 to 30 September 2024.pdf
Loaded Extends 4 percent interest rate floor until 31 Dec 25.pdf
Loaded Joint advisory on government official impersonation scam.pdf
Loaded Lowering maximum daily withdrawal limit.pdf
Loaded MoneySense marks 20th anniversary.pdf
Loaded National retirement planning.pdf
Loaded New default daily limit.pdf

Total documents loaded: 20


In [15]:
# Check if the items in list_of_documents_loaded are Document objects
from langchain_core.documents import Document 
for position, entry in enumerate(list_of_documents_loaded):
    # One line per loaded item saying whether it is a Document instance
    message = (
        f"Item {position} is a Document object."
        if isinstance(entry, Document)
        else f"Item {position} is NOT a Document object."
    )
    print(message)


Item 0 is a Document object.
Item 1 is a Document object.
Item 2 is a Document object.
Item 3 is a Document object.
Item 4 is a Document object.
Item 5 is a Document object.
Item 6 is a Document object.
Item 7 is a Document object.
Item 8 is a Document object.
Item 9 is a Document object.
Item 10 is a Document object.
Item 11 is a Document object.
Item 12 is a Document object.
Item 13 is a Document object.
Item 14 is a Document object.
Item 15 is a Document object.
Item 16 is a Document object.
Item 17 is a Document object.
Item 18 is a Document object.
Item 19 is a Document object.


In [17]:
# Display the first loaded Document (its metadata and page_content)
list_of_documents_loaded[0]

Document(metadata={'source': 'News Releases\\CPF interest rates from 1 April 2024 to 30 June 2024.pdf', 'page': 0}, page_content='1 / 2CPF interest rates from 1 April 2024 to 30 June 2024\ncpf.gov .sg/member/infohub/news/news-releases/cpf-interest-rates-from-1-april-2024-to-30-june-2024\nCPF Interest Rates from 1 April 2024 to 30 June 2024\nOrdinary Account Special, MediSave and Retirement Account\nInterest Rate\nFloor2.5% 4.05%\nCPF members below 55 years old\nMembers earn an extra 1% interest  on the first $60,000 of their combined CPF\nbalances\nCPF members aged 55 and above\nMembers earn an extra 2% interest  on the first $30,000 of their combined CPF\nbalances, and an extra 1%  on the next $30,000\nHDB Concessionary Interest Rate from 1 April 2024 to 30 June 2024\nRemains unchanged at 2.6%\nNote: All interest rates are quoted on a per annum basis.\nWith the Special, MediSave and Retirement Account (SMRA) pegged rate exceeding the\nfloor rate of 4%, savings in the SMRA will earn 4.

In [96]:
# Display the second loaded Document (its metadata and page_content)
list_of_documents_loaded[1]

Document(metadata={'source': 'News Releases\\CPF interest rates from 1 April 2024 to 30 June 2024.pdf', 'page': 1}, page_content='2 / 2at $20,000 for OA). For members aged 55 and above, the Government pays an extra 2%\ninterest on the first $30,000 of their combined balances (capped at $20,000 for OA), and an\nextra 1% on the next $30,000.\nThe extra interest received on the OA balances will go into the member ’s Special Account\n(SA) or Retirement Account (RA). If a member is above 55 years old and participates in the\nCPF LIFE scheme, the extra interest will still be earned on his or her combined CPF\nbalances, which includes the savings used for CPF LIFE.\nInterest Rate for CPF Ordinary Account and HDB Concessionary Interest Rate\nThe OA interest rate will be maintained at 2.5% per annum from 1 April 2024 to 30 June\n2024.\nCorrespondingly , the concessionary interest rate for HDB housing loans, which is pegged at\n0.1% above the OA interest rate, will remain unchanged at 2.6% per a

In [19]:
# Token count of each loaded Document, to judge whether splitting is needed
for doc_number, loaded_doc in enumerate(list_of_documents_loaded, start=1):
    token_total = count_tokens(loaded_doc.page_content)
    print(f'Document {doc_number} -  has {token_total} tokens')

Document 1 -  has 427 tokens
Document 2 -  has 312 tokens
Document 3 -  has 405 tokens
Document 4 -  has 625 tokens
Document 5 -  has 224 tokens
Document 6 -  has 444 tokens
Document 7 -  has 216 tokens
Document 8 -  has 443 tokens
Document 9 -  has 335 tokens
Document 10 -  has 533 tokens
Document 11 -  has 431 tokens
Document 12 -  has 590 tokens
Document 13 -  has 413 tokens
Document 14 -  has 550 tokens
Document 15 -  has 99 tokens
Document 16 -  has 486 tokens
Document 17 -  has 588 tokens
Document 18 -  has 149 tokens
Document 19 -  has 518 tokens
Document 20 -  has 413 tokens


In [21]:
from langchain_community.vectorstores import Chroma
# Create the vector database.
# Each page gets a stable id built from its source path and page number.
# Because persist_directory is set, the collection survives between runs;
# without explicit ids, re-running this cell would add the same pages again
# under newly generated ids and the retriever would return duplicates.
# NOTE(review): relies on Chroma overwriting entries whose id already exists;
# confirm for the installed version. Duplicates already stored by earlier
# runs are not removed; delete ./vector_db once to clear them.
document_ids = [
    f"{doc.metadata['source']}:{doc.metadata['page']}"
    for doc in list_of_documents_loaded
]
vectordb = Chroma.from_documents(
    documents=list_of_documents_loaded,
    embedding=embeddings_model,
    ids=document_ids,
    collection_name="naive_splitter", 
    persist_directory="./vector_db"
)

In [23]:
# Create the RAG pipeline
# Very little code is needed because we use one of LangChain's predefined chains
from langchain.chains import RetrievalQA #QA = Question and Answer

# Combine the notebook's `llm` (defined earlier, GPT-4o-mini) with a retriever
# over `vectordb` into a question-answering chain
rag_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(),
)

In [25]:
# Use the RAG pipeline to ask a question whose answer is known to be in the documents.
# Only the answer text is printed; the passages it was drawn from are not shown here.
question = 'What is maximum Daily Withdrawal Limit for online CPF withdrawals ?'
llm_response = rag_chain.invoke(question)
answer_text = llm_response['result']
print(answer_text)

The maximum Daily Withdrawal Limit for online CPF withdrawals will be lowered to $50,000 effective from 25 September 2024.
