In [4]:
#!pip install google-cloud-aiplatform
#!pip install --upgrade langchain
#!pip install --upgrade chromadb
#!pip install pypdf

In [1]:
import os
import google.generativeai as palm
import textwrap
import numpy as np
import pandas as pd
import langchain
import chromadb
import shutil
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFDirectoryLoader
#from langchain.document_loaders import PyPDFLoader # for loading the pdf
from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings.google_palm import GooglePalmEmbeddings

In [3]:
# palm.configure(api_key=os.environ["PALM_API_KEY"])
# palm_embedding_model = [m for m in palm.list_models() if 'embedText' in m.supported_generation_methods][0]
# palm_text_model = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods][0]
# print(palm_embedding_model.name)
# print(palm_text_model.name)

In [4]:
palm.configure(api_key=os.environ["PALM_API_KEY"])
embedding_model = [m for m in palm.list_models() if 'embedText' in m.supported_generation_methods][0]
text_model = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods][0]

## creating dataframe for pdf files

In [5]:
docs_directory = str(os.getcwd()) + '/test'
loader = PyPDFDirectoryLoader(f"{docs_directory}")
# documents = loader.load()
docs = loader.load_and_split()

In [6]:
len(docs)

260

In [7]:
context = "\n".join(str(p.page_content) for p in docs)
print("The total words in the context: ", len(context))

The total words in the context:  428167


In [8]:
# Convert the list of dictionaries to a DataFrame
data = {
    'context': [doc.page_content for doc in docs],
    'source': [doc.metadata['source'] for doc in docs],
    'page': [doc.metadata['page'] for doc in docs]
}
df = pd.DataFrame(data)

In [11]:
print(df['context'][0])

7/24/23, 10:54 PM About Fund-Raisers
https://www .charities.gov .sg/Pages/Fund-Raising/About-Fund-Raisers.aspx 1/1About Fund-Raisers
Organisations (including charities and Institutions of a Public Character (IPCs)) or individuals may raise funds for charitable,
benevolent or philanthropic purposes, such as to support a charity’s operations or programmes, or to provide financial aid for
disaster relief in other countries. 
"Fund-raising appeal" is defined as an appeal to any persons to give money or property, or a receipt from any persons of money or
property for charitable, benevolent or philanthropic purposes.  All fund-raising appeals in Singapore, whether online or offline,
regardless if it is for local or foreign charitable, benevolent or philanthropic purposes, are regulated under the Charities Act, Charities
(Fund-Raising Appeals for Local and Foreign Charitable Purposes) Regulations 2012 and the Charities (Institutions of A Public
Character) Regulations (for IPCs).
This section 

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   context  260 non-null    object
 1   source   260 non-null    object
 2   page     260 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ KB


In [28]:
# import csv
# from io import StringIO

# for index, row in df.iterrows():
#     text = row['context']
#     escaped_text = csv.writer(StringIO(), quoting=csv.QUOTE_NONNUMERIC).writerow([text])
#     if text != escaped_text:
#         print(f"Row {index}: '{text}' needs to be escaped as '{escaped_text}'")

In [13]:
df.to_csv('fund.csv', index=False, escapechar='\\')
#df.to_csv('fund.csv', index=False)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   context  260 non-null    object
 1   source   260 non-null    object
 2   page     260 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ KB


In [35]:
# Get the embeddings of each text and add to an embeddings column in the dataframe
def embed_fn(text):
    return palm.generate_embeddings(model=embedding_model, text=text)['embedding']

In [45]:
def find_best_passage(query, dataframe):
    """
    Compute the distances between the query and each document in the dataframe
    using the dot product.
    """
    query_embedding = palm.generate_embeddings(model=embedding_model, text=query)
    print(len(query_embedding['embedding']))
    dot_products = np.dot(np.stack(dataframe['embedding']), query_embedding['embedding'])

    #idx = np.argmax(dot_products)
    idx_topn = np.argsort(dot_products)[::-1][:10]
    tmp_text = ""
    
    for idx in idx_topn:
        tmp_text = tmp_text + str(dataframe.iloc[idx]['page_content']) + "\n\n"
    
    return tmp_text # Return text from index with max value


In [37]:
def make_prompt(query,df):
    relevant_passage=find_best_passage(query,df)
    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    prompt = textwrap.dedent("""You are an intelligent customer support from People Association (PA) Singapore. You are a helpful and informative bot that answers questions using text from the reference passages included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

    ANSWER:
    """).format(query=query, relevant_passage=escaped)
    print(prompt)
    return prompt

In [38]:
def answer_query_with_context(query,df):
    answer = palm.generate_text(prompt=make_prompt(query,df),
                                model=text_model,
                                candidate_count=1,
                                temperature=0,
                                max_output_tokens=500)
    print(answer.result)
    return answer.result
    # return answer.result.strip(" \n").replace('\n', '<br />')


In [40]:
embedding_model = [m for m in palm.list_models() if 'embedText' in m.supported_generation_methods][0]

In [41]:
print("start generating embeddings ...")
df['embedding'] = df['page_content'].apply(embed_fn)
print("finished generating embeddings")

start generating embeddings ...
finished generating embeddings


In [42]:
df.head()

Unnamed: 0,page_content,source,page,embedding
0,"First published in the Government Gazette , El...",C:\Users\chatgptadmin\Documents\FAQ_FUND_chatb...,0,"[-0.03116282, -0.037926417, -0.030958083, 0.01..."
1,(b) any exempt charity or charity registered u...,C:\Users\chatgptadmin\Documents\FAQ_FUND_chatb...,1,"[-0.030313207, -0.027389854, -0.022180524, 0.0..."
2,Made this 23rd day of August 2011.\nCHAN HENG ...,C:\Users\chatgptadmin\Documents\FAQ_FUND_chatb...,2,"[-0.017592462, -0.025003977, -0.056929152, -0...."
3,"First published in the Government Gazette , El...",C:\Users\chatgptadmin\Documents\FAQ_FUND_chatb...,0,"[-0.02403276, -0.042849418, 0.0014022454, -0.0..."
4,PART III\nCONTROL OF FUND-RAISING FOR\nCHARITA...,C:\Users\chatgptadmin\Documents\FAQ_FUND_chatb...,1,"[-0.005580242, -0.044570442, -0.0045610387, 0...."


In [44]:
len(df['embedding'][0]) # PALM embedding vectors is always 768 long

768

In [48]:
query = "what does charity mean"
answer_query_with_context(query,df)

768
You are an intelligent customer support from People Association (PA) Singapore. You are a helpful and informative bot that answers questions using text from the reference passages included below.     Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.     However, you are talking to a non-technical audience, so be sure to break down complicated concepts and     strike a friendly and converstional tone.     If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: 'what does charity mean'
    PASSAGE: '(a) the whole or part of — (i) the consideration given for goods or services sold or supplied by the commercial participatoror person; or (ii) any proceeds (other than such consideration) of a promotional venture undertaken by the commercial participator or person; or (b) sums given by the commercial participator or person by way of donation in connection with the sale or supply of any such goods or services

'According to the Charities Act 1994, a charity is any institution, corporate or not, which is established for charitable purposes and is subject to the control of the General Division of the High Court in exercise of the jurisdiction of the General Division of the High Court with respect to charities.'

## testing

In [34]:
loader = PyPDFDirectoryLoader(f"{docs_directory}")
# documents = loader.load()
docs = loader.load_and_split()
# chunk size refers to max no. of chars, not tokens
#     text_splitter = CharacterTextSplitter(separator = "\n\n", 
#                                           chunk_size=800, 
#                                           chunk_overlap=80, 
#                                           length_function = len)

#     texts = text_splitter.split_documents(documents)

chroma_db = Chroma.from_documents(docs,#texts, 
                                  palm_embedding, 
                                  collection_name=collection_name, 
                                  persist_directory=persist_directory)
chroma_db.persist()
print("document embedding creation done")

document embedding creation done


In [53]:
# Expose index to the retriever
retriever = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [62]:
docs = retriever.get_relevant_documents("Class licence for fundraiser lotteries")

In [63]:
print(docs[0].page_content)

Excluded gambling services
3.This Order does not apply to the provision of any gambling
service that is covered by a class licence under —
(a) the Gambling Control (Minor Gambling —Class Licence)
Order 2022 (G.N. No. S 659/2022); or
(b) the Gambling Control (Trade and Other Promotional
Games and Lotteries —Class Licence) Order 2022
(G.N. No. S 660/2022).
Class licence for fundraiser lotteries
4.—(1) Unless exempt under section 128 of the Act, every eligible
person who provides a gambling service by conducting a fundraiser
lottery at any time on or after 2 August 2022 in or from any place in
Singapore, is subject by virtue of this Order to a class licence inconnection with the provision of such a gambling service.
(2) Where a fundraiser lottery is conducted, at any time on or after
2 August 2022 in or from any place in Singapore, by a gambling
service agent acting under the authority of an eligible person, the
gambling service agent is also subject by virtue of this Order to a class
lic

In [None]:
def initialize_chromadb():
    print("creating embedding for documents...")
    if os.path.exists(persist_directory):
        # Remove the entire directory and its content
        shutil.rmtree(persist_directory)
        print("existing collection is deleted")

    #loader = DirectoryLoader(f"{docs_directory}", glob="**/*.txt")
    #loader = DirectoryLoader(f"{docs_directory}", glob="**/*")
    loader = PyPDFDirectoryLoader(f"{docs_directory}")
    # documents = loader.load()
    docs = loader.load_and_split()
    # chunk size refers to max no. of chars, not tokens
#     text_splitter = CharacterTextSplitter(separator = "\n\n", 
#                                           chunk_size=800, 
#                                           chunk_overlap=80, 
#                                           length_function = len)
    
#     texts = text_splitter.split_documents(documents)

    chroma_db = Chroma.from_documents(docs,#texts, 
                                      palm_embedding, 
                                      collection_name=collection_name, 
                                      persist_directory=persist_directory)
    chroma_db.persist()
    print("document embedding creation done")

initialize_chromadb()

In [None]:
# https://safe.menlosecurity.com/https://github.com/chroma-core/chroma/blob/main/chromadb/utils/embedding_functions.py
# palm_embedding = embedding_functions.GooglePalmEmbeddingFunction(api_key=os.environ["PALM_API_KEY"], 
#                                                                  model_name=palm_embedding_model.name)

# palm_embedding = GooglePalmEmbeddings(google_api_key=os.environ["PALM_API_KEY"], 
#                                       model_name=palm_embedding_model.name)