In [8]:
import os
import google.generativeai as genai

os.environ['GOOGLE_API_KEY'] = 'AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE'
genai.configure(api_key='AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE')

# data retrieve

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

DATA_PATH = r"F:\CMC\CMC_Study\Code\data\determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf"
CHROMA_PATH = "./Chroma_finance"

In [10]:
def load_documents():
    # document_loader = PyPDFDirectoryLoader(DATA_PATH)
    # return document_loader.load()
    # from langchain_community.document_loaders import PyPDFLoader
    # loader = PyPDFLoader(DATA_PATH)
    # pages = loader.load_and_split()
    # return pages
    from langchain_community.document_loaders import UnstructuredFileLoader
    loader = UnstructuredFileLoader(DATA_PATH)
    docs = loader.load()
    return docs


def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def get_embedding_function():
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return gemini_embeddings


def add_to_chroma(chunks: list[Document]):
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id

    return chunks

In [6]:
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

Number of existing documents in DB: 109
✅ No new documents to add


# Run

In [11]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    # làm phằng, đưa list of list -> 1 list
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return list documents
    return [loads(doc) for doc in unique_docs]

In [12]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate

model = GoogleGenerativeAI(model="models/gemini-1.5-pro-001", temperature=0.1)
embeddings = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
retriever = db.as_retriever(search_kwargs={"k": 3})

In [13]:
template = """You are an AI language model assistant. Your task is to based on user questions and retrieval documents, 
point out important headings and generate questions for each such heading. By By creating many small questions, your goal 
is to help make the answer clearer on each relevant aspect.
Provide these alternative questions separated by newlines, output: list queries. Original question: {question}"""
prompt_1 = PromptTemplate.from_template(template)

In [14]:
generate_queries = (
    prompt_1 
    | model
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [15]:
generate_queries.invoke({"question": "What are the determinants of financial inclusion in Vietnam?"})

['Please provide me with the retrieval documents you mentioned so I can formulate relevant headings and questions. I need the context from the documents to accurately identify the determinants of financial inclusion in Vietnam and create insightful questions. ',
 '',
 'For example, are the documents about:',
 '',
 '* The impact of mobile banking on financial inclusion?',
 '* The role of government policies in promoting financial inclusion?',
 '* The challenges of reaching rural populations with financial services?',
 '',
 'Once I have this information, I can generate headings like:',
 '',
 '* **Mobile Banking Adoption**',
 '    * What is the smartphone penetration rate in Vietnam?',
 '    * What are the barriers to mobile banking adoption among different demographics?',
 '* **Government Policies and Regulations**',
 '    * What specific policies has the Vietnamese government implemented to promote financial inclusion?',
 '    * How effective have these policies been in achieving their 

In [16]:
question = "What are the determinants of financial inclusion in Vietnam?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
print(len(docs))

9


  warn_beta(


In [17]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

In [18]:
template2 = """Answer the following question based on this context:

{context}

Question: {question}
"""

In [19]:
prompt2 = PromptTemplate.from_template(template2)

In [20]:
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt2
    | model
    | StrOutputParser()
)

In [21]:
result = final_rag_chain.invoke({"question":question})

In [22]:
print(result)

The determinants of financial inclusion in Vietnam, according to this research paper, are:

* **Demographic characteristics:** This includes factors like gender, age, education attainment, income level, and working status. 
* **Access to financial services:** This refers to the availability of financial institutions, like banks and ATMs, particularly for those in rural areas.
* **Usage of financial services:** This goes beyond just having access and looks at whether individuals actually utilize financial products and services.
* **Financial literacy:**  The research highlights the need for policies promoting financial literacy through education to increase the adoption and usage of financial products and services.
* **Income level:** The study found that low-income people are disadvantaged in capturing financial inclusion, emphasizing the need for policies to help these groups access and use financial services.

The paper uses data from the World Bank Global Findex Database and employs