In [None]:
!pip install langchain-community
!pip install -U duckduckgo-search
!pip install langchain
!pip install langchain_google_genai
!pip install "unstructured[pdf]"
!pip install chromadb

In [1]:
import os
import google.generativeai as genai

os.environ['GOOGLE_API_KEY'] = 'AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE'
genai.configure(api_key='AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE')

# DATA

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

DATA_PATH = r"/content/determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf"
CHROMA_PATH = "./Chroma_finance"

def load_documents():
    # document_loader = PyPDFDirectoryLoader(DATA_PATH)
    # return document_loader.load()
    # from langchain_community.document_loaders import PyPDFLoader
    # loader = PyPDFLoader(DATA_PATH)
    # pages = loader.load_and_split()
    # return pages
    from langchain_community.document_loaders import UnstructuredFileLoader
    loader = UnstructuredFileLoader(DATA_PATH)
    docs = loader.load()
    return docs


def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def get_embedding_function():
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return gemini_embeddings


def add_to_chroma(chunks: list[Document]):
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id

    return chunks

In [7]:
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

Number of existing documents in DB: 0
👉 Adding new documents: 109


  warn_deprecated(


#RUN

In [18]:
from langchain.load import dumps, loads
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_core.documents import Document
from typing import List
wrapper = DuckDuckGoSearchAPIWrapper(max_results=10)
search = DuckDuckGoSearchResults(api_wrapper=wrapper, source="news")

def search_api(queries: list):
    res = []
    for query in queries:
        if query != "":
            res.append(search.run(query))
    return res

def format_sources(docs: str):
    key = ['snippet:', 'title:', 'link:']
    docs = docs[1:-1]
    list_doc = docs.split("],")
    source = []
    for doc in list_doc:
        metadata = {}
        ind_1 = doc.find(key[0])
        ind_2 = doc.find(key[1])
        ind_3 = doc.find(key[2])
        metadata['title'] = doc[ind_2 + 7:ind_3]
        metadata['source'] = doc[ind_3 + 6:]
        document = Document(
            page_content = doc[ind_1+2 + 7:ind_2],
            metadata=metadata
        )
        source.append(document)
    return source

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    # làm phằng, đưa list of list -> 1 list
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return list documents
    return [loads(doc) for doc in unique_docs]

# generate_queries = (
#     prompt_1
#     | model
#     | StrOutputParser()
#     | (lambda x: x.split("\n"))
#     )

def format_docs(docs: List[Document]) -> str:
    """Convert Documents to a single string.:"""
    formatted = [
        f"Article Snippet: {doc.page_content}" for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)


def retrieval_local(generate_queries_chain, retrieval, question):
    print("retrieval_local")
    retrieval_chain = generate_queries_chain | retrieval.map() | get_unique_union
    docs = retrieval_chain.invoke({"question":question})
    return docs

def retrieval_onl(generate_queries_chain, question):
    print("retrieval_onl")
    search_chain = generate_queries_chain | search_api
    docs_search = search_chain.invoke({"question":question})
    documents = []
    for val in docs_search.copy():
        doc = format_sources(val)
        documents.extend(doc)
    return documents

def retrieval_fusion(generate_queries_chain, retrieval, question):
    print("retrieval")
    doc_pdf = retrieval_local(generate_queries_chain, retrieval, question)
    doc_search = retrieval_onl(generate_queries_chain, question)
    return doc_pdf + doc_search

In [9]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

model = GoogleGenerativeAI(model="models/gemini-1.5-pro-001", temperature=0.1)
embeddings = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
retrieval = db.as_retriever(search_kwargs={"k": 1})

In [10]:
template="""You are an AI language model assistant, question analysis expert. Your task is to base on user questions point out important headings
with the intention of asking questions about each heading. By creating many small questions, your goal
is to help make the answer clearer on each relevant aspect.
Provide these headers separated by newlines: \n
Question: {question}"""

template2 = """Answer the following question based on this context:
{context}
Question: {question}
"""

template4 = """
You're a helpful AI assistant. Given a user question and some DuckDuckGoSearchResults article snippets,
answer the user question. If none of the articles answer the question, just say you don't know."
\n\nHere are the context:{context}
"""

prompt_1 = PromptTemplate.from_template(template)
prompt_2 = PromptTemplate.from_template(template2)
# prompt_3 = PromptTemplate.from_template(template3)
prompt_4 = PromptTemplate.from_template(template4)

In [11]:
question = "What are the determinants of financial inclusion in Vietnam?"
generate_queries = (
    prompt_1
    | model
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)
generate_queries.invoke(question)
# context = retrieval_fusion(generate_queries, retrieval, question)

['## Determinants of Financial Inclusion in Vietnam:',
 '',
 '1. **Economic Factors:**',
 '    * What is the role of GDP growth in promoting financial inclusion?',
 '    * How does income inequality impact access to financial services?',
 '    * What is the impact of employment levels and types of employment on financial inclusion?',
 '',
 '2. **Demographic Factors:**',
 '    * How does age influence financial inclusion rates?',
 '    * Are there gender disparities in access to financial services?',
 '    * Does education level correlate with financial inclusion?',
 '    * What is the impact of geographical location (urban vs. rural) on financial access?',
 '',
 '3. **Institutional Factors:**',
 '    * What is the role of government policies and regulations in promoting financial inclusion?',
 '    * How developed is the financial infrastructure (e.g., bank branches, ATMs) in Vietnam?',
 '    * What is the role of technology and digital financial services in expanding financial inclusi

In [13]:
context = retrieval_fusion(generate_queries, retrieval, question)
context

retrieval
retrieval_local
retrieval_onl


[Document(page_content='Financial inclusion is mainly observed on the suppli- ers’ side while the demand side which presents the actual usage of financial products or services more accurately reflects the level of financial inclusion of end-consumers. On the supply side, access to financial institutions is usu- the numbers of bank ally reported as the data of branches and automatic teller machines (ATM). These indicators considered banking penetration two common measurements of financial inclusion (Beck et al., 2005, Beck et al., 2008). Sarma (2008) also employs the pro- portion of loans and deposits to GDP as the result of', metadata={'id': '/content/determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf:None:5', 'source': '/content/determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf'}),
 Document(page_content='From the user’s characteristics, a huge number of studies empirically assess the potential key driver. As a pioneer, Demirg€ucx-Kunt an

In [19]:
def run_main(context, question):
    final_rag_chain = (
        {"context": itemgetter("context"),
         "question": itemgetter("question")}
        | prompt_2
        | model
        | StrOutputParser()
    )
    return final_rag_chain.invoke({'context': context, 'question': question})

res = run_main(format_docs(context), question)
# question = "What are the determinants of financial inclusion in Vietnam?"
# final_rag_chain.invoke({"question": question})

In [20]:
res

'Based on the provided article snippets, the key determinants of financial inclusion in Vietnam are:\n\n* **Total income per household:** Higher income households are more likely to be financially included. This is consistent with findings from other studies that highlight income as a major factor in financial inclusion.\n* **Education level:** Individuals with higher levels of education are more likely to be financially included.  The article specifically mentions that "less educated" individuals face barriers to financial inclusion.\n* **Labor force participation:** Individuals who are part of the workforce are more likely to be financially included compared to those out of the workforce.\n* **Age:** Age has a significant impact on financial inclusion, though the relationship is not simply linear. The article mentions both "old people" facing barriers and "young age" being associated with lower financial inclusion in other studies.\n* **Gender:** While the article highlights gender d