In [135]:
!pip install -U duckduckgo-search

Collecting duckduckgo-search
  Using cached duckduckgo_search-6.1.7-py3-none-any.whl (24 kB)
Installing collected packages: duckduckgo-search
Successfully installed duckduckgo-search-6.1.7


You should consider upgrading via the 'F:\CMC\CMC_Study\Code\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
import os
import google.generativeai as genai

os.environ['GOOGLE_API_KEY'] = 'AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE'
genai.configure(api_key='AIzaSyAk2SGsbPm5H-6K-rNgnIhQsBYwkm2GHhE')

# data retrieve

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

DATA_PATH = r"F:\CMC\CMC_Study\Code\data\determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf"
CHROMA_PATH = "./Chroma_finance"

In [3]:
def load_documents():
    # document_loader = PyPDFDirectoryLoader(DATA_PATH)
    # return document_loader.load()
    # from langchain_community.document_loaders import PyPDFLoader
    # loader = PyPDFLoader(DATA_PATH)
    # pages = loader.load_and_split()
    # return pages
    from langchain_community.document_loaders import UnstructuredFileLoader
    loader = UnstructuredFileLoader(DATA_PATH)
    docs = loader.load()
    return docs


def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def get_embedding_function():
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return gemini_embeddings


def add_to_chroma(chunks: list[Document]):
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
        chunk.metadata["id"] = chunk_id

    return chunks

In [4]:
# documents = load_documents()
# chunks = split_documents(documents)
# add_to_chroma(chunks)

# Run

In [63]:
from langchain.load import dumps, loads
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_core.documents import Document
from typing import List
wrapper = DuckDuckGoSearchAPIWrapper(max_results=10)
search = DuckDuckGoSearchResults(api_wrapper=wrapper, source="news")

def search_api(queries: list):
    res = []
    for query in queries:
        if query != "":
            res.append(search.run(query))
    return res

def format_sources(docs: str):
    key = ['snippet:', 'title:', 'link:']
    docs = docs[1:-1]
    list_doc = docs.split("],")
    source = []
    for doc in list_doc:
        metadata = {}
        ind_1 = doc.find(key[0])
        ind_2 = doc.find(key[1])
        ind_3 = doc.find(key[2])
        metadata['title'] = doc[ind_2 + 7:ind_3]
        metadata['source'] = doc[ind_3 + 6:]
        document = Document(
            page_content = doc[ind_1+2 + 7:ind_2],
            metadata=metadata
        )
        source.append(document)
    return source
    
def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    # làm phằng, đưa list of list -> 1 list
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return list documents
    return [loads(doc) for doc in unique_docs]

# generate_queries = (
#     prompt_1 
#     | model
#     | StrOutputParser() 
#     | (lambda x: x.split("\n"))
#     )

def format_docs(docs: List[Document]) -> str:
    """Convert Documents to a single string.:"""
    formatted = [
        f"Article Snippet: {doc.page_content}" for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)

    
def retrieval_local(generate_queries_chain, retrieval, question):
    print("retrieval_local")
    retrieval_chain = generate_queries_chain | retrieval.map() | get_unique_union
    docs = retrieval_chain.invoke({"question":question})
    return docs

def retrieval_onl(generate_queries_chain, question):
    print("retrieval_onl")
    search_chain = generate_queries_chain | search_api
    docs_search = search_chain.invoke({"question":question})
    documents = []
    for val in docs_search.copy():
        doc = format_sources(val)
        documents.extend(doc)
    return documents

def retrieval_fusion(generate_queries_chain, retrieval, question):
    print("retrieval")
    doc_pdf = retrieval_local(generate_queries_chain, retrieval, question)
    doc_search = retrieval_onl(generate_queries_chain, question)
    return doc_pdf + doc_search

In [21]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from operator import itemgetter
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

model = GoogleGenerativeAI(model="models/gemini-1.5-pro-001", temperature=0.1)
embeddings = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
retrieval = db.as_retriever(search_kwargs={"k": 1})

In [56]:
template="""You are an AI language model assistant, question analysis expert. Your task is to base on user questions point out important headings 
with the intention of asking questions about each heading. By creating many small questions, your goal
is to help make the answer clearer on each relevant aspect.
Provide these headers separated by newlines: \n
Question: {question}"""

template2 = """Answer the following question based on this context:
{context}
Question: {question}
"""

template4 = """
You're a helpful AI assistant. Given a user question and some DuckDuckGoSearchResults article snippets, 
answer the user question. If none of the articles answer the question, just say you don't know."
\n\nHere are the context:{context}
"""

prompt_1 = PromptTemplate.from_template(template)
prompt_2 = PromptTemplate.from_template(template2)
# prompt_3 = PromptTemplate.from_template(template3)
prompt_4 = PromptTemplate.from_template(template4)

In [57]:
question = "What are the determinants of financial inclusion in Vietnam?"
generate_queries = (
    prompt_1 
    | model
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
generate_queries.invoke(question)
# context = retrieval_fusion(generate_queries, retrieval, question)

['## Determinants of Financial Inclusion in Vietnam:',
 '',
 '1. **Economic Factors:**',
 '    * GDP Growth and Income Levels',
 '    * Income Inequality and Poverty Rates',
 '    * Employment and Labor Market Conditions',
 '',
 '2. **Demographic Factors:**',
 '    * Population Density and Urbanization',
 '    * Age Distribution and Financial Literacy Levels',
 '    * Gender Disparity in Financial Access',
 '',
 '3. **Institutional Factors:**',
 '    * Government Policies and Regulations',
 '    * Central Bank Initiatives and Financial Education Programs',
 '    * Development of Financial Infrastructure and Technology',
 '',
 '4. **Supply-Side Factors:**',
 '    * Presence and Reach of Traditional Banking Institutions',
 '    * Role of Microfinance Institutions and Fintech Companies',
 '    * Availability and Affordability of Financial Products and Services',
 '',
 '5. **Demand-Side Factors:**',
 '    * Public Trust and Confidence in Financial Institutions',
 '    * Financial Literacy 

In [58]:
context = retrieval_fusion(generate_queries, retrieval, question)

retrieval
retrieval_local
retrieval_onl


In [59]:
context

[Document(page_content='Vietnam has been in the transitional process from a centrally planned economy controlled by the government to a market-oriented system. After applying this policy in 1986, Vietnam has gained a lot of social and economic achievements. From a poor country, Vietnam is now categorized as a lower-middle-income country with a sta- ble and prospective economic growth rate. To reach this achievement, the financial industry plays an essential role. Vietnam has been focusing on improving the finan- cial sector as a key driver of economic growth. However, based on the assessment of the World Bank (2017), only 31% of Vietnamese people above 15 years old have an account at formal financial institutions, while the average number in lower-middle-income countries is 56%. This figure demonstrates the low', metadata={'id': 'F:\\CMC\\CMC_Study\\Code\\data\\determinants-of-financial-inclusion-in-vietnam-a-demand-side-approach.pdf:None:9', 'source': 'F:\\CMC\\CMC_Study\\Code\\data\\


<class 'str'>


In [61]:

def run_main(context, question):
    final_rag_chain = (
        {"context": itemgetter("context"),
         "question": itemgetter("question")}
        | prompt_2
        | model
        | StrOutputParser()
    )
    return final_rag_chain.invoke({'context': context, 'question': question})

res = run_main(format_docs(context), question)
# question = "What are the determinants of financial inclusion in Vietnam?"
# final_rag_chain.invoke({"question": question})




In [62]:
res

'Here\'s a breakdown of the question "What are the determinants of financial inclusion in Vietnam?" into relevant sub-topics and questions:\n\n**1. Understanding Financial Inclusion in the Vietnamese Context**\n\n* **What is the current state of financial inclusion in Vietnam?** (How widespread is the use of financial services? What are the key statistics?)\n* **What are the specific challenges and barriers to financial inclusion in Vietnam?** (Consider geographical factors, income disparities, cultural factors, etc.)\n\n**2.  Demographic Determinants**\n\n* **How does education level impact financial inclusion in Vietnam?** (Do individuals with higher education levels have better access to and usage of financial services?)\n* **What is the relationship between income and financial inclusion in Vietnam?** (Are there disparities in access and usage based on income levels?)\n* **Does gender play a role in financial inclusion in Vietnam?** (Are there differences in access, usage, and type