In [None]:
%pip install -U langchain-community
%pip install pypdf
%pip install chromadb
%pip install sentence-transformers
%pip install -U bitsandbytes

In [1]:
import os
import re
import logging
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import json

2025-07-21 16:11:44.424621: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-21 16:11:44.438308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753114304.456404    8589 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753114304.462003    8589 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-21 16:11:44.479684: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# The 8-bit bitsandbytes model below needs a CUDA device; fail early with a clear message.
assert torch.cuda.is_available(), "CUDA is not available. GPU is not being used!"
print("CUDA device in use:", torch.cuda.get_device_name(0))

model_id = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # `load_in_8bit=True` as a direct keyword is deprecated; the supported way
    # to request 8-bit quantization is a BitsAndBytesConfig object.
    quantization_config=transformers.BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

# Build the pipeline through the `transformers` module rather than the bare
# imported `pipeline` name: this cell rebinds `pipeline` to the resulting
# Pipeline object, so calling the bare name would fail when the cell is re-run.
# The variable keeps the name `pipeline` because later cells call it directly.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)

logging.info("Pipeline created.")


# Directory scanned for source PDFs, and on-disk location of the Chroma store.
DATA_PATH = "./data/"
CHROMA_PATH = "chroma"
# Embedding model shared by indexing (save_to_chroma) and querying (ask_question).
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")

# Prompt used by ask_question to produce the final answer. It is filled with
# str.format, so it expects exactly these placeholders: location,
# monthly_income, business_age_months, collateral, existing_loans, context.
PROMPT_TEMPLATE = """
You are a helpful loan officer. Use the information below to match the user's profile with suitable loan products from the brochure.

Speak **in Khmer**. Use short, clear sentences. Avoid technical terms and do not over-explain. No bold or italic formatting.

User Information:
- Location: {location}
- Monthly Income: ${monthly_income}
- Business Age: {business_age_months} months
- Collateral: {collateral}
- Existing Loans: {existing_loans}

Loan Brochure:
{context}

Instructions:
1. List the loan products the user qualifies for.
2. Briefly explain why they qualify.
3. If they don’t qualify, explain why.
4. Keep the answer short and easy to understand.
5. Do not answer anything you dont have the context to.
"""




def load_documents():
  """
  Read every PDF found under DATA_PATH via PyPDFDirectoryLoader.

  As a debugging aid, the 'source' and 'page' metadata of up to the first
  five loaded documents are printed; nothing is validated or modified.

  Returns:
      The list of documents produced by PyPDFDirectoryLoader.load().
  """
  from langchain.document_loaders import PyPDFDirectoryLoader

  documents = PyPDFDirectoryLoader(DATA_PATH).load()

  # Debug preview only: show where the first few documents came from.
  for position, loaded in enumerate(documents[:5], start=1):
      meta = loaded.metadata
      print(f"Doc {position}:")
      print(f"  Source: {meta.get('source')}")
      print(f"  Page: {meta.get('page')}")

  return documents


def split_text(documents: list[Document], chunk_size: int = 512, chunk_overlap: int = 100):
  """
  Split the loaded documents into smaller chunks using RecursiveCharacterTextSplitter.

  Sizes are measured in characters (length_function=len). Each chunk's
  metadata also receives a 'start_index' entry (add_start_index=True).

  Args:
      documents (list[Document]): Document objects to be split.
      chunk_size (int): Maximum chunk length in characters. Defaults to 512.
      chunk_overlap (int): Characters shared between consecutive chunks.
          Defaults to 100.

  Returns:
      List of Document objects: the chunks produced from `documents`.
  """
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
      add_start_index=True
  )

  chunks = text_splitter.split_documents(documents)

  # Debug output: overall counts plus the metadata of a few sample chunks.
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
  for i, chunk in enumerate(chunks[:3]):
      print(f"Chunk {i + 1} metadata:", chunk.metadata)

  return chunks



def save_to_chroma(chunks: list[Document]):
  """
  Save the split document chunks to the Chroma vector store at CHROMA_PATH.

  Every chunk is stored under a deterministic ID derived from its source,
  page, start index and text. Saving the same chunks again therefore
  overwrites the existing entries instead of adding duplicates, which would
  otherwise fill similarity-search results with repeated text.

  Args:
      chunks (list[Document]): Document objects to be saved. An empty list
          leaves the store untouched.

  Returns:
      None
  """
  # Local import, matching the function-scope import style already used in
  # load_documents.
  import hashlib

  if not chunks:
      print("No chunks to save; Chroma vector store left unchanged.")
      return

  # Map deterministic ID -> chunk. Using a dict also drops exact duplicates
  # within this batch, since Chroma rejects repeated IDs in a single call.
  unique_chunks = {}
  for chunk in chunks:
      meta = chunk.metadata or {}
      fingerprint = "\x1f".join([
          str(meta.get("source")),
          str(meta.get("page")),
          str(meta.get("start_index")),
          chunk.page_content,
      ])
      chunk_id = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
      unique_chunks.setdefault(chunk_id, chunk)

  ids = list(unique_chunks)
  docs = list(unique_chunks.values())

  # Load existing DB or create a new one
  if os.path.exists(CHROMA_PATH):
      db = Chroma(
          persist_directory=CHROMA_PATH,
          embedding_function=embedding_model
      )
      # NOTE(review): entries written before deterministic IDs were introduced
      # carry other IDs and are not replaced by this call.
      db.add_documents(docs, ids=ids)
      print("Added new documents to existing Chroma vector store.")
  else:
      db = Chroma.from_documents(
          docs,
          embedding_model,
          ids=ids,
          persist_directory=CHROMA_PATH
      )
      print("Created new Chroma vector store.")

  # persist() is deprecated in recent Chroma integrations (data is written
  # automatically) and may be absent in some; call it only when available.
  if hasattr(db, "persist"):
      db.persist()
  print(f"Saved {len(docs)} chunks to {CHROMA_PATH}.")

def generate_data_store():
  """
  Build the Chroma vector database from the source documents.

  Runs the three stages in order: load the documents, split them into
  chunks, and save those chunks to the vector store.

  Returns:
      None
  """
  # load -> split -> save, each stage feeding the next
  save_to_chroma(split_text(load_documents()))

def _to_whole_number(raw):
    """Convert a model-supplied amount to an int, dropping any fractional part.

    Numbers are truncated with int(). For text, the first number found is
    used: commas are treated as thousands separators and digits after a
    decimal point are discarded, so "$1,200.50" gives 1200 and "14 months"
    gives 14.

    Raises:
        ValueError: if `raw` is a bool, contains no digits, or is NaN.
        OverflowError: if `raw` is an infinite float.
    """
    if isinstance(raw, bool):
        raise ValueError("boolean is not a number")
    if isinstance(raw, (int, float)):
        return int(raw)
    found = re.search(r"(\d[\d,]*)(?:\.\d+)?", str(raw))
    if found is None:
        raise ValueError("no digits found")
    return int(found.group(1).replace(",", ""))


def extract_entities(text: str) -> dict:
    """Extract the borrower profile from free-form user text with the LLM.

    The model is asked to answer with a JSON object only. The first '{' to
    the last '}' of its reply is parsed as JSON and the known keys are copied
    into the result.

    Args:
        text: The user's message.

    Returns:
        dict with the keys location, monthly_income, business_age_months,
        collateral and existing_loans. A key is None when the pipeline call
        failed, no JSON could be parsed, or the model omitted it.
        monthly_income and business_age_months are converted to int when
        possible; if conversion fails the model's raw value is kept.
    """
    print(f"[INFO] Extracting entities from text: {text}")

    ner_prompt = f"""
    You are an information extractor. Your task is to return ONLY a valid JSON object with these keys: location, monthly_income, business_age_months, collateral, existing_loans.
    
    DO NOT include any explanations, introductions, or any text outside the JSON. ONLY output the JSON.
    
    User input:
    {text}
    
    Output:
    """

    # Result skeleton; also returned unchanged when generation fails.
    entities = {
        "location": None,
        "monthly_income": None,
        "business_age_months": None,
        "collateral": None,
        "existing_loans": None
    }
    numeric_keys = ("monthly_income", "business_age_months")

    messages = [{"role": "user", "content": ner_prompt}]

    print("[INFO] Running SEA-LION NER prompt with thinking_mode off...")
    try:
        prompt = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
            thinking_mode="off"
        )

        output = pipeline(
            prompt,
            max_new_tokens=512,
            return_full_text=False,
            truncation=True,
            do_sample=False,
        )
    except Exception as e:
        # Deliberate best-effort: a failed generation yields an all-None
        # profile rather than aborting the caller.
        print(f"[ERROR] Pipeline execution failed: {e}")
        return entities

    ner_text = output[0].get("generated_text", "").strip()
    print(f"[INFO] Raw NER output:\n{ner_text}")

    # Greedy match from the first '{' to the last '}' so any text the model
    # adds around the JSON object is ignored.
    json_match = re.search(r"\{.*\}", ner_text, re.DOTALL)
    if not json_match:
        print("[WARN] No JSON object found in NER output")
        print(f"[INFO] Extracted entities: {entities}")
        return entities

    json_str = json_match.group(0)
    print(f"[DEBUG] Matched JSON string: {json_str}")
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"[WARN] JSON decoding failed: {e}")
        print(f"[INFO] Extracted entities: {entities}")
        return entities

    for key in entities:
        if key not in parsed:
            print(f"[WARN] Missing key '{key}' in parsed output")
            continue
        val = parsed[key]
        # A JSON null simply means "unknown"; there is nothing to convert.
        if key in numeric_keys and val is not None:
            try:
                val_clean = _to_whole_number(val)
                print(f"[DEBUG] Parsed {key}: raw='{val}', cleaned={val_clean}")
                val = val_clean
            except (TypeError, ValueError, OverflowError) as e:
                print(f"[WARN] Failed to parse {key} value '{val}': {e}")
        entities[key] = val

    print(f"[INFO] Extracted entities: {entities}")
    return entities




def ask_question(query_text: str, k: int = 3):
    """Answer a loan-eligibility question using retrieved brochure text.

    Steps: extract the borrower profile from `query_text`, retrieve the `k`
    most similar chunks from the Chroma store at CHROMA_PATH, fill
    PROMPT_TEMPLATE with the profile and the retrieved text, and generate the
    answer with the text-generation pipeline.

    Args:
        query_text: The user's message.
        k: Maximum number of chunks to retrieve. Defaults to 3.

    Returns:
        tuple: (answer, context_chunks) where `answer` is the stripped
        generated text and `context_chunks` is a list of dicts with the keys
        'filename', 'page' and 'text', one per retrieved chunk.
    """
    logging.info("Starting question processing...")

    # Extract entities from user input via SEA-LION NER
    entities = extract_entities(query_text)
    logging.info(f"Extracted entities: {entities}")

    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_model
    )

    logging.info("Performing semantic search...")
    results = db.similarity_search(query_text, k=k)
    logging.info(f"Retrieved {len(results)} relevant chunks.")
    if not results:
        logging.warning("No brochure text retrieved; the prompt will contain an empty context.")

    context_chunks = []
    for doc in results:
        metadata = doc.metadata or {}
        context_chunks.append({
            "filename": os.path.basename(metadata.get("source", "unknown.pdf")),
            "page": metadata.get("page", 1),
            "text": doc.page_content.strip()
        })

    context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)

    # extract_entities always returns every key, using None for values it
    # could not determine, so a dict.get default would never be applied.
    # Map None to "Unknown" explicitly; legitimate falsy values such as 0
    # are kept as they are.
    profile_fields = (
        "location",
        "monthly_income",
        "business_age_months",
        "collateral",
        "existing_loans",
    )
    profile = {}
    for field in profile_fields:
        value = entities.get(field)
        profile[field] = "Unknown" if value is None else value

    prompt = PROMPT_TEMPLATE.format(context=context_text, **profile).strip()

    logging.info("Generating response from SEA-LION pipeline...")
    messages = [{"role": "user", "content": prompt}]

    print("[INFO] Prompt: ", prompt)
    # NOTE(review): unlike extract_entities, this call does not pass
    # thinking_mode="off", so the generated text may begin with the model's
    # reasoning before the final answer — confirm whether it should be stripped.
    output = pipeline(
        messages,
        max_new_tokens=4096,
        return_full_text=False,
        truncation=True,
    )
    logging.info("Response generation complete.")

    print("[INFO] Output: ", output)

    answer = output[0]["generated_text"].strip()

    return answer, context_chunks


CUDA device in use: Tesla T4


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
2025-07-21 16:11:48,959 [INFO] We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
2025-07-21 16:12:06,737 [INFO] Pipeline created.
  embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
2025-07-21 16:12:07,207 [INFO] Use pytorch device_name: cuda:0
2025-07-21 16:12:07,207 [INFO] Load pretrained SentenceTransformer: intfloat/multilingual-e5-base


In [3]:
# Build (or extend) the Chroma vector store from the PDFs under DATA_PATH.
generate_data_store()

Doc 1:
  Source: data/Loan Product Km.pdf
  Page: 0
Split 1 documents into 2 chunks.
Chunk 1 metadata: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-07-21T23:11:01+07:00', 'author': 'NHEM DARAYUT', 'moddate': '2025-07-21T23:11:01+07:00', 'source': 'data/Loan Product Km.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'start_index': 0}
Chunk 2 metadata: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-07-21T23:11:01+07:00', 'author': 'NHEM DARAYUT', 'moddate': '2025-07-21T23:11:01+07:00', 'source': 'data/Loan Product Km.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'start_index': 417}


  db = Chroma(
2025-07-21 16:12:09,987 [INFO] Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Added new documents to existing Chroma vector store.
Saved 2 chunks to chroma.


  db.persist()


In [None]:
# Run an example query end to end.
# The sample message is a Khmer-language borrower profile (location, business
# age, monthly income, collateral, existing loans).
example_query = "អាជីវកម្មរបស់ខ្ញុំស្ថិតនៅភ្នំពេញ ដំណើរការមកបាន ១៤ ខែ មានចំណូល ៩០០ ដុល្លារក្នុងមួយខែ។ ខ្ញុំមានម៉ូតូជាវត្ថុបញ្ចាំ និងមិនមានប្រាក់កម្ចីណាមួយស្រាប់។"

# ask_question returns the generated answer and the retrieved context chunks.
answer, context = ask_question(example_query, 3)

print("\nGenerated Answer:\n", answer)


2025-07-21 16:12:10,314 [INFO] Starting question processing...
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[INFO] Extracting entities from text: អាជីវកម្មរបស់ខ្ញុំស្ថិតនៅភ្នំពេញ ដំណើរការមកបាន ១៤ ខែ មានចំណូល ៩០០ ដុល្លារក្នុងមួយខែ។ ខ្ញុំមានម៉ូតូជាវត្ថុបញ្ចាំ និងមិនមានប្រាក់កម្ចីណាមួយស្រាប់។
[INFO] Running SEA-LION NER prompt with thinking_mode off...


2025-07-21 16:12:19,146 [INFO] Extracted entities: {'location': 'ភ្នំពេញ', 'monthly_income': 900, 'business_age_months': 14, 'collateral': 'ម៉ូតូ', 'existing_loans': 'គ្មាន'}
2025-07-21 16:12:19,150 [INFO] Performing semantic search...
2025-07-21 16:12:19,179 [INFO] Retrieved 2 relevant chunks.
2025-07-21 16:12:19,180 [INFO] Generating response from SEA-LION pipeline...


[INFO] Raw NER output:
{
  "location": "ភ្នំពេញ",
  "monthly_income": 900,
  "business_age_months": 14,
  "collateral": "ម៉ូតូ",
  "existing_loans": "គ្មាន"
}
[DEBUG] Matched JSON string: {
  "location": "ភ្នំពេញ",
  "monthly_income": 900,
  "business_age_months": 14,
  "collateral": "ម៉ូតូ",
  "existing_loans": "គ្មាន"
}
[DEBUG] Parsed monthly_income: raw='900', cleaned=900
[DEBUG] Parsed business_age_months: raw='14', cleaned=14
[INFO] Extracted entities: {'location': 'ភ្នំពេញ', 'monthly_income': 900, 'business_age_months': 14, 'collateral': 'ម៉ូតូ', 'existing_loans': 'គ្មាន'}
[INFO] Prompt:  You are a helpful loan officer. Use the information below to match the user's profile with suitable loan products from the brochure.

Speak **in Khmer**. Use short, clear sentences. Avoid technical terms and do not over-explain. No bold or italic formatting.

User Information:
- Location: ភ្នំពេញ
- Monthly Income: $900
- Business Age: 14 months
- Collateral: ម៉ូតូ
- Existing Loans: គ្មាន

Loan B

2025-07-21 16:14:10,776 [INFO] Response generation complete.


[INFO] Output:  [{'generated_text': "Okay, let's start by looking at the user's information. They are located in Phnom Penh, have a monthly income of $900, their business is 14 months old, and they have a motorcycle as collateral. They don't have any existing loans.\n\nNow, checking the loan brochure. There are three products: Equipment Backed Loan, MSME Growth Loan, and Nano Startup Loan.\n\nFirst, the Equipment Backed Loan. The requirements are unclear from the provided info, but since the user has collateral (motorcycle), they might qualify. However, the brochure doesn't specify income requirements for this product, so I can't confirm for sure.\n\nNext, the MSME Growth Loan. It requires a monthly income of at least $800, which the user has ($900). The business needs to be 12 months old, which it is (14 months). They can use motorcycle as collateral. The loan amount is $1,000-$10,000. This seems to fit. The interest rate is 1.5% monthly, but the user's income is sufficient, so they q