# Building RAG with Qwen2.5

In [None]:
import json
import os
import shutil
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Step 0: Clear the Chroma database if it exists
persist_dir = "./chroma.db"
if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)

# Step 1: Folder for JSON Files
input_folder = "Vietnam-Law-rag_json"

# Step 2: Load All JSON Files and Convert to LangChain Documents
documents = []
for file_name in os.listdir(input_folder):
    if file_name.endswith(".json"):
        file_path = os.path.join(input_folder, file_name)
        base_file_name = os.path.splitext(file_name)[0]  # Remove the extension for `file_id`
        
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        
        # Convert JSON data to LangChain Document objects
        documents.extend([
            Document(
                page_content=entry["text"],
                metadata={
                    "id": entry["id"],
                    "article": entry["article"],
                    "clause": entry["clause"],
                    "title": entry["title"],
                    "file_id": base_file_name
                }
            )
            for entry in data
        ])

print(f"Loaded {len(documents)} documents from {input_folder}.")

# Step 3: Initialize HuggingFace Embeddings
embeddings_model = HuggingFaceEmbeddings()

# Step 4: Create Chroma Vector Store
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings_model,
    persist_directory=persist_dir
)

print("Chroma database created and saved at:", persist_dir)

# Test query
## RAG database builded on cloud servers, fetch them then run the below cell

The aim is to optimize the returned data after the query search before push into the LLM Models, below here use Qwen2.5 for example.

Just download the chroma.db, then symlink or put them in the current working git folder, then run the second cell.

### CELL 2

In [25]:
import torch
from langchain.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings

# Step 1: Load the Chroma Database
persist_dir = "./chroma.db"

# Initialize the embedding function
embeddings_model = HuggingFaceEmbeddings()

# Load the Chroma database with the embedding function
vectorstore = Chroma(
    persist_directory=persist_dir,
    embedding_function=embeddings_model
)

print("Chroma database loaded.")

# Step 2: Load Qwen Model
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Step 3: Set Device for GPU/CPU
device = 0 if torch.cuda.is_available() else -1

# Step 4: Create a Text-Generation Pipeline with GPU/CPU
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    device=device,
    clean_up_tokenization_spaces=True
)

# Wrap the pipeline for LangChain
hf = HuggingFacePipeline(pipeline=pipe)

print("Model and pipeline initialized.")

# Move model to CPU to release GPU memory
torch.cuda.empty_cache()
model.to("cpu")
torch.cuda.empty_cache()

Chroma database loaded.
Model and pipeline initialized.


### CELL 3

In [None]:
import re
from langchain.chains import RetrievalQA
import torch

# Move model to GPU
torch.cuda.empty_cache()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Increase retrieval limit
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

qa_chain = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    return_source_documents=True
)

# Query and retrieval
query = "Thuốc cổ truyền là gì"
result = qa_chain.invoke({"query": query})

# Print the Result
print("Answer:", result["result"])

# # Print the Source Documents
# print("Source Documents:")
# for doc in result["source_documents"]:
#     print(f"Metadata: {doc.metadata}")
#     print(f"Content: {doc.page_content}\n")

# Move model to CPU to release GPU memory
torch.cuda.empty_cache()
model.to("cpu")
torch.cuda.empty_cache()

### CELL 4

In [28]:
import torch
from langchain.chains import RetrievalQA
import re

# Constants
k_retrieval = 75

# Move model to GPU
torch.cuda.empty_cache()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Define retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": k_retrieval})

# Define a custom retrieval QA chain with preprocessing
class CustomRetrievalQA(RetrievalQA):
    def _get_relevant_documents(self, query: str):
        """
        Override the retrieval mechanism to preprocess documents.
        """
        docs = super()._get_relevant_documents(query)
        # Preprocess retrieved documents, e.g., filter noise, deduplicate, etc.
        processed_docs = [
            doc.page_content.strip() for doc in docs if len(doc.page_content.strip()) > 10
        ]
        # Optionally combine or summarize the processed docs
        combined_docs = " ".join(processed_docs)
        return combined_docs

# Create the QA chain
qa_chain = CustomRetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    return_source_documents=True
)

# Query and retrieval
query = "Trẻ em là gì"
result = qa_chain.invoke({"query": query})  # Use invoke for langchain 0.1.0+

# Print the Result
# print("Answer:", result["result"])
match = re.search(r"Helpful Answer:(.*)", result["result"], re.DOTALL)
if match:
    helpful_answer = match.group(1).strip()
else:
    helpful_answer = "No answer provided."

# Print the Source Documents
print("Source Documents:")
for doc in result["source_documents"]:
    print(f"Metadata: {doc.metadata}")
    print(f"Content: {doc.page_content}\n")

# Print the extracted helpful answer
print("Answer:", helpful_answer)

# Move model to CPU to release GPU memory
torch.cuda.empty_cache()
model.to("cpu")
torch.cuda.empty_cache()

Source Documents:
Metadata: {'article': 'Điều 10', 'clause': '1b', 'file_id': 'Luật-102-2016-QH13', 'id': 'Điều 10.1b', 'title': 'Trẻ em có hoàn cảnh đặc biệt'}
Content: Trẻ em bị bỏ rơi;

Metadata: {'article': 'Điều 7', 'clause': '3', 'file_id': 'Luật-11-2017-QH14', 'id': 'Điều 7.3', 'title': 'Người được trợ giúp pháp lý'}
Content: Trẻ em.

Metadata: {'article': 'Điều 48', 'clause': '1g', 'file_id': 'Luật-15-2023-QH15', 'id': 'Điều 48.1g', 'title': 'Hình thức tổ chức của cơ sở khám bệnh, chữa bệnh'}
Content: Trạm y tế;

Metadata: {'article': 'Điều 90', 'clause': '1c', 'file_id': 'Luật-08-2022-QH15', 'id': 'Điều 90.1c', 'title': 'Hoạt động thuê ngoài'}
Content: Quản trị rủi ro;

Metadata: {'article': 'Điều 10', 'clause': '1a', 'file_id': 'Luật-102-2016-QH13', 'id': 'Điều 10.1a', 'title': 'Trẻ em có hoàn cảnh đặc biệt'}
Content: Trẻ em mồ côi cả cha và mẹ;

Metadata: {'article': 'Điều 10', 'clause': '1m', 'file_id': 'Luật-102-2016-QH13', 'id': 'Điều 10.1m', 'title': 'Trẻ em có hoàn cảnh