# Building RAG with Qwen2.5

In [None]:
import json
import torch
import os
import shutil
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Release any cached CUDA allocations left over from earlier runs.
torch.cuda.empty_cache()

# Step 0: Start from a clean slate by deleting a previously persisted Chroma store.
persist_dir = "./chroma.db"
if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)

# Step 1: Directory that holds the JSON source files.
input_folder = "Vietnam-Law-rag_json"

# Step 2: Read every JSON file and turn each of its entries into a LangChain Document.
documents = []
for file_name in os.listdir(input_folder):
    # Anything that is not a JSON file is ignored.
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(input_folder, file_name)
    # The file name without its extension is stored as the `file_id` metadata.
    file_id = os.path.splitext(file_name)[0]

    with open(file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Build the complete batch for this file before touching `documents`,
    # so an entry with a missing key never leaves a half-added file behind.
    batch = []
    for entry in data:
        text = entry["text"]
        metadata = {
            "id": entry["id"],
            "article": entry["article"],
            "clause": entry["clause"],
            "title": entry["title"],
            "file_id": file_id,
        }
        batch.append(Document(page_content=text, metadata=metadata))
    documents.extend(batch)

print(f"Loaded {len(documents)} documents from {input_folder}.")

# Step 3: Initialize HuggingFace Embeddings
# No model name is passed, so the wrapper falls back to its built-in default.
# NOTE(review): the input folder name suggests Vietnamese legal text — confirm
# that the default embedding model is adequate for that language.
embeddings_model = HuggingFaceEmbeddings()

# Step 4: Create Chroma Vector Store
# Embeds all loaded documents and stores them under `persist_dir`
# (the directory that was wiped in Step 0).
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings_model,
    persist_directory=persist_dir
)

# Step 5: Load Qwen Model
# Tokenizer and causal-LM weights are both resolved from the same model id.
model_id = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Step 6: Set Device for GPU/CPU
# transformers pipeline convention: 0 = first CUDA device, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1

# Step 7: Create a Text-Generation Pipeline with GPU/CPU and Handle Tokenization Warning
# Each generation is capped at 100 new tokens; `clean_up_tokenization_spaces`
# is set explicitly to address the tokenization warning mentioned above.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    device=device,
    clean_up_tokenization_spaces=True
)

# Wrap the pipeline for LangChain
hf = HuggingFacePipeline(pipeline=pipe)

# Step 8: Create a Retrieval-Based QA System
# Uses the vector store's default retriever settings and also returns the
# retrieved documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# Park the model on the CPU and release cached GPU memory. No query has been
# run yet at this point; the query cell moves the model back to the GPU
# before it uses the chain.
torch.cuda.empty_cache()
model.to("cpu")  # Move model back to CPU to free up GPU memory
torch.cuda.empty_cache()

print("System is ready for retrieval-based QA!")

# Test query
## The RAG database is built on cloud servers; fetch it, then run the cell below

The aim is to optimize the data returned by the query search before it is pushed into the LLM; Qwen2.5 is used below as an example.

Just download `chroma.db`, symlink or copy it into the current working git folder, and then run the second cell.

In [None]:
import re

# Bring the model onto the GPU (when one is available) before running queries.
torch.cuda.empty_cache()
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(target_device)

# Widen the retrieval window to the top 250 hits per query.
# NOTE(review): the chain below passes every retrieved document to the LLM —
# confirm that 250 documents fit the model's context window.
retriever = vectorstore.as_retriever(search_kwargs={"k": 250})

# Rebuild the QA chain around the wider retriever, still returning the sources.
qa_chain = RetrievalQA.from_chain_type(llm=hf, retriever=retriever, return_source_documents=True)

def deduplicate_documents(documents):
    """
    Drop documents whose text has already been seen.

    Two documents count as duplicates when their `page_content` is equal after
    stripping leading/trailing whitespace. The first occurrence is kept and the
    original order of the survivors is preserved.
    """
    # A dict keeps insertion order, and `setdefault` only stores the first
    # document seen for a given text.
    first_by_text = {}
    for candidate in documents:
        first_by_text.setdefault(candidate.page_content.strip(), candidate)
    return list(first_by_text.values())

def _natural_id_key(doc_id):
    """Return a sort key for *doc_id* in which digit runs compare numerically.

    `re.split` with a capturing group yields alternating non-digit / digit
    tokens, always starting with a (possibly empty) non-digit token. Keys of
    any two ids therefore line up str-with-str and int-with-int, so comparing
    them never raises a TypeError.
    """
    tokens = re.split(r"(\d+)", doc_id)
    return [int(token) if index % 2 else token for index, token in enumerate(tokens)]


def expand_context_with_children(documents):
    """
    Merge each parent clause with its child clauses into one combined record.

    Documents are grouped by the parent prefix of `metadata["id"]`: the
    shortest leading part of the id that ends in a dot followed by digits
    (e.g. "Điều 2.33" for "Điều 2.33a"). Ids without such a prefix form their
    own group.

    Within a group the documents are ordered with a natural (digit-aware)
    sort, so "….33.2" comes before "….33.10"; a plain string sort would
    put clause 10 ahead of clause 2 and could pick the wrong document as the
    group's parent.

    Args:
        documents: Iterable of objects exposing `page_content` (str) and
            `metadata` (dict containing at least the key "id").

    Returns:
        A list with one dict per group, in order of first appearance, with the
        keys "id", "article", "clause", "title", "text" and "file_id". "text"
        is the space-joined content of the group; the other fields come from
        the first document of the sorted group. An empty input gives `[]`.

    Raises:
        KeyError: If a document's metadata has no "id" entry.
    """
    # Organize documents by parent id prefix
    grouped_docs = {}
    for doc in documents:
        doc_id = doc.metadata["id"]
        # Extract parent ID using regex to match parent prefix (e.g., "Điều 2.33")
        match = re.match(r"(.*?\.\d+)", doc_id)
        parent_id = match.group(1) if match else doc_id
        grouped_docs.setdefault(parent_id, []).append(doc)

    # Create expanded documents
    expanded_documents = []
    for docs in grouped_docs.values():
        # Natural ordering keeps the parent first and its children in numeric order.
        sorted_docs = sorted(docs, key=lambda d: _natural_id_key(d.metadata["id"]))
        # Combine content from parent and children
        combined_text = " ".join(d.page_content for d in sorted_docs)
        # Use the first document's metadata for the combined document
        parent_doc = sorted_docs[0]
        expanded_documents.append({
            "id": parent_doc.metadata["id"],
            "article": parent_doc.metadata.get("article"),
            "clause": parent_doc.metadata.get("clause"),
            "title": parent_doc.metadata.get("title"),
            "text": combined_text,
            "file_id": parent_doc.metadata.get("file_id")
        })

    return expanded_documents

# Query and retrieval
query = "An toàn lao động là gì"

try:
    # `Chain.__call__` is deprecated in LangChain; `invoke` is the supported
    # entry point and returns the same dict ("result", "source_documents").
    result = qa_chain.invoke({"query": query})

    # NOTE(review): the two post-processing steps below run AFTER the chain has
    # already produced its answer, and their output is not used afterwards, so
    # they do not influence `result["result"]` or what is printed.

    # Step 1: Deduplicate retrieved documents
    unique_docs = deduplicate_documents(result["source_documents"])

    # Step 2: Expand context for parent clauses
    expanded_docs = expand_context_with_children(unique_docs)

    # Print the Result
    print("Answer:", result["result"])

    # Print the Source Documents (raw retrieval output, before deduplication)
    print("Source Documents:")
    for doc in result["source_documents"]:
        print(f"Metadata: {doc.metadata}")
        print(f"Content: {doc.page_content}\n")
finally:
    # Move model to CPU to release GPU memory. Doing this in `finally` frees
    # the GPU even when the query above fails (e.g. out of memory).
    torch.cuda.empty_cache()
    model.to("cpu")
    torch.cuda.empty_cache()