# Building RAG with Qwen2.5

### CELL 1

In [None]:
import json
import os
import shutil
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

# Checkpoint that provides the embedding model for this notebook
model_name = "VietAI/vit5-base"

# Load the model weights and the matching tokenizer for that checkpoint
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

class ViT5Embeddings:
    """Turn text into fixed-size vectors by mean-pooling ViT5 encoder states.

    Only ``model.encoder`` is run; the decoder is never invoked.  Every
    embedding is returned as a CPU ``torch.Tensor``.
    """

    def __init__(self, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        # The default device is chosen once, when the class body is executed.
        self.device = device
        self.tokenizer = tokenizer
        self.model = model.to(device)

    @staticmethod
    def _masked_mean(token_states, attention_mask):
        """Average ``token_states`` over the sequence axis, skipping masked positions."""
        weights = attention_mask.unsqueeze(-1).expand(token_states.size())
        weighted_total = torch.sum(token_states * weights, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        token_count = torch.clamp(weights.sum(1), min=1e-9)
        return weighted_total / token_count

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the mean-pooled encoder embedding of ``text``."""
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            encoder_output = self.model.encoder(**encoded)

        pooled = self._masked_mean(encoder_output.last_hidden_state, encoded["attention_mask"])
        # Drop the batch axis and hand the vector back on the CPU.
        return pooled.squeeze(0).detach().cpu()

    def embed_documents(self, texts: list[str]) -> list[torch.Tensor]:
        """Embed each entry of ``texts`` in turn, one model call per text."""
        vectors = []
        for item in texts:
            vectors.append(self.embed_text(item))
        return vectors

    def embed_query(self, text: str) -> torch.Tensor:
        """Embed a query string; identical to ``embed_text``."""
        return self.embed_text(text)

# Initialize ViT5 embeddings
vit5_embeddings = ViT5Embeddings(model=model, tokenizer=tokenizer)


class _FloatListEmbeddings:
    """Adapter giving an embedder the embed_documents/embed_query interface
    that LangChain vector stores call, with vectors returned as float lists."""

    def __init__(self, embedder):
        self._embedder = embedder

    @staticmethod
    def _as_list(vector):
        # Tensors expose tolist(); any other sequence is copied into a list.
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def embed_documents(self, texts):
        return [self._as_list(self._embedder.embed_text(text)) for text in texts]

    def embed_query(self, text):
        return self._as_list(self._embedder.embed_text(text))


# Step 1: Clear Chroma database if it exists
persist_dir = "./chroma.db"
if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)

# Step 2: Load JSON files and convert to LangChain Documents
input_folder = "./finetune_law_vietnam/Vietnam-Law-rag_json"
documents = []
# Sort the listing so documents are inserted in the same order on every run.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(input_folder, file_name)
    base_file_name = os.path.splitext(file_name)[0]

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Each entry is expected to carry text/id/article/clause/title keys.
    documents.extend(
        Document(
            page_content=entry["text"],
            metadata={
                "id": entry["id"],
                "article": entry["article"],
                "clause": entry["clause"],
                "title": entry["title"],
                "file_id": base_file_name,
            },
        )
        for entry in data
    )

print(f"Loaded {len(documents)} documents from {input_folder}.")

if not documents:
    raise ValueError(f"No documents found in {input_folder}; nothing to index.")

# Steps 3-4: Embed the documents and store them in the Chroma vector store.
# The embeddings are computed inside from_documents through the adapter, so
# the store receives plain float lists rather than torch tensors.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=_FloatListEmbeddings(vit5_embeddings),
    persist_directory=persist_dir,
)

# NOTE(review): newer Chroma releases persist automatically; confirm that
# persist() is still available in the installed version.
vectorstore.persist()
print("Chroma database created and saved at:", persist_dir)

# Test query
## The RAG database is built on cloud servers: fetch it, then run the cell below

The aim is to optimize the data returned by the query search before it is pushed into the LLM; Qwen2.5 is used below as an example.

Just download chroma.db, then symlink it or place it in the current working git folder, and run the second cell.

### CELL 2

In [None]:
import torch
from langchain.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModel

# VietAI Embedding Class (for VietAI/vit5-base)
class VietAIEmbeddings:
    """Mean-pooled encoder embeddings for a VietAI/vit5-base style model.

    The pooling matches ``ViT5Embeddings`` from the first cell: only
    ``model.encoder`` is run, under ``torch.no_grad()``.  Calling the full
    encoder-decoder model here would need decoder inputs and would not
    produce vectors comparable to the ones stored in the index.

    Args:
        model: Encoder-decoder model exposing an ``encoder`` sub-module.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model and inputs are moved to.  The default is
            chosen once, when the class body is executed.
    """

    def __init__(self, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device

    def embed_text(self, text: str) -> torch.Tensor:
        """Generate the mean-pooled encoder embedding for ``text``.

        Returns:
            A CPU tensor with the batch dimension squeezed out.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            outputs = self.model.encoder(**inputs)

        hidden_states = outputs.last_hidden_state
        attention_mask = inputs["attention_mask"]
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size())
        sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_pooled = sum_embeddings / sum_mask
        return mean_pooled.squeeze(0).detach().cpu()

    def embed_documents(self, texts: list[str]) -> list[torch.Tensor]:
        """Generate embeddings for a batch of documents, one text at a time."""
        return [self.embed_text(text) for text in texts]

    def embed_query(self, text: str) -> torch.Tensor:
        """Generate an embedding for a query."""
        return self.embed_text(text)

# Step 1: Load the Chroma Database
persist_dir = "./chroma.db"

# Reinitialize VietAI/vit5-base tokenizer and model
vit5_model_id = "VietAI/vit5-base"
vit5_tokenizer = AutoTokenizer.from_pretrained(vit5_model_id)
vit5_model = AutoModel.from_pretrained(vit5_model_id)
vit5_embeddings = VietAIEmbeddings(model=vit5_model, tokenizer=vit5_tokenizer)


class _QueryEmbeddingAdapter:
    """Adapter with the embed_query/embed_documents methods the vector store
    calls on its embedding object; vectors are returned as float lists."""

    def __init__(self, embedder):
        self._embedder = embedder

    def embed_query(self, text):
        vector = self._embedder.embed_query(text)
        # Tensors expose tolist(); any other sequence is copied into a list.
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def embed_documents(self, texts):
        return [self.embed_query(text) for text in texts]


# Load the Chroma database.  The store needs an object exposing embed_query,
# not the bound method itself.
vectorstore = Chroma(
    persist_directory=persist_dir,
    embedding_function=_QueryEmbeddingAdapter(vit5_embeddings),
)

print("Chroma database loaded.")

# Step 2: Load Qwen Model for Text Generation
qwen_model_id = "Qwen/Qwen2.5-0.5B"
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_id)
qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_id)

# Step 3: Set Device for GPU/CPU (pipeline convention: 0 = first GPU, -1 = CPU)
device = 0 if torch.cuda.is_available() else -1

# Step 4: Create a Text-Generation Pipeline with GPU/CPU for Qwen Model
pipe = pipeline(
    "text-generation",
    model=qwen_model,
    tokenizer=qwen_tokenizer,
    max_new_tokens=100,
    device=device,
    clean_up_tokenization_spaces=True,
)

# Wrap the pipeline for LangChain
hf = HuggingFacePipeline(pipeline=pipe)

print("Qwen Model and pipeline initialized.")

# Step 5: Move Qwen Model to CPU to release GPU memory (if used).
# Code that generates text afterwards has to move qwen_model back first.
torch.cuda.empty_cache()
qwen_model.to("cpu")
torch.cuda.empty_cache()

  vectorstore = Chroma(


Chroma database loaded.
Model and pipeline initialized.


### CELL 3

In [None]:
import re
from langchain.chains import RetrievalQA

# Move the Qwen generation model to the GPU (it was parked on the CPU after
# the pipeline was created); stays on the CPU when CUDA is unavailable.
torch.cuda.empty_cache()
qwen_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Increase retrieval limit
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})

qa_chain = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    return_source_documents=True,
)

# Query and retrieval (invoke replaces the deprecated direct call on the chain)
query = "Trẻ em là gì"
result = qa_chain.invoke({"query": query})

# Print the Result
print("Answer:", result["result"])

# Print the Source Documents
print("Source Documents:")
for doc in result["source_documents"]:
    print(f"Metadata: {doc.metadata}")
    print(f"Content: {doc.page_content}\n")

# Move the Qwen model back to the CPU to release GPU memory
torch.cuda.empty_cache()
qwen_model.to("cpu")
torch.cuda.empty_cache()

### CELL 4

In [None]:
import re
from langchain.chains import RetrievalQA
import torch

# Move the Qwen generation model to the GPU (falls back to the CPU when CUDA
# is unavailable).
torch.cuda.empty_cache()
qwen_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Increase retrieval limit
retriever = vectorstore.as_retriever(search_kwargs={"k": 250})

# Define a custom retriever function that intercepts and modifies the results
def custom_retriever(query, k=300, *, required_phrase="thời hạn"):
    """Retrieve documents for ``query`` and post-process them for the LLM.

    Uses the module-level ``retriever``.  Each hit has its content cleaned
    and is kept only when the cleaned content contains ``required_phrase``
    (compared case-insensitively).

    Args:
        query: Search string passed to the retriever.
        k: Upper bound on how many retrieved documents are considered.  The
            retriever's own ``k`` still caps what it returns.
        required_phrase: Phrase a document must contain to be kept.  Pass
            ``None`` or ``""`` to keep every retrieved document.

    Returns:
        A list of ``{"metadata": ..., "page_content": ...}`` dicts in
        retrieval order.
    """
    # Retrieve relevant documents and honour the caller's limit.
    results = retriever.get_relevant_documents(query)[:k]

    needle = required_phrase.lower() if required_phrase else None

    modified_documents = []
    for doc in results:
        # Placeholder cleaning step: adjust the regex to strip unwanted text.
        cleaned_content = re.sub(r"unwanted_pattern", "", doc.page_content)

        # Skip documents that do not mention the required phrase.
        if needle is not None and needle not in cleaned_content.lower():
            continue

        modified_documents.append(
            {
                "metadata": doc.metadata,
                "page_content": cleaned_content,
            }
        )

    return modified_documents

# Define the instruction for the model
instruction = "Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."

# Use the custom retriever to retrieve and modify documents
query = "Trẻ em là gì"
retrieved_docs = custom_retriever(query)

# Debug: Check if any documents are retrieved and modified
if not retrieved_docs:
    print("No relevant documents retrieved or filtered out.")
else:
    print(f"Retrieved {len(retrieved_docs)} documents.")

# Build the prompt: instruction, retrieved context, then the question itself.
# The instruction refers to "the question at the end", so the query has to be
# part of the prompt.
context = " ".join(doc["page_content"] for doc in retrieved_docs)
input_to_llm = f"{instruction} {context}\n\nQuestion: {query}\nAnswer:"

# Check the input format before passing to the model
print("Input to LLM:", input_to_llm[:500])  # Print the first 500 characters for debugging

# Make sure the Qwen model is on the GPU (when available) before generating.
qwen_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Send the prompt to the LLM (invoke replaces the deprecated direct call)
result_from_llm = hf.invoke(input_to_llm)
print("LLM Output:", result_from_llm)

# Move the Qwen model to the CPU to release GPU memory
torch.cuda.empty_cache()
qwen_model.to("cpu")
torch.cuda.empty_cache()