# download libs
pip install torch transformers accelerate bitsandbytes langchain langchain-community langchain-experimental langchain-huggingface langchain-chroma langchain-text-splitters langchain-core chromadb

In [None]:
import torch
torch.cuda.is_available()

In [None]:
from transformers import BitsAndBytesConfig # for compressing model e.g. 16bits -> 4bits

from transformers import (
                          AutoTokenizer, # Tokenize Model
                          AutoModelForCausalLM,  # LLM Loader - used for loading and using pre-trained models designed for causal language modeling tasks
                          pipeline) # pipline to setup llm-task oritented model
                                    # pipline("text-classification", model='model', device=0)

from langchain_huggingface import HuggingFaceEmbeddings # huggingface sentence_transformer embedding models
from langchain_huggingface.llms import HuggingFacePipeline # like transformer pipeline

from langchain.memory import ConversationBufferMemory # Deprecated
from langchain_community.chat_message_histories import ChatMessageHistory # Deprecated
from langchain_community.document_loaders import PyPDFLoader, TextLoader # PDF Processing
from langchain.chains import ConversationalRetrievalChain # Deprecated
from langchain_experimental.text_splitter import SemanticChunker # module for chunking text

from langchain_chroma import Chroma # AI-native vector databases (ai-native mean built for handle large-scale AI workloads efficiently)
from langchain_text_splitters import RecursiveCharacterTextSplitter # recursively divide text, then merge them together if merge_size < chunk_size
from langchain_core.runnables import RunnablePassthrough # Use for testing (make 'example' easy to execute and experiment with)
from langchain_core.output_parsers import StrOutputParser # format LLM's output text into (list, dict or any custom structure we can work with)
from langchain import hub

In [None]:
# Source PDF for the RAG pipeline.
# Alternative input: "25 Thuật Ngữ AI - Machine Learning dễ hiểu cho người mới.pdf"
FILE_PATH = "iot_security_report.pdf"

# The loader class sits behind an alias so a different loader can be swapped in here.
Loader = PyPDFLoader
loader = Loader(FILE_PATH)
documents = loader.load()  # documents produced by the loader for FILE_PATH

In [None]:
print(documents[:50])

[bkai-foundation-model 2024](https://huggingface.co/bkai-foundation-models/vietnamese-bi-encoder)

In [None]:
# Sentence-embedding model used for semantic chunking and for the vector store.
# The device falls back to the CPU when no GPU is visible, instead of failing
# on a hard-coded 'cuda'.
embeddings = HuggingFaceEmbeddings(
    model_name="bkai-foundation-models/vietnamese-bi-encoder",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True},  # ask the encoder for normalized vectors
)  # converts text to vectors; no chunking happens at this step

In [None]:
# Split the loaded documents into semantically coherent chunks.
# Observed runtime of this cell:
# + bkai-foundation-models/vietnamese-bi-encoder: 3 mins
# + keepitreal/vietnamese-sbert: 3mins
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    buffer_size=1, # neighbouring sentences grouped with each sentence before embedding (per SemanticChunker docs — confirm)
    breakpoint_threshold_type='percentile', # break where the distance between adjacent sentence groups exceeds a percentile of all distances
    breakpoint_threshold_amount=95, # the percentile used as the breakpoint (95th); this is not a similarity score
    min_chunk_size=500, # presumably a minimum chunk length in characters — confirm against the SemanticChunker docs
    add_start_index=True, # record each chunk's start offset in its metadata (per SemanticChunker docs — confirm)
)

# `documents` is the output of the PDF loader; `docs` holds the resulting chunks.
docs = semantic_splitter.split_documents(documents)
print("Number of sementic chunks:", len(docs))

In [None]:
# Embed every semantic chunk and index it in a Chroma collection.
vector_db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)

# Retriever view over the collection, with default search settings.
retriever = vector_db.as_retriever()

In [None]:
# Smoke-test the retriever with a sample question ("What is IoT?").
result = retriever.invoke("IoT là gì ?")
print("Num of relevant documents: ", len(result))

#? Images in the PDF are not embedded, so what a figure conveys cannot be retrieved
#? May retrieve duplicate documents
for i, doc in enumerate(result, 1):
    print(f"\n📄 Document {i}")
    print("-" * 60)
    # Prefer the 'page_label' metadata entry; fall back to 'page' when it is absent.
    print(f"📄 Page       : {doc.metadata.get('page_label', doc.metadata.get('page'))}")
    print(f"📝 Content    :\n{doc.page_content.strip()}")
    print("-" * 60)

In [None]:
#? Read the Hugging Face access token from token.txt.
# Surrounding whitespace is stripped: a file saved by an editor normally ends
# with a newline, and a token that carries it is not the token that was issued.
with open('token.txt', 'r', encoding='utf-8') as f:
    hg_token = f.read().strip()

In [None]:
# set up config
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
#? Initialize Model and Tokenizer
#? Other candidates noted by the author: PhoGPT-5.5B, Phi-2 (2.7B), lmsys/vicuna-7b-v1.5
MODEL_NAME = "google/gemma-2b-it"

# Place the quantized weights on the GPU at load time through `device_map`.
# Moving a bitsandbytes-quantized model afterwards with `.to("cuda")` is
# rejected by some transformers/bitsandbytes version combinations, so the
# device is chosen here instead.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=nf4_config,  # quantization settings defined in nf4_config
    low_cpu_mem_usage=True,
    device_map={"": 0},  # put the whole model on CUDA device 0
    token=hg_token,  # access token read from token.txt
)

In [None]:
# Report where the model lives, judged by its first parameter tensor.
print(
    "Model is running on CUDA."
    if next(model.parameters()).is_cuda
    else "Model is not running on CUDA."
)

In [None]:
# Tokenizer for MODEL_NAME. The same access token used for the model weights
# is passed here so both downloads authenticate the same way.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    padding_side='left',   # 'left' or 'right' depending on model style (e.g., causal LM often prefers left)
    truncation_side='left',
    token=hg_token,
)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

#? Wrap the tokenizer and model in a text-generation pipeline (for convenience)
# `model` is an already-loaded object, so its device placement is fixed;
# no `device_map` is passed here because it only applies when the pipeline
# loads a model itself.
model_pipeline = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,  # upper bound on generated tokens
    # Return only the completion. By default the pipeline prepends the prompt
    # to the generated text, which pollutes answers and breaks JSON parsing
    # of the model output further down the chain.
    return_full_text=False,
)

# LangChain wrapper so the pipeline can be composed into runnable chains.
llm = HuggingFacePipeline(
    pipeline=model_pipeline,
)

## Learn how to prompt so the LLM can generate better multiple-choice questions

Ví dụ về một câu hỏi trắc nghiệm tốt:

Câu hỏi: Tấn công side-channel là gì?

Phương án:

A. Là tấn công từ xa vào giao diện web.

B. Là kiểu tấn công dựa trên hành vi tiêu thụ năng lượng của thiết bị.

C. Là tấn công trực diện vào hạ tầng mạng

D. Là tấn công dựa vào bức xạ điện từ để lấy khóa mã hóa.

Đáp án đúng: D

In [None]:
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in input order; empty when *docs* is empty.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

parser = StrOutputParser()


In [None]:
import os

def save_result(result, file_path):
    """Write *result* to *file_path* as UTF-8 text, replacing any existing file.

    Args:
        result: Value to save. Strings are written unchanged. Objects exposing
            ``model_dump_json`` (such as a pydantic model returned by a
            structured output parser) are written as JSON. Dicts and lists are
            written as JSON without escaping non-ASCII characters. Anything
            else is written through ``str()``.
        file_path: Destination path.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    import json  # local import keeps this helper self-contained

    if isinstance(result, str):
        text = result
    elif hasattr(result, "model_dump_json"):
        text = result.model_dump_json(indent=2)
    elif isinstance(result, (dict, list)):
        text = json.dumps(result, ensure_ascii=False, indent=2)
    else:
        text = str(result)

    # Explicit UTF-8: the platform default encoding may not cover Vietnamese text.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
from langchain.prompts import PromptTemplate

In [None]:
# Prompt for a short free-text answer grounded in the retrieved context.
# The Vietnamese instructions tell the model to answer briefly, using only the
# supplied material, and to say so when no relevant data is found.
prompt = PromptTemplate.from_template("""
        Trả lời ngắn gọn, rõ ràng bằng tiếng việt và chỉ dựa trên thông tin có sẵn bên dưới.
        Nếu không tìm thấy thông tin, hãy nói rõ là không có dữ liệu liên quan.

        Nội dung tài liệu:
        {context}

        Câu hỏi:
        {question}

        Trả lời:
""") #? write {{ }} so LangChain does not treat the text inside {} as a template variable

# The retrieved documents are joined by format_docs to fill {context};
# the raw query string is passed through unchanged to fill {question}.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

# Sample query: "List the components of an IoT system?"
query = 'Liệt kê các thành phần trong hệ thống IoT ?'
result = rag_chain.invoke(query)

In [None]:
print(result)

### Customize RAG Output to JSON

In [None]:
# Prompt template asking for up to three main ideas (with page numbers) plus a
# short Vietnamese answer, returned as JSON. The key named in instruction 3 is
# written as "answer" so it matches the key used in the JSON layout below.
multi_choice_prompt = """
        Dựa vào nội dung sau, hãy:
        1. Tóm tắt tối đa 3 ý chính, kèm theo số trang nếu có.
        2. Trả lời câu hỏi bằng tiếng Việt ngắn gọn và chính xác.
        3. Nếu không có thông tin liên quan, hãy để "answer" là "Không có dữ liệu liên quan".

        Đảm bảo trả kết quả **ở dạng JSON** với cấu trúc sau:
        {{"main_ideas": [
            {{"point": "Ý chính 1", "source": "Trang ..."}},
            {{"point": "Ý chính 2", "source": "Trang ..."}},
            {{"point": "Ý chính 3", "source": "Trang ..."}}
        ],
        "answer": "Câu trả lời ngắn gọn"
        }}

        Vui lòng chỉ trả lời bằng format JSON, không giải thích thêm.

        Context:
        {context}

        Question:
        {question}

        Answer:

""" #? write {{ }} so LangChain does not treat the text inside {} as a template variable

In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

# One key point of the structured answer.
# Documented with `#` comments rather than a docstring so the model definition
# itself stays unchanged.
class MainIdea(BaseModel):
    point: str  # text of the key point
    source: str  # where the point comes from; the prompt's JSON example uses a page reference ("Trang ...")

# Structured answer with the two top-level keys shown in the prompt's JSON
# example: "main_ideas" and "answer".
class QAResponse(BaseModel):
    main_ideas: List[MainIdea]  # list of key points, each with its source
    answer: str  # the short answer text

# Parse the model's JSON reply into a QAResponse.
# NOTE: this rebinds the module-level `parser`; chains built afterwards that
# reference `parser` (for example inside run_custom_rag) therefore yield
# QAResponse objects rather than plain strings.
parser = PydanticOutputParser(pydantic_object=QAResponse)

# The template spells out the JSON layout itself and contains no
# {format_instructions} placeholder, so no partial variable is supplied for it.
prompt_template = PromptTemplate(
    template=multi_choice_prompt,
    input_variables=["context", "question"],
)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | parser
)

# query = 'Liệt kê các thành phần trong hệ thống IoT ?'
query = 'IoT là gì ?'
result = rag_chain.invoke(query)
print(result)

In [None]:
# Prompt template asking for up to three main ideas (with page numbers) plus a
# short Vietnamese answer, returned as JSON. The key named in instruction 3 is
# written as "answer" so it matches the key used in the JSON layout below.
multi_choice_prompt = """
        Dựa vào nội dung sau, hãy:
        1. Tóm tắt tối đa 3 ý chính, kèm theo số trang nếu có.
        2. Trả lời câu hỏi bằng tiếng Việt ngắn gọn và chính xác.
        3. Nếu không có thông tin liên quan, hãy để "answer" là "Không có dữ liệu liên quan".

        Đảm bảo trả kết quả **ở dạng JSON** với cấu trúc sau:
        {{"main_ideas": [
            {{"point": "Ý chính 1", "source": "Trang ..."}},
            {{"point": "Ý chính 2", "source": "Trang ..."}},
            {{"point": "Ý chính 3", "source": "Trang ..."}}
        ],
        "answer": "Câu trả lời ngắn gọn"
        }}

        Vui lòng chỉ trả lời bằng format JSON, không giải thích thêm.

        Context:
        {context}

        Question:
        {question}

        Answer:

""" #? write {{ }} so LangChain does not treat the text inside {} as a template variable

In [None]:
def run_custom_rag(user_question, file_path='output.txt'):
    """Answer *user_question* with the JSON-style RAG prompt and save the result.

    Builds a chain from the module-level ``retriever``, ``format_docs``,
    ``multi_choice_prompt``, ``llm`` and ``parser``, invokes it with the
    question, and writes a text form of the result to *file_path*.

    Args:
        user_question: The question passed to the retriever and the prompt.
        file_path: Where the result is saved; defaults to ``'output.txt'``.

    Returns:
        Whatever the chain's final parser produces (a string or a parsed object,
        depending on which parser ``parser`` is bound to when this runs).
    """
    prompt = PromptTemplate.from_template(multi_choice_prompt)
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | parser
    )

    result = rag_chain.invoke(user_question)

    # The parser may yield a structured object instead of text; convert it to
    # a string first so the file write does not fail on a non-string value.
    if isinstance(result, str):
        text = result
    elif hasattr(result, "model_dump_json"):
        text = result.model_dump_json(indent=2)
    else:
        text = str(result)
    save_result(text, file_path)

    return result

In [None]:
# Sample question: "What is IoT?" (text encoding and the missing tone mark on "gì" repaired)
question = "IoT là gì?"
result = run_custom_rag(question)

In [None]:
# Sample question: "What do the components of an IoT system include?"
query = "Các thành phần trong hệ thống IoT bao gồm những gì ?"
result = run_custom_rag(query)