## Install



In [None]:
# Install the packages this notebook imports later (quiet mode): PDF readers,
# embedding/tokenizer libraries, LangChain with its Qdrant integration,
# fastembed, and the Groq client.
!pip -q install pypdf sentence-transformers
!pip -q install jq  tiktoken
!pip -q install langchain langchain_community langchain-qdrant
!pip -q install fastembed
!pip -q install groq
!pip -q install PyMuPDF

In [None]:
import os
from groq import Groq
from google.colab import userdata



# Build a Groq client using the API key stored in the Colab secret 'GROQ_API'.
client = Groq(
    api_key=userdata.get('GROQ_API'),
)

# Send one single-turn user message to check that the key and model work.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "แนะนำวิธีทำข้าวแกงกระหรี่",
        }
    ],
    model= "meta-llama/llama-4-maverick-17b-128e-instruct" ,  # alternative: "llama-3.1-70b-versatile"
)

# Print the text of the first returned choice.
print(chat_completion.choices[0].message.content)

In [None]:
# Source file on Google Drive:
# https://drive.google.com/file/d/1X80o1hlkf6bqpXW-MufBXmatAouVHlTz/view?usp=sharing

# Fetch that file by its Drive file id into the current working directory.
!gdown 1X80o1hlkf6bqpXW-MufBXmatAouVHlTz

In [None]:
import fitz
import json
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from tqdm.auto import tqdm, trange

In [None]:
def process_text(text):
    """Return ``text`` with every ' า' (space + SARA AA) replaced by 'ำ' (SARA AM).

    Presumably repairs a Thai vowel that the PDF text extraction splits into
    two characters; all other characters are left untouched.
    """
    split_vowel = ' า'
    joined_vowel = 'ำ'
    return text.replace(split_vowel, joined_vowel)

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the file and page of a JSON record into a metadata dict.

    Args:
        record: One parsed JSON record; must contain "file" and "page".
        metadata: Metadata dict, updated in place.

    Returns:
        The same ``metadata`` object, with "file" and "page" taken from
        ``record`` and "source" overwritten with '-'.

    Raises:
        KeyError: If ``record`` has no "file" or "page" key.
    """
    for field in ("file", "page"):
        metadata[field] = record[field]
    metadata['source'] = '-'
    return metadata

In [None]:
## convert pdf to json
pdf_file = '/content/สรุปผลงานสภาผู้แทนราษฎร_ชุดที่_26_ปีที่_1_ครั้งที่หนึ่ง.pdf'

# Open the PDF in a context manager so the document handle is closed as soon
# as every page has been read (previously it was left open).
with fitz.open(pdf_file) as document:
    # One record per page, in page order.
    # NOTE: 'page' is the 0-based page index, i.e. one less than the page
    # number a reader sees in the PDF.
    data_json = [
        {
            'page': page_num,
            'text': process_text(page.get_text("text")),
            # Store the path without its leading '/'.
            'file': pdf_file[1:],
        }
        for page_num, page in enumerate(document)
    ]

# Write the records under the "pdf" key; ensure_ascii=False keeps the Thai
# text readable in the file instead of \u escapes.
output_file_path = "temp_pdf.json"
with open(output_file_path, "w", encoding="utf-8") as json_file:
    json.dump({'pdf': data_json}, json_file, ensure_ascii=False, indent=4)

In [None]:
# Load every element of the top-level "pdf" array from the temporary JSON
# file as one document: the "text" field is the content, and metadata_func
# supplies the file/page/source metadata.
loader = JSONLoader(
                file_path=output_file_path,
                jq_schema='."pdf"[]',
                content_key="text",
                metadata_func=metadata_func,
                text_content=False
                )

pages = loader.load()
# Split the page documents on blank lines ("\n\n") with a tiktoken-based
# length function: chunk_size=1500 and no overlap between chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500,
                                                            chunk_overlap=0,
                                                            separator = "\n\n")
# `docs` is the chunk list that the ingestion cells below upload to Qdrant.
docs = text_splitter.split_documents(pages)

In [None]:
# Show a single chunk (index 5) as a quick check of the split output.
docs[5:6]

In [None]:
# Clean up: delete the temporary JSON file. The removal is attempted directly
# and a missing file is ignored, so there is no window between an existence
# check and the delete.
try:
    os.remove(output_file_path)
except FileNotFoundError:
    pass

https://qdrant.tech/

In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings


# Qdrant endpoint (hard-coded) and API key read from the Colab secret
# 'QDRANT_TOKEN'.
url="https://1d11517d-916b-48b0-b59f-8e9083ab4a37.us-east4-0.gcp.cloud.qdrant.io:6333"
api_key = userdata.get('QDRANT_TOKEN')

# NOTE: this rebinds `client`, which the first cell bound to the Groq client.
client = QdrantClient(url=url,api_key=api_key)

In [None]:
# List the collections currently present on the Qdrant server.
client.get_collections()

In [None]:
# Destructive: drops the collection named "test" on the server.
client.delete_collection(collection_name="test")

In [None]:
# List the collections again to see the effect of the delete above.
client.get_collections()

In [None]:
# Dense embedding model (BAAI/bge-m3) and sparse embedding model (Qdrant/bm25)
# shared by the dense, sparse and hybrid collections below.
embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-m3')
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

### Dense Vector Search

In [None]:
# Start from a clean slate: drop any previous "demo_collection" and show the
# server's reply.
display(client.delete_collection("demo_collection"))

# Single unnamed dense vector, cosine distance.
# size=1024 is presumably the bge-m3 embedding dimension — confirm.
client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

# Dense-only retrieval. A sparse embedding is also passed here even though
# the retrieval mode is DENSE.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.DENSE,

)



In [None]:
from uuid import uuid4
# One random UUID string per chunk, used as the point id.
uuids = [str(uuid4()) for _ in range(len(docs))]


batch_size = 128
# Process in batches of `batch_size` chunks; trange shows a progress bar.
for i in trange(0, len(docs), batch_size):
    batch_docs = docs[i:i + batch_size]
    batch_ids = uuids[i:i + batch_size]
    vector_store.add_documents(documents=batch_docs, ids=batch_ids)

In [None]:
# Query the dense collection: at most 3 hits, scored at 0.5 or above.
results = vector_store.similarity_search_with_score("ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์", k=3,score_threshold=0.5)

display(results)
# Print score, first 200 characters and metadata of each hit.
# The score uses '.3f' (three decimals); the previous '3f' only set a minimum
# field width and printed six decimals.
for doc, score in results:
    print(f"* [SIM={score:.3f}] \n {doc.page_content[:200]} \n [{doc.metadata}]")

### Sparse Vector Search

In [None]:
# Recreate the sparse demo collection from scratch.
client.delete_collection("demo_collection_sparse")

# The collection declares both a named dense vector and a named sparse
# vector; the store below only uses the sparse one.
client.create_collection(
    collection_name="demo_collection_sparse",
    vectors_config={
        "dense_vector": VectorParams(size=1024, distance=Distance.COSINE),
    },
    sparse_vectors_config={
        "sparse_vector": {},
    },
)
# Sparse-only retrieval: no dense embedding is passed.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection_sparse",
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.SPARSE,
    sparse_vector_name="sparse_vector",
)


from uuid import uuid4
# Fresh ids for this collection; unlike the dense cell, all chunks are sent
# in a single add_documents call.
uuids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=uuids)

print('finish')

In [None]:
# Query the sparse collection: at most 3 hits, scored at 0.5 or above.
# NOTE(review): the same 0.5 threshold is used for dense and sparse search;
# confirm it is meaningful for the sparse scores.
results = vector_store.similarity_search_with_score("ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์", k=3,score_threshold=0.5)

display(results)
# Print score, first 200 characters and metadata of each hit.
# The score uses '.3f' (three decimals); the previous '3f' only set a minimum
# field width and printed six decimals.
for doc, score in results:
    print(f"* [SIM={score:.3f}] \n {doc.page_content[:200]} \n [{doc.metadata}]")

### Hybrid Vector Search

In [None]:
# Recreate the hybrid demo collection from scratch.
client.delete_collection("demo_collection_HYBRID")

# One named dense vector (cosine, size 1024) plus one named sparse vector.
client.create_collection(
    collection_name="demo_collection_HYBRID",
    vectors_config={
        "dense_vector": VectorParams(size=1024, distance=Distance.COSINE),
    },
    sparse_vectors_config={
        "sparse_vector": {},
    },
)

# Hybrid retrieval: both embedding models are supplied, and the vector names
# match the ones declared in create_collection above.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection_HYBRID",
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    vector_name="dense_vector",
    sparse_vector_name="sparse_vector",
)


from uuid import uuid4
# One random UUID string per chunk, used as the point id.
uuids = [str(uuid4()) for _ in range(len(docs))]


batch_size = 128
# Process in batches of `batch_size` chunks; trange shows a progress bar.
for i in trange(0, len(docs), batch_size):
    batch_docs = docs[i:i + batch_size]
    batch_ids = uuids[i:i + batch_size]
    vector_store.add_documents(documents=batch_docs, ids=batch_ids)


In [None]:
# Query the hybrid collection: at most 3 hits, scored at 0.5 or above.
results = vector_store.similarity_search_with_score("ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์", k=3,score_threshold=0.5)

display(results)
# Per-hit printing is disabled here; only the raw result list is displayed.
# for doc, score in results:
#     print(f"* [SIM={score:3f}] \n {doc.page_content[:200]} \n [{doc.metadata}]")

## Inference

In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings


# Same endpoint and secret as the ingestion section, repeated so the
# inference section can start from this cell.
# NOTE: `userdata` is used here but only imported in other cells.
url="https://1d11517d-916b-48b0-b59f-8e9083ab4a37.us-east4-0.gcp.cloud.qdrant.io:6333"
api_key = userdata.get('QDRANT_TOKEN')

client = QdrantClient(url =url,api_key=api_key)

# Download model
# (dense: BAAI/bge-m3, sparse: Qdrant/bm25 — the same pair used at ingestion)
embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-m3')
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")


In [None]:
# Attach to the existing "demo_collection_HYBRID" collection in hybrid mode,
# with the same vector names that were used when it was created.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection_HYBRID",
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    vector_name="dense_vector",
    sparse_vector_name="sparse_vector",
)

In [None]:
# Retrieve up to 5 (document, score) pairs scored at 0.5 or above.
results = vector_store.similarity_search_with_score("ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์", k=5,score_threshold=0.5)
results


In [None]:
# Keep only the documents, dropping the scores.
# NOTE: `results` is rebound in place, so running this cell a second time
# would iterate over documents rather than (document, score) pairs.
results =[ result for result,_ in results]
results

In [None]:
from google.colab import userdata
import os
from groq import Groq

class GroqCall:
    """Thin wrapper around the Groq chat-completions client.

    Instance attributes (set in ``__init__``):
        client: ``Groq`` client used for every request.
        model_name: Model identifier sent with each request.
    """

    def __init__(self,
                api=None,
                model_name="meta-llama/llama-4-maverick-17b-128e-instruct",
                ):
        """Create the underlying Groq client.

        Args:
            api: Groq API key. When ``None`` (the default) the key is read
                from the Colab secret 'GROQ_API' at construction time.
            model_name: Model used by :meth:`generate`.
        """
        # Resolve the default lazily. Previously the secret was read once
        # when the class was defined, and an explicitly passed `api` was
        # ignored in favour of a second secret lookup.
        if api is None:
            api = userdata.get('GROQ_API')

        self.client = Groq(api_key=api)
        self.model_name = model_name

    def generate(self, messages, system, **kwargs):
        """Send a chat request and return the text of the first choice.

        Args:
            messages: Iterable of chat message dicts; they are sent after
                the system message, in the given order.
            system: Content of the system message placed first.
            **kwargs: Forwarded unchanged as the request's ``extra_body``.

        Returns:
            ``response.choices[0].message.content`` of the completion.
        """
        # Add the system prompt in front of the caller's messages.
        full_messages = [
            {"role": "system", "content": system},
            *messages,
        ]

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=full_messages,
            max_tokens=4096,
            temperature=0.1,
            stream=False,
            extra_body=kwargs,
        )

        return response.choices[0].message.content



In [None]:
# System prompt sent with every RAG request. Each backslash-continued line
# ends with a space before the backslash; without it the sentences were glued
# together ("context,you", "context.Do", "responses.Additionally").
system='''\
You are a highly intelligent AI assistant. You will be given a context, and based on that context, \
you need to answer questions accurately. Please ensure your answers are based only on the provided context. \
Do not include information that is not present in the context. Provide concise and clear responses. \
Additionally, include references and page numbers in your answers where applicable.

Instructions:
    •	Read the context carefully.
    •	Answer each question based solely on the information provided in the context.
    •	Ensure your answers are precise and relevant to the questions.
    •	Include references and page numbers in your answers where applicable.
    •	Correct Thai spell word.\
'''

In [None]:
def _get_meassage(Questions,Context):
    """Build the single-element user message list for one RAG request.

    The message content is the text "Context:", the given context,
    "Questions:" and the given questions, each separated by a blank line.

    Args:
        Questions: Question text, interpolated into the prompt.
        Context: Context text, interpolated into the prompt.

    Returns:
        A list holding one ``{"role": "user", "content": ...}`` dict.
    """
    sections = ("Context:", f"{Context}", "Questions:", f"{Questions}")
    content = "\n\n".join(sections)
    return [{"role": "user", "content": content}]

In [None]:
# Preview the user message built from the query and the stringified list of
# retrieved documents (nothing is sent to the LLM here).
query = 'ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์'

_get_meassage(query,str(results))

In [None]:
# LLM wrapper with its default arguments; rag_query reads this global `model`.
model = GroqCall()

In [None]:
# Ask the LLM to answer `query` using the retrieved documents as context.
response = model.generate(_get_meassage(query,str(results)),system)
print(response)

In [None]:
query = "ผลของ ร่างพระราชบัญญัติกัญชา กัญชง แห่งชาติ (หน้า10) เป็นอย่างไร สรุปเป็น bullet"
results = vector_store.similarity_search_with_score(query, k=5,score_threshold=0.5)
# Show the score of the top hit. Guarded because the score threshold can
# leave the list empty, in which case results[0] would raise IndexError.
if results:
    print(results[0][1])
# Keep only the documents and pass their string form as the LLM context.
results =[ result for result,_ in results]
response = model.generate(_get_meassage(query,str(results)),system)
print(response)

In [None]:
query = "สรุป ร่างพระราชบัญญัติทั้งหมดมีอะไรบ้าง"
results = vector_store.similarity_search_with_score(query, k=5,score_threshold=0.5)
# Show the score of the top hit. Guarded because the score threshold can
# leave the list empty, in which case results[0] would raise IndexError.
if results:
    print(results[0][1])
# Keep only the documents and pass their string form as the LLM context.
results =[ result for result,_ in results]
response = model.generate(_get_meassage(query,str(results)),system)
print(response)

In [None]:
# prompt: Create Function to using RAG
# input is query
# output is response from LLM

def rag_query(query, k=5, score_threshold=0.5):
    """
    Answer a query using RAG (Retrieval Augmented Generation).

    Searches the module-level ``vector_store``, drops the scores, and sends
    the string form of the retrieved documents together with the query to
    the module-level ``model``, using the module-level ``system`` prompt.

    Args:
        query: The user's query.
        k: Maximum number of documents to retrieve (default 5, the value
            that used to be hard-coded).
        score_threshold: Minimum score passed to the search (default 0.5,
            the value that used to be hard-coded).

    Returns:
        A ``(response, results)`` tuple: the LLM's answer text and the list
        of retrieved documents that were used as context.
    """
    # NOTE: `model` must still be the object with a `generate` method when
    # this runs; a later cell rebinds the global name `model`.
    scored_hits = vector_store.similarity_search_with_score(
        query, k=k, score_threshold=score_threshold
    )
    results = [document for document, _ in scored_hits]
    response = model.generate(_get_meassage(query, str(results)), system)
    return response, results

In [None]:
# Run the full RAG pipeline once and show the answer followed by the
# documents that were used as context.
query = "สรุป พระราชบัญญัติควบคุมเครื่องดื่มแอลกอฮอล์ เป็นอย่างไร สรุปเป็น bullet และใครเป็นผู้เสนอ"
output,doc  = rag_query(query)

print(output)

# Visual separator between the answer and the source documents.
print('===================')
print('===================')
print('===================')
display(doc)

### Rerank

In [None]:
def pretty_print_docs(docs):
    """Print the ``page_content`` of each document under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects with a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + document.page_content)
    print(divider.join(sections))

In [None]:
# Wide first-stage retriever: up to 50 candidates with a low score threshold
# (0.2); a reranker further down narrows these candidates.
retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
            search_kwargs={
                "k": 50,
                'score_threshold': 0.2
                })


In [None]:
query = "ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์"
# Store the hits under their own name. Rebinding `docs` here would overwrite
# the chunk list produced by the text splitter, so re-running an ingestion
# cell afterwards would upload these retrieval results instead of the chunks.
retrieved_docs = retriever.invoke(query)
pretty_print_docs(retrieved_docs)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Cross-encoder used for reranking.
# NOTE(review): this rebinds the global `model`, which rag_query uses as the
# GroqCall instance (it calls model.generate). Re-run `model = GroqCall()`
# before calling rag_query again, or give the reranker its own name here and
# in the cells that pass `model=model` to CrossEncoderReranker.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")


In [None]:
# Rerank the retriever's candidates with the cross-encoder and keep the top 3.
compressor = CrossEncoderReranker(model=model, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Retrieval followed by reranking for the current `query`.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

In [None]:
# Narrower variant: retrieve only 5 candidates (threshold 0.2) and keep the
# single best document after reranking.
retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
            search_kwargs={
                "k": 5,
                'score_threshold': 0.2
                })
compressor = CrossEncoderReranker(model=model, top_n=1)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)
query = "ใครเสนอควบคุมเครื่องดื่มแอลกอฮอล์"

compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)