In [1]:
from dotenv import load_dotenv
load_dotenv()  
import os

In [2]:
from sqlalchemy import create_engine, Column, String, LargeBinary, DateTime, Integer, UniqueConstraint
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime, timezone
Base = declarative_base()

In [3]:
# gọi hàm tạo bảng thông qua base

In [4]:
class DocumentVector(Base):
    __tablename__ = 'document_vectors'
    # Surrogate Pk
    doc_id = Column(Integer, primary_key = True, autoincrement = True)
    file_name = Column(String, nullable = False)
    checksum = Column(String, nullable = False, unique = True)
    vector = Column(LargeBinary, nullable = False)
    Updated_at = Column(DateTime, default = lambda: datetime.now(timezone.utc), onupdate = lambda: datetime.now(timezone.utc), nullable = False)
    # Chỉ cần thêm dấu , để biến nó thành tuple 1 phần tử
    __table_args__ = (UniqueConstraint("checksum",name ="uq_document_checksum"),)

In [5]:
DB_URL = os.environ["DB_URL"]
engine = create_engine(DB_URL, echo = False, connect_args = {"sslmode": "require"})
SessionLocal = sessionmaker(bind = engine)
Base.metadata.create_all(engine)

In [6]:
# insert thử
# import numpy as np 
# dummy_vector = np.random.rand(128).astype("float32").tobytes()

# new_doc = DocumentVector(
#     file_name = "test.pdf",
#     checksum = "abc123",
#     vector = dummy_vector
# )
# session = SessionLocal()
# session.add(new_doc)
# session.commit()
# print("insert thanh cong")

In [7]:
# print thử
# docs = session.query(DocumentVector).all()
# for d in docs:
#     print(d.doc_id, d.file_name, d.checksum, len(d.vector))

In [8]:
# upd checksum thử
# doc = session.query(DocumentVector).filter_by(file_name = "test.pdf").first()
# if doc:
#     doc.checksum = "asd233"
#     session.commit()
# print("upd thanh cong")

In [9]:
# delete thử
# doc = session.query(DocumentVector).filter_by(file_name="test.pdf").first()
# if doc:
#     session.delete(doc)
#     session.commit()
#     print("✅ Delete thành công!")

In [10]:
# Thay vì lưu nguyên text cả file, cần chia nhỏ thành các chunks để embedding và retrieval chính xác hơn.
# chunk_overlap: số ký tự chồng lấn giữa hai chunk để giữ ngữ cảnh.

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_text_into_chunks(text: str, chunk_size: int = 1000, chunk_overlap: int=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    return splitter.split_text(text)

In [12]:
# "sentence-transformers/all-MiniLM-L6-v2" → 384 chiều, nhanh, gọn.

# "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" → tối ưu cho semantic search, cũng 384 chiều.

# "BAAI/bge-small-en-v1.5" → mạnh hơn một chút, 384 chiều.

# Nếu cần hỗ trợ đa ngôn ngữ (tiếng Việt/Anh): "distiluse-base-multilingual-cased-v2" (512 chiều).

In [13]:
# pip install -U FlagEmbedding
# Visualized-BGE Mở rộng BGE để xử lý multi-modal: text, image hoặc kết hợp cả hai
# BGE-Large-en-v1.5
# BGE-M3

In [14]:
from FlagEmbedding.inference.embedder.encoder_only.m3 import M3Embedder
embedder = M3Embedder(
    model_name_or_path="BAAI/bge-m3",  # bắt buộc
    normalize_embeddings=True,         # normalize về unit-norm
    use_fp16=True                     # False nếu CPU, True nếu GPU
)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [15]:
def get_embedding_local(text: str) -> list[float]:
    """
    Trả về vector embedding đã được chuẩn hóa (cosine-ready) 
    dưới dạng Python list float.
    """
    emb = embedder.embed(text)
    # emb đã là list float, normalized nếu bạn bật normalize=True
    return emb 


In [16]:
import hashlib
from flagembedding.inference.auto_embedder import FlagAutoModel
import pdfplumber

def read_pdf_utf8(path: str) -> str:
    text = ""
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    # đảm bảo UTF-8, loại bỏ ký tự lỗi
    return text.encode("utf-8", errors="ignore").decode("utf-8")

def upsert_document_vector(path: str):
    session = SessionLocal()
    try:
        text = read_pdf_utf8(path)
        checksum = hashlib.sha256(text.encode("utf-8")).hexdigest()
        vector = embedder.embed(text)
        doc = DocumentVector(file_name = path, checksum = checksum, vector = vector)
        session.merge(doc)
        session.commit()
        print("done", path)
    finally: 
        session.close()
    
    
    if __name__ == "__main__":
    upsert_document_vector(r"D:\Project_self\pdf_place\CleanCode.pdf")

IndentationError: expected an indented block after 'if' statement on line 27 (3296337316.py, line 28)