# cell 1 - cài thư viện

In [8]:
# Install the notebook's third-party dependencies.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the kernel that is running this notebook, not whatever `pip`
# happens to be first on the shell PATH.
%pip install -q pymupdf pdfplumber sentence-transformers python-docx
%pip install -q scikit-learn beautifulsoup4 requests nltk
# googletrans is installed with --no-deps, presumably so its own pinned
# requirements do not replace packages that are already installed - TODO confirm.
%pip install --no-deps -q googletrans==4.0.0-rc1

In [9]:
# Report the installed version and location of the key packages.
# `%pip` queries the kernel's own environment, so the versions shown are the
# ones the following cells will actually import.
%pip show python-docx
%pip show pdfplumber
%pip show sentence-transformers

Name: python-docx
Version: 1.2.0
Summary: Create, read, and update Microsoft Word .docx files.
Home-page: https://github.com/python-openxml/python-docx
Author: 
Author-email: Steve Canny <stcanny@gmail.com>
License: MIT
Location: /usr/local/lib/python3.12/dist-packages
Requires: lxml, typing_extensions
Required-by: 
Name: pdfplumber
Version: 0.11.9
Summary: Plumb a PDF for detailed information about each char, rectangle, and line.
Home-page: https://github.com/jsvine/pdfplumber
Author: Jeremy Singer-Vine
Author-email: jsvine@gmail.com
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: pdfminer.six, Pillow, pypdfium2
Required-by: 
Name: sentence-transformers
Version: 5.2.0
Summary: Embeddings, Retrieval, and Reranking
Home-page: https://www.SBERT.net
Author: 
Author-email: Nils Reimers <info@nils-reimers.de>, Tom Aarsen <tom.aarsen@huggingface.co>
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: huggingface-hub, scikit-learn, scipy, torc

# cell 2

In [10]:
import sys
# Print the interpreter's module search path before the third-party imports run.
print(sys.path)

import nltk
import pdfplumber
import requests
from docx import Document
from sentence_transformers import SentenceTransformer, util

# Download the NLTK "punkt" and "punkt_tab" resources without progress output.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Instantiate the LaBSE sentence-embedding model and keep it in `model`.
model = SentenceTransformer("sentence-transformers/LaBSE")

print("Loaded successfully!")

['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython', '/tmp/tmpotrvf9my']
Loaded successfully!


# cell 3

In [11]:
def read_pdf(path):
    """Return the text of every page of a PDF as a single string.

    The file is opened with ``pdfplumber.open``. Pages for which
    ``extract_text()`` returns nothing (``None`` or an empty string) are
    skipped.

    Args:
        path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with newlines, with leading and trailing
        whitespace removed. An empty string if no page yields any text.

    Errors raised while opening or reading the PDF are not caught here.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # Extract once and reuse the result: running the extraction a
            # second time just to append it would double the work per page.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts).strip()

def read_docx(path):
    """Return the non-blank paragraphs of a .docx file joined by newlines.

    Args:
        path: Location of the document; passed unchanged to ``Document``.

    Returns:
        str: The text of each paragraph whose text is not empty or
        whitespace-only, in document order, separated by ``"\\n"``. The
        paragraph texts themselves are not stripped.
    """
    document = Document(path)
    kept_paragraphs = []
    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text
        # Skip paragraphs that contain nothing but whitespace.
        if paragraph_text.strip():
            kept_paragraphs.append(paragraph_text)
    return "\n".join(kept_paragraphs)

# Confirmation message: it is only printed once the definitions in this cell
# have executed without raising.
print("Functions defined OK")

Functions defined OK


# Cell 4

In [12]:
import os
import glob
from sentence_transformers import SentenceTransformer, util

# --- CONFIGURATION & MODEL LOADING ---
# Load the LaBSE sentence-embedding model. Any exception raised while loading
# is reported and `model` is set to None, so later code can test for a missing
# model instead of crashing here.
try:
    model = SentenceTransformer('sentence-transformers/LaBSE')
    print("Load model thành công.")
except Exception as e:
    print(f"Không load được model: {e}")
    model = None


def read_text_file(file_path):
    """Read a UTF-8 text file and return its content with outer whitespace removed.

    This is a best-effort reader: any ordinary error (missing file, permission
    problem, invalid UTF-8, bad path value, ...) yields an empty string
    instead of an exception.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The stripped file content, or ``""`` if the file could not be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate; the user must be
        # able to interrupt a long scan over many files.
        return ""

def find_best_match_labse(source_query, target_folder, model, top_k=3, max_chars=1000):
    """Find the ``.txt`` files in a folder whose content is most similar to a query.

    The query and each file are embedded with ``model.encode`` and compared
    with ``util.pytorch_cos_sim``. Only the first ``max_chars`` characters of
    the query and of each file are embedded.

    Args:
        source_query: Query text; converted with ``str()`` before embedding.
        target_folder: Folder searched recursively for ``*.txt`` files.
        model: Object providing ``encode(text, convert_to_tensor=True)``.
            If ``None``, an empty list is returned immediately.
        top_k: Maximum number of results to return. ``None`` returns every
            scored file; a negative value is treated as 0.
        max_chars: Number of leading characters of each text that are embedded.

    Returns:
        list[tuple[float, str]]: ``(score, path)`` pairs sorted by descending
        score. Files whose stripped content is shorter than 10 characters
        (including unreadable files, which read as ``""``) are skipped.
    """
    if model is None:
        return []

    # 1. Collect the candidate files first, so no embedding work is done when
    #    there is nothing to compare against. Sorting makes the order of
    #    equally scored files independent of the filesystem's listing order.
    pattern = os.path.join(target_folder, "**/*.txt")
    files = sorted(glob.glob(pattern, recursive=True))
    if not files:
        print(f"Thư mục '{target_folder}' trống.")
        return []

    # 2. Embed the query once; it is reused for every file.
    query_emb = model.encode(str(source_query)[:max_chars], convert_to_tensor=True)

    results = []
    print(f"DEBUG: Đang so khớp với {len(files)} file...")

    # 3. Score each file against the query.
    for path in files:
        content = read_text_file(path)
        if len(content) < 10:
            continue  # Skip files that are too short to be meaningful.

        target_emb = model.encode(content[:max_chars], convert_to_tensor=True)
        score = util.pytorch_cos_sim(query_emb, target_emb).item()
        results.append((score, path))

    # 4. Best matches first. The sort is stable, so ties keep the sorted path order.
    results.sort(key=lambda item: item[0], reverse=True)

    # A negative slice bound would drop items from the end instead of
    # returning "nothing", so clamp it.
    if top_k is not None and top_k < 0:
        top_k = 0
    return results[:top_k]

# --- UNIT TEST ---
if __name__ == "__main__":
    print("\n--- TEST MODULE: INTERNAL MATCHING ---")

    # Build a tiny sample corpus to exercise the matching logic: one history
    # document (expected to match the query) and one science document
    # (expected not to match).
    test_dir = "temp_test_data"
    sample_docs = {
        "doc_history.txt": "Năm 968, Đinh Bộ Lĩnh lên ngôi Hoàng đế, đặt tên nước là Đại Cồ Việt.",
        "doc_science.txt": "Nước (H2O) là hợp chất hóa học của oxy và hydro.",
    }
    # exist_ok=True: the folder may be left over from an earlier run.
    os.makedirs(test_dir, exist_ok=True)
    for sample_name, sample_text in sample_docs.items():
        sample_path = os.path.join(test_dir, sample_name)
        # Create only the fixtures that are missing. This repairs a folder
        # that exists but is empty or incomplete, without overwriting files
        # that are already there.
        if not os.path.exists(sample_path):
            with open(sample_path, "w", encoding="utf-8") as f:
                f.write(sample_text)

    # Run the search function on a sample query.
    query = "Đinh Tiên Hoàng dẹp loạn 12 sứ quân"
    print(f"Input Query: '{query}'")

    # The search is skipped when the model failed to load (`model` is None).
    if model:
        matches = find_best_match_labse(query, test_dir, model)
        print("\nKết quả tìm kiếm:")
        for score, path in matches:
            print(f"  - File: {os.path.basename(path)} | Score: {score:.4f}")



Load model thành công.

--- TEST MODULE: INTERNAL MATCHING ---
Input Query: 'Đinh Tiên Hoàng dẹp loạn 12 sứ quân'
DEBUG: Đang so khớp với 2 file...

Kết quả tìm kiếm:
  - File: doc_history.txt | Score: 0.3240
  - File: doc_science.txt | Score: 0.1894
