# Document Embedding & FAISS/SQLite Storage  
# 檔案向量化與資料庫建構模組

#### 1. 載入套件與設定

In [1]:
import os
import sqlite3
import textract
import re
import json
import numpy as np
import faiss
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

#### 2. 基本參數設定

In [2]:
# Folder that main() passes to load_and_split_documents() to find Word files.
DOCS_FOLDER = "/Users/gastove/Documents/SuperGIS/meeting_data"
# File that save_vectors_to_db() reads the FAISS index from and writes it to.
FAISS_INDEX_PATH = "faiss_index.bin"
# Model identifier handed to SentenceTransformer() in main().
EMBEDDING_MODEL_NAME = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"

#### 3. 清理文本

In [3]:
def clean_text(text):
    """Normalize extracted document text.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space, removes page markers of the form "Page 12" (case-insensitive) and
    "第 12 頁", and finally normalizes whitespace again so that removing a
    marker never leaves a double space or a leading/trailing space behind.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text, with no leading/trailing whitespace and no
        consecutive spaces.
    """
    # Collapse first so "Page\n12" or "Page   12" matches the literal-space
    # pattern below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'第\s*\d+\s*頁', '', text)
    # Removing a marker can leave two adjacent spaces, or a dangling space at
    # either end, so the whitespace is normalized once more.
    return re.sub(r'\s+', ' ', text).strip()

#### 4. 文件讀取與分段

In [4]:
def _read_document_text(filepath):
    """Return the raw text of one Word file (.docx or .doc)."""
    if filepath.lower().endswith(".docx"):
        doc = Document(filepath)
        # Skip paragraphs that contain only whitespace.
        return "\n".join(
            para.text for para in doc.paragraphs if para.text.strip())
    return textract.process(filepath).decode("utf-8")


def load_and_split_documents(folder_path, chunk_size=600, chunk_overlap=250):
    """Read every .docx/.doc file in a folder and split its text into chunks.

    Files are processed in sorted filename order so that the order of the
    returned chunks is reproducible from run to run (``os.listdir`` alone
    returns entries in arbitrary order). A file that cannot be read is
    reported on stdout and skipped; it does not abort the whole run.

    Args:
        folder_path: Directory to scan (not recursive).
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.

    Returns:
        A tuple ``(segments, segment_files)`` of two equally long lists:
        the text chunks, and for each chunk the name of the file it came
        from.
    """
    segments, segment_files = [], []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith((".docx", ".doc")):
            continue

        filepath = os.path.join(folder_path, filename)
        try:
            full_text = _read_document_text(filepath)
        except Exception as e:
            # Deliberately broad: one unreadable file is reported and
            # skipped so the remaining files are still processed.
            print(f"讀取 {filename} 失敗: {e}")
            continue

        full_text = clean_text(full_text)
        if full_text:
            file_segments = text_splitter.split_text(full_text)
            segments.extend(file_segments)
            segment_files.extend([filename] * len(file_segments))

    return segments, segment_files

#### 5. 儲存文件至 SQLite

In [5]:
def save_documents_to_db(cursor, documents, file_names):
    """Insert text chunks into the ``documents`` table.

    Each chunk is stored with the name of its source file and its position
    in ``documents``. The position counts across the whole list; it does not
    restart for each file. Nothing is committed here.

    Args:
        cursor: SQLite cursor used for the insert.
        documents: Text chunks to store.
        file_names: Source file name for each chunk, parallel to
            ``documents``.
    """
    insert_sql = '''
        INSERT INTO documents (file_name, chunk_idx, content)
        VALUES (?, ?, ?)
    '''
    rows = []
    for position, (content, source_file) in enumerate(zip(documents, file_names)):
        rows.append((source_file, position, content))
    cursor.executemany(insert_sql, rows)

#### 6. 向量化並更新資料庫與 FAISS 索引

In [6]:
def save_vectors_to_db(cursor, documents, file_names, vectors):
    """Store embeddings in SQLite and append them to the FAISS index file.

    Each vector is written to the ``document_vectors`` table as a JSON list,
    together with its chunk text, source file name and position in the input.
    The same vectors are then added to the FAISS index at
    ``FAISS_INDEX_PATH``; an ``IndexFlatL2`` is created when that file does
    not exist yet. Nothing is committed on the SQLite side here.

    The vectors are validated against the index before anything is written,
    so a shape or dimension problem does not leave rows inserted in SQLite
    with no matching FAISS entries.

    Args:
        cursor: SQLite cursor used for the insert.
        documents: Text chunks, parallel to ``vectors``.
        file_names: Source file name for each chunk, parallel to ``vectors``.
        vectors: Sequence of embedding vectors; each must provide
            ``tolist()`` (e.g. rows of a NumPy array).

    Raises:
        ValueError: If ``vectors`` is not two-dimensional, or its dimension
            differs from the dimension of the existing FAISS index.
    """
    # len() rather than truthiness: a NumPy array has no unambiguous truth
    # value. With no vectors there is nothing to insert or index.
    if len(vectors) == 0:
        return

    vectors_np = np.array(vectors, dtype=np.float32)
    if vectors_np.ndim != 2:
        raise ValueError("vectors 必須是二維陣列 (n, dim)")
    dim = vectors_np.shape[1]

    if os.path.exists(FAISS_INDEX_PATH):
        index = faiss.read_index(FAISS_INDEX_PATH)
        if index.d != dim:
            raise ValueError(
                f"向量維度 {dim} 與既有 FAISS 索引維度 {index.d} 不符")
    else:
        index = faiss.IndexFlatL2(dim)

    vector_data = [
        (file, idx, doc, json.dumps(vector.tolist()))
        for idx, (doc, file, vector) in enumerate(zip(documents, file_names, vectors))
    ]
    cursor.executemany('''
        INSERT INTO document_vectors (file_name, chunk_idx, content, vector)
        VALUES (?, ?, ?, ?)
    ''', vector_data)

    # Update the FAISS index only after the SQL insert has succeeded.
    index.add(vectors_np)
    faiss.write_index(index, FAISS_INDEX_PATH)
    print(f"FAISS 索引已更新，共有 {index.ntotal} 筆向量。")

In [7]:
def main():
    """Build the SQLite tables and FAISS index from the documents folder.

    Steps: create the ``documents`` and ``document_vectors`` tables if they
    do not exist, load and chunk the files under ``DOCS_FOLDER``, store the
    chunks, embed them with the model named by ``EMBEDDING_MODEL_NAME``, and
    store the vectors in SQLite and FAISS.

    The SQLite connection is always closed, including when a step raises.

    Raises:
        ValueError: If no document text could be loaded.
    """
    conn = sqlite3.connect('documents.db')
    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_name TEXT,
                chunk_idx INTEGER,
                content TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS document_vectors (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_name TEXT,
                chunk_idx INTEGER,
                content TEXT,
                vector TEXT
            )
        ''')
        conn.commit()

        documents, file_names = load_and_split_documents(DOCS_FOLDER)
        if not documents:
            raise ValueError("沒有讀取到任何文件")

        save_documents_to_db(cursor, documents, file_names)
        conn.commit()

        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        vectors = model.encode(documents, show_progress_bar=True)

        save_vectors_to_db(cursor, documents, file_names, vectors)
        conn.commit()
    finally:
        # Close on every path; closing does not commit, so work left
        # uncommitted by a failed step is not saved.
        conn.close()


if __name__ == "__main__":
    main()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS 索引已更新，共有 14 筆向量。
