# 獲取句向量（需GPU，可租用GPU雲服務器）

In [None]:
from sentence_transformers import SentenceTransformer as SBert
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import os
import pickle
import datasets

# Run on the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = SBert('./siku-bert')
model = model.to(device)

# Dataset layout expected inside `folder_path`:
#   * one CSV file per source; the file name is the source (e.g. a book title);
#   * one column of each CSV holds the text fragments (usually sentences),
#     one fragment per row.
folder_path = "./output"
csv_column = '片段'  # name of the column that holds the sentences

# Gather every CSV in `folder_path` into a single DataFrame. Each row is tagged
# with `file_index`, the number of the file it came from, and `file_map` maps
# that number back to the file's base name (the source title).
file_map = {}
frames = []

# Sort the listing: os.listdir returns entries in arbitrary order, and a stable
# order keeps the index -> file assignment reproducible between runs.
csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

for index, file in enumerate(csv_names):
    file_path = os.path.join(folder_path, file)
    file_map[index] = os.path.splitext(file)[0]

    df = pd.read_csv(file_path)
    df['file_index'] = index
    frames.append(df)

# Concatenate once at the end; calling pd.concat inside the loop re-copies the
# whole accumulated frame for every file. pd.concat rejects an empty list, so
# fall back to an empty frame when the folder holds no CSV files.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Wrap the DataFrame in a datasets.Dataset so the DataLoader can index it.
dataset = datasets.Dataset.from_pandas(all_data)

# Number of rows handed to the model per step.
batch_size = 10000

# 160 workers was the original setting; never request more worker processes
# than this machine has CPUs, which would only oversubscribe it.
num_workers = min(160, os.cpu_count() or 1)


def collate_fn(batch):
    """Merge a list of row mappings into one dict of parallel lists.

    Args:
        batch: Sequence of rows; each row is indexed by ``csv_column`` and by
            ``'file_index'``.

    Returns:
        A dict with two equally long lists: the texts under ``csv_column`` and
        the originating file numbers under ``'file_index'``.
    """
    return {
        csv_column: [item[csv_column] for item in batch],
        'file_index': [item['file_index'] for item in batch],
    }


dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,  # keep row order so embeddings line up with the CSV rows
    collate_fn=collate_fn,
)

# Encode every batch and remember which file each embedding belongs to.
segments_embeddings = []
file_indices = []
with torch.no_grad():
    # tqdm reads len(dataloader) itself, so the bar advances once per batch.
    for batch in tqdm(dataloader, desc='Processing'):
        encoded_segments = model.encode(batch[csv_column], convert_to_tensor=True, device=device)
        # Move each finished batch to host memory right away; keeping the whole
        # corpus on the GPU until the final concatenation can exhaust it.
        segments_embeddings.append(encoded_segments.cpu())
        file_indices.extend(batch['file_index'])

# Stack the per-batch tensors into one tensor along the row dimension.
segments_embeddings = torch.cat(segments_embeddings, dim=0)

# Write one pickle of embeddings per source file, next to the CSV files.
# Both the index tensor and the device transfer are loop-invariant, so build
# them once instead of once per file.
file_index_tensor = torch.tensor(file_indices)
embeddings_cpu = segments_embeddings.cpu()

for idx, fname in file_map.items():
    output_path = os.path.join(folder_path, fname + '_segments_np.pkl')
    # Boolean mask keeps only the rows that came from file number `idx`.
    segments_np = embeddings_cpu[file_index_tensor == idx].numpy()

    # Persist the NumPy array as a pickle file.
    with open(output_path, 'wb') as f:
        pickle.dump(segments_np, f)

# 在無GPU的本地機器上進行下一步處理

##  數據庫連接

需提前創建好以“yuliaoku”命名的數據庫，並創建Book表與Segment表（實際表名須與下方代碼一致，即 `book_{part}` 與 `segment_{part}`，例如 `book_fangyan`、`segment_fangyan`）。Book表有id、title字段，Segment表有id、text、vector（二進制類型）、book_id字段，其中book_id是外鍵，指向Book表的id。

In [1]:
from sqlalchemy import create_engine, Column, Integer, String, LargeBinary, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base

from urllib.parse import quote_plus

# Connection settings for the MySQL server.
mysql_user = 'root'
mysql_password = ''  # database password
mysql_host = 'localhost'
mysql_port = '3306'
mysql_database = 'yuliaoku'

# URL-escape the credentials: characters such as '@', ':' or '/' in a user name
# or password would otherwise be parsed as URL structure.
DATABASE_URL = (
    f"mysql+pymysql://{quote_plus(mysql_user)}:{quote_plus(mysql_password)}"
    f"@{mysql_host}:{mysql_port}/{mysql_database}"
)
engine = create_engine(DATABASE_URL)
Session = sessionmaker(bind=engine)
# One module-level session shared by the cells below; close it when done.
session = Session()

# Declarative base that the model classes below derive from.
Base = declarative_base()

def create_book_class(table_name):
    """Build the declarative ``Book`` model bound to the table ``table_name``.

    Args:
        table_name: Name of the database table the returned class maps to.

    Returns:
        A new class named ``Book`` deriving from the module-level ``Base``,
        with an integer primary key ``id`` and a non-null ``title`` of up to
        80 characters.
    """
    class Book(Base):
        __tablename__ = table_name

        id = Column(Integer, primary_key=True)
        title = Column(String(80), nullable=False)

    return Book

def create_segment_class(book_table_name, segment_table_name, book_class=None):
    """Build the declarative ``Segment`` model bound to ``segment_table_name``.

    Args:
        book_table_name: Table whose ``id`` column ``book_id`` references.
        segment_table_name: Name of the table the returned class maps to.
        book_class: Optional mapped book class to use as the relationship
            target. When omitted, the target is looked up by the class name
            ``'Book'``, as before. Pass the class explicitly when more than
            one ``Book`` class has been created on the same ``Base``, because
            a lookup by name is then ambiguous.

    Returns:
        A new class named ``Segment`` deriving from the module-level ``Base``
        with columns ``id``, ``text``, ``vector``, ``book_id`` and a ``book``
        relationship.
    """
    relationship_target = 'Book' if book_class is None else book_class

    class Segment(Base):
        __tablename__ = segment_table_name

        id = Column(Integer, primary_key=True)
        text = Column(String(2000), nullable=True)
        # Binary payload; the cells that fill this table store a pickled vector.
        vector = Column(LargeBinary, nullable=True)
        book_id = Column(Integer, ForeignKey(f'{book_table_name}.id'))

        book = relationship(relationship_target)

    return Segment

# Corpus suffix; both table names are derived from it
# (book_<part> and segment_<part>).
part = 'fangyan'
book_table_name, segment_table_name = f'book_{part}', f'segment_{part}'

# Model classes bound to those two tables.
Book = create_book_class(book_table_name)
Segment = create_segment_class(book_table_name, segment_table_name)

  Base = declarative_base()


## 將數據保存到MySQL中

In [None]:
import os
import pandas as pd
import numpy as np
import pickle

from sqlalchemy import text

# Folder holding the CSV files and their matching *_segments_np.pkl files.
data_folder = r'F:\data\output_fangyan'
csv_files = [file for file in os.listdir(data_folder) if file.endswith('.csv')]

# Wrap the SQL in text(): SQLAlchemy 2.0 refuses plain strings in execute().
# Table names cannot be bound parameters, so they are formatted in; they come
# from this notebook's own configuration, not from external input.
select_book = text(f"SELECT id FROM {book_table_name} WHERE title = :book_name")
insert_book = text(f"INSERT INTO {book_table_name} (title) VALUES (:book_name)")
insert_segment = text(
    f"INSERT INTO {segment_table_name} (text, vector, book_id) VALUES (:segment, :vector, :book_id)"
)

# One book per CSV file: register the book, then store its segments.
for file in csv_files:
    csv_path = os.path.join(data_folder, file)
    # splitext drops only the trailing extension; str.replace would also strip
    # a '.csv' that happens to appear in the middle of a title.
    book_name = os.path.splitext(file)[0]
    df = pd.read_csv(csv_path)

    np_path = os.path.join(data_folder, book_name + '_segments_np.pkl')
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # the embedding step of this notebook.
    with open(np_path, 'rb') as f:
        segments_np = pickle.load(f)

    # Rows and vectors are paired by position, so refuse to insert anything if
    # the two files do not describe the same number of segments.
    if len(segments_np) != len(df):
        raise ValueError(
            f"{book_name}: {len(df)} CSV rows but {len(segments_np)} embedding vectors"
        )

    # Look the book up by title, inserting it first when it is not there yet.
    book_row = session.execute(select_book, {'book_name': book_name}).fetchone()
    if book_row is None:
        session.execute(insert_book, {'book_name': book_name})
        session.commit()
        book_row = session.execute(select_book, {'book_name': book_name}).fetchone()
    book_id = book_row[0]  # fetchone() returns a row tuple; id is its only field

    # Serialize every vector to bytes and send the whole book as one
    # executemany call instead of one round trip per row.
    rows = [
        {'segment': segment, 'vector': pickle.dumps(embedding), 'book_id': book_id}
        for segment, embedding in zip(df['片段'], segments_np)
    ]
    if rows:
        session.execute(insert_segment, rows)
    session.commit()
     

## 構建索引

In [None]:
import faiss
import numpy as np
import pickle
import gc
from tqdm import tqdm

def build_faiss_index(index_file, ids_file, batch_size=10000):
    """Build a flat inner-product faiss index over every stored segment vector.

    Reads rows through the module-level ``session`` and ``Segment`` model in
    batches, adds their unpickled vectors to the index in ascending ``id``
    order, and records the ids in the same order so that position ``i`` in the
    index corresponds to ``all_ids[i]``.

    Args:
        index_file: Path the faiss index is written to.
        ids_file: Path the pickled list of segment ids is written to.
        batch_size: Maximum number of rows loaded from the database at once.

    Raises:
        ValueError: If the segment table contains no rows.
    """
    all_ids = []

    # Use one stored vector to learn the embedding dimension.
    example_segment = session.query(Segment).first()
    if example_segment is None:
        raise ValueError("Segment table is empty; there is nothing to index")
    # NOTE: pickle.loads can execute arbitrary code; the vectors must come from
    # a trusted database.
    example_vector = pickle.loads(example_segment.vector)
    dimension = example_vector.shape[0]

    # Flat (exhaustive) index that scores by inner product.
    # NOTE(review): inner product equals cosine similarity only for unit-length
    # vectors, and nothing in this function normalizes them; confirm intended.
    index = faiss.IndexFlatIP(dimension)

    count = session.query(Segment).count()
    print("開始處理：")
    last_id = None
    with tqdm(total=count, desc="Indexing segments") as pbar:
        while True:
            # Keyset pagination: always order by id and continue after the last
            # id seen. OFFSET/LIMIT without ORDER BY gives no guarantee that
            # consecutive pages neither overlap nor skip rows.
            query = session.query(Segment)
            if last_id is not None:
                query = query.filter(Segment.id > last_id)
            segments = query.order_by(Segment.id).limit(batch_size).all()
            if not segments:
                break

            vectors = np.stack([pickle.loads(segment.vector) for segment in segments]).astype(np.float32)
            ids = [segment.id for segment in segments]

            index.add(vectors)
            all_ids.extend(ids)
            last_id = ids[-1]
            pbar.update(len(ids))

            # Drop the batch and collect garbage to keep peak memory down.
            del segments, vectors, ids
            gc.collect()

    # Persist the index and the id list.
    print("Saving index and ids to disk")
    faiss.write_index(index, index_file)
    with open(ids_file, 'wb') as f:
        pickle.dump(all_ids, f)

# Build the index for the current corpus. The file names are derived from
# `part` so they always match the paths the query cell loads.
index_file = rf'F:\data\faiss_index_{part}.bin'
ids_file = rf'F:\data\ids_{part}.pkl'
build_faiss_index(index_file, ids_file)

In [10]:
# Close the database session
session.close()

## 語義相似度查詢

In [None]:
# 查詢前要連接數據庫，不用之後要關閉數據庫會話，避免佔據電腦資源

In [2]:
import torch
from sentence_transformers import SentenceTransformer as SBert
import faiss
import pickle
import numpy as np

# Sentence encoder, kept on the CPU for querying.
model_path = './siku-bert'
device = torch.device("cpu")
model = SBert(model_path)
model = model.to(device)

# Locations of the artifacts written by the index-building step.
index_file = f'F:/data/faiss_index_{part}.bin'
ids_file = f'F:/data/ids_{part}.pkl'

# faiss index holding the segment vectors.
index = faiss.read_index(index_file)

# Database ids, listed in the same order as the vectors in the index.
with open(ids_file, 'rb') as f:
    ids = pickle.load(f)

No sentence-transformers model found with name ./siku-bert. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at ./siku-bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def search_query(model, index, session, Segment, Book, query_text: str, k: int = 50) -> None:
    """Print the stored segments most similar to ``query_text``.

    The query is encoded with ``model``, the ``k`` nearest vectors are looked
    up in ``index``, their positions are translated to database ids through the
    module-level ``ids`` list, and the matching rows are printed in ranking
    order together with their book title.

    Args:
        model: Encoder exposing ``encode(list_of_texts)``.
        index: faiss index exposing ``search(vectors, k)``.
        session: Database session used to fetch the matching rows.
        Segment: Mapped segment class (provides ``id`` and ``text``).
        Book: Mapped book class (provides ``title``).
        query_text: Text to search for.
        k: Number of neighbours requested from the index.
    """
    # faiss expects a 2-D float32 array with one row per query.
    query_vector = np.asarray(model.encode([query_text]), dtype=np.float32).reshape(1, -1)
    _, neighbours = index.search(query_vector, k)

    # faiss pads the result with -1 when the index holds fewer than k vectors;
    # ids[-1] would silently return the last id, so skip those entries.
    matched_ids = [ids[i] for i in neighbours[0] if i >= 0]
    if not matched_ids:
        return

    # Fetch every matching segment together with its book in one query.
    query = session.query(Segment, Book).join(Book).filter(Segment.id.in_(matched_ids))
    id_to_results = {sentence.id: (sentence, book) for sentence, book in query.all()}

    # The database returns rows in its own order; walk matched_ids instead so
    # the output follows the similarity ranking.
    for segment_id in matched_ids:
        if segment_id in id_to_results:
            sentence, book = id_to_results[segment_id]
            print(f"{sentence.text}（《{book.title}》）")
            print("-" * 50)


In [4]:
# Example query: print the 500 closest segments for the given description.
query_text = '山脊、山崗、山的最高處、山的連接處'
k = 500
search_query(model, index, session, Segment, Book, query_text=query_text, k=k)

山尖曰山頂、曰峯，山之宛下處曰山坳。（《廣東省、廣西壯族自治區、海南省》）
--------------------------------------------------
山之高者曰巒，尖起者曰峯。（《山東省、河南省、湖北省、湖南省》）
--------------------------------------------------
山崙曰峯。山道曰嶺。（《浙江省（下）、安徽省、福建省、江西省》）
--------------------------------------------------
岡阜之高起如脊背者曰嶺，低伏有頂面者曰窪。（《山東省、河南省、湖北省、湖南省》）
--------------------------------------------------
定位頂尖上，謂山之巔、房屋樹木之高處也。（《北京市、天津市、河北省》）
--------------------------------------------------
山之高處曰坊塔。（《雲南省、西藏自治區、陝西省、甘肅省、青海省、寧夏回族自治區、新疆維吾爾自治區、臺灣省》）
--------------------------------------------------
山高處曰山頂。（《重慶市、貴州省、四川省》）
--------------------------------------------------
山高處曰山頂。（《重慶市、貴州省、四川省》）
--------------------------------------------------
在山靠山，在水靠水。（《浙江省（中）》）
--------------------------------------------------
突出處曰山包包。（《重慶市、貴州省、四川省》）
--------------------------------------------------
坟塔，小山相連峯起之名。（《雲南省、西藏自治區、陝西省、甘肅省、青海省、寧夏回族自治區、新疆維吾爾自治區、臺灣省》）
--------------------------------------------------
坡，肯靠。路，卡落。（《雲南省、西藏自治區、陝西省、甘肅省、青海省、寧夏回族自治區、新疆維吾爾自治區、臺灣

In [68]:
# Close the database session
session.close()