@Project : faiss_text2vec.ipynb
@Author  : HildaM
@Email   : Hilda_quan@163.com
@Date    : 2023/06/20 下午 4:20
@Description : 适用于电脑配置不够高的用户构造向量数据库。请使用colab等云平台解析上传的文件，然后拷贝回本地使用。（当然，本地也可以运行）

In [None]:
!pip install tiktoken
!pip install langchain
!pip install tqdm
!pip install sentence_transformers
!pip install faiss
!pip install faiss-cpu

In [25]:
import os
import json
import zipfile
import pickle
import tiktoken
import shutil
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS

In [26]:
# ---------------------------------------------------------------------------
# Global embedding-model settings
# ---------------------------------------------------------------------------

# Model 1: "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
# https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco
DEFAULT_MODEL_NAME = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"

# Model 2: "GanymedeNil/text2vec-large-chinese"
# https://huggingface.co/GanymedeNil/text2vec-large-chinese
TEXT2VEC_LARGE_CHINESE = "GanymedeNil/text2vec-large-chinese"

# Keyword arguments shared by every model loaded below.
DEFAULT_MODEL_KWARGS = {'device': 'cpu'}
DEFAULT_ENCODE_KWARGS = {'normalize_embeddings': False}


def _load_embeddings(model_name):
    """Build a HuggingFaceEmbeddings object for ``model_name`` with the shared kwargs."""
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=DEFAULT_MODEL_KWARGS,
        encode_kwargs=DEFAULT_ENCODE_KWARGS,
    )


# Both models are instantiated eagerly when this cell runs.
default_vec_model = _load_embeddings(DEFAULT_MODEL_NAME)
text2vec_large_chinese = _load_embeddings(TEXT2VEC_LARGE_CHINESE)

# Model registry: model name -> loaded embedding object.
EMBEDDINGS_MAPPING = {
    DEFAULT_MODEL_NAME: default_vec_model,
    TEXT2VEC_LARGE_CHINESE: text2vec_large_chinese,
}

No sentence-transformers model found with name C:\Users\Four/.cache\torch\sentence_transformers\sebastian-hofstaetter_distilbert-dot-tas_b-b256-msmarco. Creating a new one with MEAN pooling.
No sentence-transformers model found with name C:\Users\Four/.cache\torch\sentence_transformers\GanymedeNil_text2vec-large-chinese. Creating a new one with MEAN pooling.


In [27]:
# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------
# Resolve the tiktoken encoding associated with the 'gpt-4' model, then fetch
# the encoding object by that encoding's name. `tokenizer` is the module-level
# encoder that tiktoken_len() uses to count tokens.
tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer = tiktoken.get_encoding(tokenizer_name.name)


def make_archive(source, destination):
    """Pack the directory ``source`` into the archive file ``destination``.

    The archive format is taken from the text after the last dot of the
    destination file name (e.g. ``zip`` or ``tar``) and must be a format name
    known to ``shutil.make_archive``. Entries inside the archive are stored
    under the source directory's own name (``<dirname>/...``).

    Args:
        source: Path of the directory to archive. May be relative or absolute
            and may carry a trailing path separator.
        destination: Path of the archive file to create, including extension.

    Returns:
        ``destination``, unchanged.

    Raises:
        ValueError: If ``destination`` has no file-name stem or no extension,
            or if the extension is not a registered archive format.
    """
    # abspath() normalizes away trailing separators and guarantees a non-empty
    # parent directory, so a bare relative name such as "vector_base" works.
    root_dir, base_dir = os.path.split(os.path.abspath(source))

    # splitext() splits on the LAST dot, so "my.data.zip" -> ("my.data", ".zip").
    stem, ext = os.path.splitext(os.path.basename(destination))
    archive_format = ext.lstrip('.')
    if not stem or not archive_format:
        raise ValueError(
            "destination must look like '<name>.<format>', got %r" % (destination,)
        )

    # Write the archive next to its final location instead of into the CWD.
    target = os.path.abspath(destination)
    archive_base = os.path.join(os.path.dirname(target), stem)
    created = shutil.make_archive(archive_base, archive_format, root_dir, base_dir)

    # make_archive picks the file extension itself; move only if it differs.
    if os.path.abspath(created) != target:
        shutil.move(created, target)
    return destination

def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is forwarded to ``tokenizer.encode``.
    NOTE(review): presumably this lets special-token text in the input be
    encoded as ordinary text instead of raising — confirm against tiktoken docs.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

def get_chunks(docs, chunk_size=500, chunk_overlap=20, length_function=tiktoken_len):
    """Split loaded documents into serialized text chunks.

    Every document is passed through a ``RecursiveCharacterTextSplitter``.
    Previously, documents whose character count did not exceed ``chunk_size``
    were skipped entirely and never reached the index; they are now kept.
    NOTE(review): this relies on ``split_text`` returning short texts as a
    single chunk — confirm against the installed langchain version.

    Args:
        docs: Iterable of document objects exposing ``page_content`` (str) and
            ``metadata`` (dict, optionally holding a ``'source'`` path).
        chunk_size: Maximum chunk length, measured by ``length_function``.
        chunk_overlap: Overlap between consecutive chunks, same unit.
        length_function: Callable mapping text to its length.

    Returns:
        A list of strings, each the ``str()`` of a dict with keys
        ``'content'``, ``'chunk'`` (index within its document) and
        ``'source'`` (base name of the originating file, ``''`` if unknown).
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   length_function=length_function,
                                                   separators=["\n\n", "\n", " ", ""])
    chunks = []
    for page in tqdm(docs):
        content = page.page_content
        # Nothing to index for empty / whitespace-only documents.
        if not content or not content.strip():
            continue
        # Tolerate documents without a 'source' entry instead of crashing.
        source_name = os.path.basename(page.metadata.get('source') or '')
        chunks.extend(
            str({'content': text, 'chunk': i, 'source': source_name})
            for i, text in enumerate(text_splitter.split_text(content))
        )
    return chunks


def create_faiss_index_from_zip(zip_file_path, embedding_model_name=None, pdf_loader=None,
                                chunk_size=500, chunk_overlap=20):
    """Build a FAISS vector store from a zip of documents and archive the result.

    Steps: extract the zip into ``./vector_base/source_data``, write
    ``db_meta.json``, load every ``.txt``/``.tex``/``.md``/``.pdf`` file, split
    the documents into chunks, embed them, pickle the (chunk, embedding) pairs
    into ``./vector_base/embeddings``, save the FAISS index into
    ``./vector_base/faiss_index`` and finally pack the whole ``vector_base``
    directory into ``vector_base.zip`` in the current working directory.

    All arguments are validated before anything is written to disk, so a bad
    argument no longer leaves a half-built ``vector_base`` directory behind.

    Args:
        zip_file_path: Path to the zip archive holding the source documents.
        embedding_model_name: Key into ``EMBEDDINGS_MAPPING``; ``None`` selects
            ``DEFAULT_MODEL_NAME``.
        pdf_loader: Loader class used for ``.pdf`` files; ``None`` selects
            ``PyPDFLoader``.
        chunk_size: Maximum chunk length handed to ``get_chunks``.
        chunk_overlap: Chunk overlap handed to ``get_chunks``.

    Raises:
        ValueError: If the embedding model name is unknown, or if no text
            chunks could be produced from the extracted documents.
        FileExistsError: If ``./vector_base`` already exists.
    """
    # --- Select the embedding model ------------------------------------------
    if embedding_model_name is None:
        embedding_model_name = DEFAULT_MODEL_NAME
    if (not isinstance(embedding_model_name, str)
            or embedding_model_name not in EMBEDDINGS_MAPPING):
        raise ValueError(
            f"Unknown embedding model {embedding_model_name!r}; "
            f"expected one of {list(EMBEDDINGS_MAPPING)}"
        )
    embeddings = EMBEDDINGS_MAPPING[embedding_model_name]

    # The signature defaults pdf_loader to None, which used to crash on
    # `pdf_loader.__name__`; fall back to the loader this file imports.
    if pdf_loader is None:
        pdf_loader = PyPDFLoader

    # --- Prepare the on-disk layout ------------------------------------------
    # structure: ./vector_base
    #               - source_data
    #               - embeddings
    #               - faiss_index
    project_path = os.path.join(os.getcwd(), "vector_base")
    if os.path.exists(project_path):
        # Raise instead of exit(-1): exiting would kill the notebook kernel.
        raise FileExistsError(f"{project_path} 已经存在，请删除后再重启")
    source_data = os.path.join(project_path, "source_data")
    embeddings_data = os.path.join(project_path, "embeddings")
    index_data = os.path.join(project_path, "faiss_index")

    # Open the zip BEFORE creating directories so a missing or corrupt archive
    # fails without leaving an empty vector_base behind.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for directory in (source_data, embeddings_data, index_data):
            os.makedirs(directory)
        # extract everything to "source_data"
        zip_ref.extractall(source_data)

    # --- Write database metadata to db_meta.json -----------------------------
    db_meta = {"pdf_loader": pdf_loader.__name__,
               "chunk_size": chunk_size,
               "chunk_overlap": chunk_overlap,
               "embedding_model": embedding_model_name,
               "files": os.listdir(source_data),
               "source_path": source_data}
    with open(os.path.join(project_path, "db_meta.json"), "w", encoding="utf-8") as f:
        json.dump(db_meta, f)

    # --- Load the documents (text formats first, then PDFs) ------------------
    all_docs = []
    for ext in (".txt", ".tex", ".md"):
        text_loader = DirectoryLoader(source_data, glob=f"**/*{ext}", loader_cls=TextLoader,
                                      loader_kwargs={'autodetect_encoding': True})
        all_docs.extend(text_loader.load())
    pdf_dir_loader = DirectoryLoader(source_data, glob="**/*.pdf", loader_cls=pdf_loader)
    all_docs.extend(pdf_dir_loader.load())

    # --- Chunk and embed -----------------------------------------------------
    chunks = get_chunks(all_docs, chunk_size, chunk_overlap)
    if not chunks:
        # Fail with a clear message rather than letting FAISS choke on no data.
        raise ValueError(
            f"No text chunks were produced from {zip_file_path!r}; "
            "check that it contains non-empty .txt/.tex/.md/.pdf files"
        )
    text_embeddings = embeddings.embed_documents(chunks)
    text_embedding_pairs = list(zip(chunks, text_embeddings))

    # --- Persist the embeddings and the FAISS index --------------------------
    embeddings_save_to = os.path.join(embeddings_data, 'text_embedding_pairs.pickle')
    with open(embeddings_save_to, 'wb') as handle:
        pickle.dump(text_embedding_pairs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    FAISS.from_embeddings(text_embedding_pairs, embeddings).save_local(index_data)

    # --- Pack everything for download ----------------------------------------
    # Pass the absolute directory path: a bare "vector_base" has an empty
    # dirname, which make_archive would forward to shutil as the root dir.
    index_name = "vector_base.zip"
    make_archive(project_path, index_name)

    print("解析完成，请下载vector_base.zip文件夹到本地使用即可！")

In [28]:
# Build the vector database from the uploaded archive.
path = "data.zip"
embedding_model_name = TEXT2VEC_LARGE_CHINESE
pdf_loader = PyPDFLoader
chunk_size = 500
chunk_overlap = 20

# Keyword arguments make the mapping onto the function's parameters explicit.
create_faiss_index_from_zip(
    zip_file_path=path,
    embedding_model_name=embedding_model_name,
    pdf_loader=pdf_loader,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

  0%|          | 0/3 [00:00<?, ?it/s]

解析完成，请下载vector_base.zip文件夹到本地使用即可！
