In [1]:
# 首先导入所需第三方库
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from tqdm import tqdm
import os

# 遍历目录获取数据

In [2]:
# Collect the paths of every supported document below a folder
def get_files(dir_path: str, suffixes: tuple = (".txt", ".md", ".docx", ".doc", ".pdf")) -> list:
    """Recursively collect the paths of all files whose name ends with a wanted suffix.

    Args:
        dir_path: Root folder to scan; sub-folders are visited recursively via ``os.walk``.
        suffixes: File-name endings to keep. Matching is case-sensitive. May be a single
            string or any iterable of strings. Defaults to the document types handled by
            ``get_text``.

    Returns:
        A list of paths (walked folder joined with the file name) in ``os.walk`` order.
        The list is empty when nothing matches.
    """
    # str.endswith only accepts a str or a tuple, so normalise whatever the caller passed.
    wanted = (suffixes,) if isinstance(suffixes, str) else tuple(suffixes)
    return [
        os.path.join(filepath, filename)
        for filepath, _dirnames, filenames in os.walk(dir_path)
        for filename in filenames
        if filename.endswith(wanted)
    ]

In [3]:
def get_text(file_lst: list) -> list:
    """Load every file in ``file_lst`` and return the loaded documents as one flat list.

    The loader is picked from the file-name suffix (compared case-insensitively):
    ``.txt`` -> UnstructuredFileLoader, ``.md`` -> UnstructuredMarkdownLoader,
    ``.docx`` / ``.doc`` -> UnstructuredWordDocumentLoader, ``.pdf`` -> PyPDFLoader.

    Args:
        file_lst: Paths of the files to load.

    Returns:
        Everything returned by each loader's ``load()``, concatenated in input order.
        Files whose suffix matches no loader are reported and skipped.
    """
    # Nothing to load: return early instead of drawing an empty progress bar.
    if not file_lst:
        return []

    # (suffixes, loader class) pairs, checked in order.
    #   txt, md, docx, doc: pip install unstructured
    #   docx, doc:          pip install python-docx
    #   pdf:                pip install pypdf
    loader_table = (
        ((".txt",), UnstructuredFileLoader),
        ((".md",), UnstructuredMarkdownLoader),
        ((".docx", ".doc"), UnstructuredWordDocumentLoader),
        ((".pdf",), PyPDFLoader),
    )

    # docs holds the documents produced by all loaders
    docs = []
    for one_file in tqdm(file_lst):
        lowered = one_file.lower()
        loader_cls = next(
            (cls for suffixes, cls in loader_table if lowered.endswith(suffixes)),
            None,
        )
        if loader_cls is None:
            # Without this guard an unmatched file either hit an unbound `loader`
            # (first iteration) or silently re-loaded the previous file a second time.
            tqdm.write(f"Skipping unsupported file type: {one_file}")
            continue
        docs.extend(loader_cls(one_file).load())
    return docs

In [4]:
# Root folder that is scanned for source documents
tar_dirs = "./data"

In [11]:
# Collect the paths of all supported files below tar_dirs
file_lst = get_files(tar_dirs)
# Display how many files were found
len(file_lst)

184

In [12]:
# Load every collected file into document objects
docs = get_text(file_lst)
# Preview the first five loaded documents
docs[:5]

  1%|          | 1/184 [00:00<00:31,  5.88it/s]Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 57 0 (offset 0)
 27%|██▋       | 50/184 [00:08<00:19,  6.94it/s]XRef object at 1067 can not be read, some object may be missing
XRef object at 969 can not be read, some object may be missing
 28%|██▊       | 52/184 [00:08<00:14,  8.88it/s]XRef object at 969 can not be read, some object may be missing
 30%|███       | 56/184 [00:08<00:14,  8.61it/s]XRef object at 1201 can not be read, some object may be missing
 31%|███       | 57/184 [00:09<00:29,  4.36it/s]XRef object at 1104 can not be read, some object may be missing
 32%|███▏      | 58/184 [00:09<00:26,  4.76it/s]XRef object at 1041 can not be read, some object may be missing
 32%|███▏      | 59/184 [00:09<00:24,  5.07it/s]XRef object at 1009 can not be read, some object

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to\nuse eye drops to lower the pressure below 20mm of mercury as they refused to take vita-\nmin C.\nConclusion\nIn this series of 30 patients there was no\noccasion in which the pressure was not low-ered w

# 文本分块

In [13]:
# Create the splitter that cuts the documents into overlapping chunks
# NOTE(review): 768 / 32 are presumably measured in characters (the splitter's default
# length function), not in model tokens — confirm against the embedding model's limit
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 768, chunk_overlap = 32)
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x19a74c91ae0>

In [15]:
# Split the loaded documents into chunks; the preview shows each chunk keeping its
# 'source' (and, for PDFs, 'page') metadata
split_docs = text_splitter.split_documents(docs)
split_docs[:5]

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

# 向量化保存数据库

In [16]:
# Folder the FAISS index is saved to and later loaded from
persist_directory = './vector_db/faiss'
# Local path of the embedding model handed to HuggingFaceEmbeddings
embedding_model_path = "./models/bce-embedding-base_v1"

In [17]:
# Load the open-source embedding model from the local path
# NOTE(review): the device is hard-coded to 'cuda'; presumably this fails on a machine
# without a GPU — confirm before running elsewhere
embeddings = HuggingFaceEmbeddings(
    model_name = embedding_model_path,
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {
        'normalize_embeddings': True    # normalize embeddings so similarity stays within 0~1 as far as possible
    }
)
embeddings

  from tqdm.autonotebook import tqdm, trange


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [18]:
# Switch the wrapped SentenceTransformer to half precision (fp16) and store it back on the wrapper
# NOTE(review): this mutates `embeddings` in place; cells below all use the fp16 model
embeddings.client = embeddings.client.half()
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [19]:
# Print the signature and docstring of FAISS.from_documents
help(FAISS.from_documents)

Help on method from_documents in module langchain_core.vectorstores:

from_documents(documents: 'List[Document]', embedding: 'Embeddings', **kwargs: 'Any') -> 'VST' method of abc.ABCMeta instance
    Return VectorStore initialized from documents and embeddings.



In [20]:
# Build the vector database by embedding every chunk
# No distance_strategy is passed here, so the index is created with the library default
vectordb = FAISS.from_documents(
    documents = split_docs,
    embedding = embeddings,
)
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x19ac5e63670>

In [21]:
# Persist the index to persist_directory so it can be reloaded without re-embedding
vectordb.save_local(folder_path = persist_directory)

# 加载数据库

In [22]:
# Load the database back from disk
# NOTE(review): the index was built without a distance_strategy, but MAX_INNER_PRODUCT is
# requested here. In the recorded outputs the relevance score equals 1 minus the raw score
# from similarity_search_with_score (0.681 vs 0.319), and the retriever's score_threshold
# is tuned to that scale — confirm before changing the strategy on either side.
vectordb = FAISS.load_local(
    folder_path = persist_directory,
    embeddings = embeddings,
    allow_dangerous_deserialization = True, # allow reading the pickle file; pickle can run arbitrary code, so only load index folders you trust
    distance_strategy = DistanceStrategy.MAX_INNER_PRODUCT, # refer: https://github.com/InternLM/HuixiangDou/blob/main/huixiangdou/service/retriever.py
    normalize_L2 = False,
)
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x19ac9770b80>

In [23]:
# Query reused by every search example below; it is the title of one of the indexed PDFs
query = "Eye Pressure Lowering Effect of Vitamin C"

# search

## search

In [24]:
# Print the signature and docstring of vectordb.search
help(vectordb.search)

Help on method search in module langchain_core.vectorstores:

search(query: 'str', search_type: 'str', **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query using specified search type.



In [25]:
# Generic entry point that dispatches on search_type; k is the number of documents returned
# search_type: 'similarity' or 'mmr'.
similarity_documents = vectordb.search(query = query, search_type = 'similarity', k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

## similarity_search

In [26]:
# Print the signature and docstring of vectordb.similarity_search
help(vectordb.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.faiss:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the query.



In [27]:
similarity_documents = vectordb.similarity_search(query = query, k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

In [28]:
# Unique source files of the hits, kept in ranking order.
# dict.fromkeys de-duplicates while preserving first-seen order; set() would return the
# sources in an arbitrary order that can change between runs.
similarity_documents_reference = list(dict.fromkeys(doc.metadata['source'] for doc in similarity_documents))
similarity_documents_reference

['./data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf']

## similarity_search_with_score

In [29]:
# Print the signature and docstring of vectordb.similarity_search_with_score
help(vectordb.similarity_search_with_score)

Help on method similarity_search_with_score in module langchain_community.vectorstores.faiss:

similarity_search_with_score(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of documents most similar to the query text with
        L2 distance in float. Lower score represents more similarity.



In [30]:
similarity_documents = vectordb.similarity_search_with_score(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.31897402),
 (Document(page_content='use eye drops to lower the pressure below 20mm of merc

In [31]:
# Transpose the list of (Document, score) pairs into a tuple of documents and a tuple of scores
# NOTE(review): the unpacking raises ValueError when the search returned no hits — confirm
# that cannot happen for this query
documents, scores = zip(*similarity_documents)
scores

(0.31897402, 0.5914958, 0.5934002, 0.5990593)

## similarity_search_with_relevance_scores

In [32]:
# Print the signature and docstring of vectordb.similarity_search_with_relevance_scores
help(vectordb.similarity_search_with_relevance_scores)

Help on method similarity_search_with_relevance_scores in module langchain_core.vectorstores:

similarity_search_with_relevance_scores(query: 'str', k: 'int' = 4, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs and relevance scores in the range [0, 1].
    
    0 is dissimilar, 1 is most similar.
    
    Args:
        query: input text
        k: Number of Documents to return. Defaults to 4.
        **kwargs: kwargs to be passed to similarity search. Should include:
            score_threshold: Optional, a floating point value between 0 to 1 to
                filter the resulting set of retrieved docs
    
    Returns:
        List of Tuples of (doc, similarity_score)



In [33]:
# Same search with scores expressed as relevance: per the help text 0 is dissimilar and
# 1 is most similar, i.e. higher is better (the opposite direction of the raw score)
similarity_documents = vectordb.similarity_search_with_relevance_scores(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.6810259819030762),
 (Document(page_content='use eye drops to lower the pressure below 20mm

In [34]:
# Transpose the list of (Document, relevance) pairs into a tuple of documents and a tuple of scores
# NOTE(review): the unpacking raises ValueError when the search returned no hits — confirm
# that cannot happen for this query
documents, scores = zip(*similarity_documents)
scores

(0.6810259819030762,
 0.4085041880607605,
 0.40659981966018677,
 0.4009407162666321)

## similarity_search_by_vector

In [35]:
# Print the signature and docstring of vectordb.similarity_search_by_vector
help(vectordb.similarity_search_by_vector)

Help on method similarity_search_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Dict[str, Any]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to embedding vector.
    
    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the embedding.



## similarity_search_with_score_by_vector

In [36]:
# Print the signature and docstring of vectordb.similarity_search_with_score_by_vector
help(vectordb.similarity_search_with_score_by_vector)

Help on method similarity_search_with_score_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_with_score_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        embedding: Embedding vector to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Union[Callable, Dict[str, Any]]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
        **kwargs: kwargs to be passed to similarity search. Can include:
            score_threshold: Optional, a floa

# retriever

In [37]:
# Print the signature and docstring of vectordb.as_retriever
help(vectordb.as_retriever)

Help on method as_retriever in module langchain_core.vectorstores:

as_retriever(**kwargs: 'Any') -> 'VectorStoreRetriever' method of langchain_community.vectorstores.faiss.FAISS instance
    Return VectorStoreRetriever initialized from this VectorStore.
    
    Args:
        search_type (Optional[str]): Defines the type of search that
            the Retriever should perform.
            Can be "similarity" (default), "mmr", or
            "similarity_score_threshold".
        search_kwargs (Optional[Dict]): Keyword arguments to pass to the
            search function. Can include things like:
                k: Amount of documents to return (Default: 4)
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum. (Default:

In [38]:
# Wrap the vector store in a retriever that returns up to 4 documents whose relevance
# score reaches the 0.15 threshold
# search_type: 'similarity', 'similarity_score_threshold', 'mmr'
retriever = vectordb.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 4, "score_threshold": 0.15, "fetch_k": 20}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019AC9770B80>, search_type='similarity_score_threshold', search_kwargs={'k': 4, 'score_threshold': 0.15, 'fetch_k': 20})

In [42]:
# The retriever keeps a reference to the store it was created from (same object address as the loaded vectordb)
retriever.vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x19ac9770b80>

## invoke

In [43]:
# Print the signature and docstring of retriever.invoke
help(retriever.invoke)

Help on method invoke in module langchain_core.retrievers:

invoke(input: 'str', config: 'Optional[RunnableConfig]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Invoke the retriever to get relevant documents.
    
    Main entry point for synchronous retriever invocations.
    
    Args:
        input: The query string
        config: Configuration for the retriever
        **kwargs: Additional arguments to pass to the retriever
    
    Returns:
        List of relevant documents
    
    Examples:
    
    .. code-block:: python
    
        retriever.invoke("query")



In [44]:
# Run the retriever on the example query; returns a list of Documents without scores
similarity_documents = retriever.invoke(query)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

In [45]:
# Unrelated query (Chinese small talk, roughly "have you eaten today?"): the result is an
# empty list, presumably because no chunk reaches the retriever's score threshold
similarity_documents = retriever.invoke("今天吃了吗")
similarity_documents



[]