In [1]:
# 首先导入所需第三方库
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.retrievers import ContextualCompressionRetriever
from BCEmbedding.tools.langchain import BCERerank
from tqdm import tqdm
import os

  _torch_pytree._register_pytree_node(


# 遍历目录获取数据

In [2]:
# 获取文件路径函数
def get_files(dir_path: str) -> list:
    """Recursively collect the paths of supported documents under ``dir_path``.

    Args:
        dir_path: Root folder to scan.

    Returns:
        A list with one joined path per file whose name ends with
        ``.txt``, ``.md``, ``.docx``, ``.doc`` or ``.pdf``. The suffix
        check is case-sensitive.
    """
    wanted_suffixes = (".txt", ".md", ".docx", ".doc", ".pdf")
    # os.walk descends into every sub-directory of dir_path.
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith(wanted_suffixes)
    ]

In [3]:
def get_text(dir_path: str) -> list:
    """Load every supported document found under ``dir_path``.

    Args:
        dir_path: Root folder; it is scanned with ``get_files``.

    Returns:
        A flat list holding everything returned by each file's
        ``loader.load()``, in the order the files were found.
    """
    # First collect the target file paths with get_files.
    file_lst = get_files(dir_path)
    # docs accumulates the loaded document objects.
    docs = []
    # Visit every target file, showing a progress bar.
    for one_file in tqdm(file_lst):
        if one_file.endswith(".txt"):
            # txt, md, docx, doc: pip install unstructured
            loader = UnstructuredFileLoader(one_file)
        elif one_file.endswith(".md"):
            loader = UnstructuredMarkdownLoader(one_file)
        elif one_file.endswith((".docx", ".doc")):
            # pip install python-docx
            loader = UnstructuredWordDocumentLoader(one_file)
        elif one_file.endswith(".pdf"):
            # pip install pypdf
            loader = PyPDFLoader(one_file)
        else:
            # No loader for this suffix. Skip the file instead of hitting an
            # unbound `loader` (first file) or silently re-loading the
            # previous file's loader (later files).
            continue
        docs.extend(loader.load())
    return docs

In [4]:
# Root folder that holds the document collections
tar_dirs = "./data"
# Keep only the immediate sub-directories of the root, as joined paths.
dirs = [
    candidate
    for candidate in (os.path.join(tar_dirs, entry) for entry in os.listdir(tar_dirs))
    if os.path.isdir(candidate)
]
dirs

['./data\\FM docs 2024.3']

In [5]:
# Load the target files: gather the documents of every folder in `dirs`
# into one flat list.
docs = []
for dir_path in dirs:
    docs.extend(get_text(dir_path))
# Preview only the first five entries to keep the output short.
docs[:5]

100%|██████████| 134/134 [00:37<00:00,  3.56it/s]


[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to\nuse eye drops to lower the pressure below 20mm of mercury as they refused to take vita-\nmin C.\nConclusion\nIn this series of 30 patients there was no\noccasion in which the pressure was not low-ered w

# 文本分块

In [6]:
# Split the text into chunks: build the splitter with chunk_size 500 and
# chunk_overlap 150.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 150)
# Bare expression so the notebook shows the splitter's repr.
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x15a1ce4baf0>

In [7]:
# Split every loaded document into chunks and preview the first five.
split_docs = text_splitter.split_documents(docs)
split_docs[:5]

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='a daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forc

# 向量化保存数据库

In [8]:
# Define the persistence path for the FAISS index, and the local path of the
# embedding model.
persist_directory = './vector_db/faiss_reranker'
embedding_model_path = "./models/bce-embedding-base_v1"

In [9]:
# Load the open-source embedding model from the local path, on the CUDA device.
embeddings = HuggingFaceEmbeddings(
    model_name = embedding_model_path,
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {
        'normalize_embeddings': True    # try to keep similarity within 0~1 as far as possible
    }
)
embeddings

04/24/2024 13:34:36 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: ./models/bce-embedding-base_v1
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [10]:
# Replace the wrapped SentenceTransformer client with the result of .half()
# (presumably casts the model weights to fp16 — confirm).
embeddings.client = embeddings.client.half()
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [11]:
# Print the signature and docstring of FAISS.from_documents for reference.
help(FAISS.from_documents)

Help on method from_documents in module langchain_core.vectorstores:

from_documents(documents: 'List[Document]', embedding: 'Embeddings', **kwargs: 'Any') -> 'VST' method of abc.ABCMeta instance
    Return VectorStore initialized from documents and embeddings.



In [12]:
# Build the vector database from the split chunks, embedding them with
# `embeddings`. No distance_strategy is passed here.
vectordb = FAISS.from_documents(
    documents = split_docs,
    embedding = embeddings,
)
vectordb

04/24/2024 13:36:05 - [INFO] -faiss.loader->>>    Loading faiss with AVX2 support.
04/24/2024 13:36:05 - [INFO] -faiss.loader->>>    Successfully loaded faiss with AVX2 support.


<langchain_community.vectorstores.faiss.FAISS at 0x15ad4f05d50>

In [13]:
# Persist the vector store to persist_directory so it can be reloaded later.
vectordb.save_local(folder_path = persist_directory)

# 加载数据库

In [14]:
# Load the database back from persist_directory.
# NOTE(review): the index saved above was built by FAISS.from_documents
# without a distance_strategy, and MAX_INNER_PRODUCT is requested only here.
# The recorded similarity_search_with_score output further down gives the best
# hit the lowest score, which looks like L2 distances — TODO confirm that the
# stored index metric really matches this strategy.
vectordb = FAISS.load_local(
    folder_path = persist_directory,
    embeddings = embeddings,
    allow_dangerous_deserialization = True, # allow reading pickle; presumably only safe for index folders you created yourself
    # faiss only supports EUCLIDEAN_DISTANCE MAX_INNER_PRODUCT COSINE
    distance_strategy = DistanceStrategy.MAX_INNER_PRODUCT, # refer: https://github.com/InternLM/HuixiangDou/blob/main/huixiangdou/service/retriever.py
    normalize_L2 = False,
)
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x15a3aa37790>

In [15]:
# Sample query reused by the search and retriever cells below.
query = "Eye Pressure Lowering Effect of Vitamin C"

# search

## search

In [16]:
# Print the signature and docstring of vectordb.search for reference.
help(vectordb.search)

Help on method search in module langchain_core.vectorstores:

search(query: 'str', search_type: 'str', **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query using specified search type.



In [17]:
# Generic search entry point; the search type is selected by name.
# search_type: 'similarity' or 'mmr'.
similarity_documents = vectordb.search(query = query, search_type = 'similarity', k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\nlowering the pressure in 

## similarity_search

In [18]:
# Print the signature and docstring of vectordb.similarity_search.
help(vectordb.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.faiss:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the query.



In [19]:
# Return the k=4 chunks most similar to the query (documents only, no scores).
similarity_documents = vectordb.similarity_search(query = query, k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\nlowering the pressure in 

In [20]:
# De-duplicated list of the source files the retrieved chunks came from.
similarity_documents_reference = list(
    {hit.metadata['source'] for hit in similarity_documents}
)
similarity_documents_reference

['./data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf']

## similarity_search_with_score

In [21]:
# Print the signature and docstring of vectordb.similarity_search_with_score.
help(vectordb.similarity_search_with_score)

Help on method similarity_search_with_score in module langchain_community.vectorstores.faiss:

similarity_search_with_score(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of documents most similar to the query text with
        L2 distance in float. Lower score represents more similarity.



In [22]:
# Same search, but each hit comes back as a (Document, score) tuple. Per the
# help text above, a lower score means more similar.
similarity_documents = vectordb.similarity_search_with_score(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.4786454),
 (Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\nlowering 

In [23]:
# Unzip the (Document, score) pairs into two parallel tuples and show the
# scores. Note: this unpacking raises if the search returned no hits.
documents, scores = zip(*similarity_documents)
scores

(0.4786454, 0.5534744, 0.5545596, 0.61880374)

## similarity_search_with_relevance_scores

In [24]:
# Print the signature and docstring of
# vectordb.similarity_search_with_relevance_scores.
help(vectordb.similarity_search_with_relevance_scores)

Help on method similarity_search_with_relevance_scores in module langchain_core.vectorstores:

similarity_search_with_relevance_scores(query: 'str', k: 'int' = 4, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs and relevance scores in the range [0, 1].
    
    0 is dissimilar, 1 is most similar.
    
    Args:
        query: input text
        k: Number of Documents to return. Defaults to 4.
        **kwargs: kwargs to be passed to similarity search. Should include:
            score_threshold: Optional, a floating point value between 0 to 1 to
                filter the resulting set of retrieved docs
    
    Returns:
        List of Tuples of (doc, similarity_score)



In [25]:
# Variant returning (Document, relevance) tuples. The help text above
# documents relevance in [0, 1], with 1 being the most similar.
similarity_documents = vectordb.similarity_search_with_relevance_scores(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.5213545858860016),
 (Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\n

In [26]:
# Unzip the (Document, relevance) pairs into two parallel tuples and show the
# relevance scores. Note: this unpacking raises if there were no hits.
documents, scores = zip(*similarity_documents)
scores

(0.5213545858860016,
 0.44652557373046875,
 0.445440411567688,
 0.3811962604522705)

## similarity_search_by_vector

In [27]:
# Print the signature and docstring of vectordb.similarity_search_by_vector
# (takes an embedding vector instead of a query string).
help(vectordb.similarity_search_by_vector)

Help on method similarity_search_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Dict[str, Any]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to embedding vector.
    
    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the embedding.



## similarity_search_with_score_by_vector

In [28]:
# Print the signature and docstring of
# vectordb.similarity_search_with_score_by_vector.
help(vectordb.similarity_search_with_score_by_vector)

Help on method similarity_search_with_score_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_with_score_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        embedding: Embedding vector to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Union[Callable, Dict[str, Any]]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
        **kwargs: kwargs to be passed to similarity search. Can include:
            score_threshold: Optional, a floa

# retriever

In [29]:
# Print the signature and docstring of vectordb.as_retriever.
help(vectordb.as_retriever)

Help on method as_retriever in module langchain_core.vectorstores:

as_retriever(**kwargs: 'Any') -> 'VectorStoreRetriever' method of langchain_community.vectorstores.faiss.FAISS instance
    Return VectorStoreRetriever initialized from this VectorStore.
    
    Args:
        search_type (Optional[str]): Defines the type of search that
            the Retriever should perform.
            Can be "similarity" (default), "mmr", or
            "similarity_score_threshold".
        search_kwargs (Optional[Dict]): Keyword arguments to pass to the
            search function. Can include things like:
                k: Amount of documents to return (Default: 4)
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum. (Default:

In [30]:
# Wrap the vector store as a retriever. With "similarity_score_threshold",
# score_threshold is the minimum relevance a hit must reach (per the
# as_retriever help text above).
# search_type: 'similarity', 'similarity_score_threshold', 'mmr'
retriever = vectordb.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 4, "score_threshold": 0.15, "fetch_k": 20}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000015A3AA37790>, search_type='similarity_score_threshold', search_kwargs={'k': 4, 'score_threshold': 0.15, 'fetch_k': 20})

In [31]:
# Show the vector store the retriever is attached to.
retriever.vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x15a3aa37790>

## get_relevant_documents

In [32]:
# Print the signature and docstring of retriever.get_relevant_documents.
help(retriever.get_relevant_documents)

Help on method get_relevant_documents in module langchain_core.retrievers:

get_relevant_documents(query: 'str', *, callbacks: 'Callbacks' = None, tags: 'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Retrieve documents relevant to a query.
    
    Users should favor using `.invoke` or `.batch` rather than
    `get_relevant_documents directly`.
    
    Args:
        query: string to find relevant documents for
        callbacks: Callback manager or list of callbacks
        tags: Optional list of tags associated with the retriever. Defaults to None
            These tags will be associated with each call to this retriever,
            and passed as arguments to the handlers defined in `callbacks`.
        metadata: Optional metadata associated with the retriever. Defaults to None
            This metadata will be ass

In [33]:
# Run the retriever on the sample query. The help text above recommends
# `.invoke` / `.batch` over calling get_relevant_documents directly.
similarity_documents = retriever.get_relevant_documents(query)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\nlowering the pressure in 

In [34]:
# A query unrelated to the corpus; in the recorded run no hit reached the
# threshold and the result was an empty list.
similarity_documents = retriever.get_relevant_documents("今天吃了吗")
similarity_documents



[]

## invoke

In [35]:
# Print the signature and docstring of retriever.invoke.
help(retriever.invoke)

Help on method invoke in module langchain_core.retrievers:

invoke(input: 'str', config: 'Optional[RunnableConfig]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Invoke the retriever to get relevant documents.
    
    Main entry point for synchronous retriever invocations.
    
    Args:
        input: The query string
        config: Configuration for the retriever
        **kwargs: Additional arguments to pass to the retriever
    
    Returns:
        List of relevant documents
    
    Examples:
    
    .. code-block:: python
    
        retriever.invoke("query")



In [36]:
# Same retrieval through invoke(), the main synchronous entry point.
similarity_documents = retriever.invoke(query)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s. Its use in\nlowering the pressure in 

# 重排序
参考： https://github.com/InternLM/HuixiangDou/blob/main/huixiangdou/service/retriever.py

In [37]:
# Rebuild the first-stage retriever with a wider k=30; it is used below as the
# base retriever whose candidates go to the reranker.
# search_type: 'similarity', 'similarity_score_threshold', 'mmr'
retriever = vectordb.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 30, "score_threshold": 0.15}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000015A3AA37790>, search_type='similarity_score_threshold', search_kwargs={'k': 30, 'score_threshold': 0.15})

In [38]:
# Local path of the reranker model.
reranker_model_path = "./models/bce-reranker-base_v1"

In [39]:
# Build the reranker from the local model: keeps top_n=7 results, runs on the
# CUDA device with fp16 enabled.
reranker = BCERerank(
    top_n = 7,
    model = reranker_model_path,
    device = 'cuda',
    use_fp16 = True
)
reranker

04/24/2024 13:36:08 - [INFO] -BCEmbedding.models.RerankerModel->>>    Loading from `./models/bce-reranker-base_v1`.
04/24/2024 13:36:08 - [INFO] -BCEmbedding.models.RerankerModel->>>    Execute device: cuda;	 gpu num: 1;	 use fp16: True


BCERerank(client='BCEmbedding.models.RerankerModel', top_n=7, model='./models/bce-reranker-base_v1')

In [40]:
# Chain the two stages: `retriever` is the base retriever and the reranker is
# the base compressor applied to what it returns.
compression_retriever = ContextualCompressionRetriever(
    base_compressor = reranker,
    base_retriever = retriever
)
compression_retriever

ContextualCompressionRetriever(base_compressor=BCERerank(client='BCEmbedding.models.RerankerModel', top_n=7, model='./models/bce-reranker-base_v1'), base_retriever=VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000015A3AA37790>, search_type='similarity_score_threshold', search_kwargs={'k': 30, 'score_threshold': 0.15}))

## get_relevant_documents

In [41]:
# Print the signature and docstring of get_relevant_documents.
# NOTE(review): this is called on the base `retriever`; this section otherwise
# exercises `compression_retriever` — confirm which one was intended.
help(retriever.get_relevant_documents)

Help on method get_relevant_documents in module langchain_core.retrievers:

get_relevant_documents(query: 'str', *, callbacks: 'Callbacks' = None, tags: 'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Retrieve documents relevant to a query.
    
    Users should favor using `.invoke` or `.batch` rather than
    `get_relevant_documents directly`.
    
    Args:
        query: string to find relevant documents for
        callbacks: Callback manager or list of callbacks
        tags: Optional list of tags associated with the retriever. Defaults to None
            These tags will be associated with each call to this retriever,
            and passed as arguments to the handlers defined in `callbacks`.
        metadata: Optional metadata associated with the retriever. Defaults to None
            This metadata will be ass

In [None]:
# First-stage retrieval only (base retriever, no reranker call).
# NOTE(review): this cell has no execution count and its recorded output
# already carries 'relevance_score' in the metadata, so it looks like it was
# run out of order — re-run the notebook top to bottom to confirm.
retriever.get_relevant_documents(query)

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0, 'relevance_score': 0.6476868391036987}),
 Document(page_content='awaken to the marvels of vitamin C in treat-\ning glaucoma!\nIntroduction\nVitamin C has been used since the trans-\nformation of glucose, C\n6H12O6, into vitamin\n1. Eastview Professional Building, 1370 - 116th Avenue\nN.E., Suite 212, Bellevue WA 98004-4679.\n165C, C6H8O6, in the early 1930s

In [42]:
# Retrieve with the base retriever, then rerank; the recorded results carry a
# 'relevance_score' entry in their metadata.
similarity_documents = compression_retriever.get_relevant_documents(query)
similarity_documents

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0, 'relevance_score': 0.6476868391036987}),
 Document(page_content='Eye Pressure Lowering Effect of Vitamin C\n167with 12 grams of vitamin C each day. She\nwas a smoker in her early fifties, and she hascontinued to smoke with good reason to\ncease.\nGlaucoma accompanies the cataract pa-\ntients so a daily intake of vitamin C could\nsave the country literally i

In [45]:
# The same unrelated query through the reranking pipeline; the recorded result
# is again an empty list.
similarity_documents = compression_retriever.get_relevant_documents("今天吃了吗")
similarity_documents



[]

## invoke

In [46]:
# Print the signature and docstring of compression_retriever.invoke.
help(compression_retriever.invoke)

Help on method invoke in module langchain_core.retrievers:

invoke(input: 'str', config: 'Optional[RunnableConfig]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain.retrievers.contextual_compression.ContextualCompressionRetriever instance
    Invoke the retriever to get relevant documents.
    
    Main entry point for synchronous retriever invocations.
    
    Args:
        input: The query string
        config: Configuration for the retriever
        **kwargs: Additional arguments to pass to the retriever
    
    Returns:
        List of relevant documents
    
    Examples:
    
    .. code-block:: python
    
        retriever.invoke("query")



In [47]:
# Retrieve-and-rerank through invoke(), the main synchronous entry point.
similarity_documents = compression_retriever.invoke(query)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0, 'relevance_score': 0.6476868391036987}),
 Document(page_content='Eye Pressure Lowering Effect of Vitamin C\n167with 12 grams of vitamin C each day. She\nwas a smoker in her early fifties, and she hascontinued to smoke with good reason to\ncease.\nGlaucoma accompanies the cataract pa-\ntients so a daily intake of vitamin C could\nsave the country literally i