In [4]:
# 首先导入所需第三方库
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.retrievers import ContextualCompressionRetriever
from BCEmbedding.tools.langchain import BCERerank
from tqdm import tqdm
import os
from utils import hashfile

# 遍历目录获取数据

In [5]:
# 获取文件路径函数
def get_files(dir_path: str) -> list[str]:
    """Recursively collect the paths of all supported documents below ``dir_path``.

    A file is kept when its name ends (case-sensitively) with one of
    ``.txt``, ``.md``, ``.docx``, ``.doc`` or ``.pdf``. ``readme.md`` files are
    not filtered out.

    Args:
        dir_path: Path of the directory to scan.

    Returns:
        The matching file paths (directory joined with file name), in the
        order in which ``os.walk`` yields them.
    """
    wanted_suffixes = (".txt", ".md", ".docx", ".doc", ".pdf")
    # os.walk descends into every sub-directory of dir_path
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith(wanted_suffixes)
    ]

In [6]:
def get_text(file_list: list[str]) -> list:
    """Load every supported file in ``file_list`` and return the loaded documents.

    The loader is chosen from the file extension (matched case-insensitively):

    * ``.txt``             -> ``UnstructuredFileLoader`` (pip install unstructured)
    * ``.md``              -> ``UnstructuredMarkdownLoader``
    * ``.docx`` / ``.doc`` -> ``UnstructuredWordDocumentLoader`` (pip install python-docx)
    * ``.pdf``             -> ``PyPDFLoader`` (pip install pypdf)

    A file with any other extension is reported and skipped. Without that guard
    the loader variable would be unbound for the first such file, or would still
    point at the previous file, which would then be loaded a second time.

    Files are de-duplicated by content: each file is hashed with ``hashfile``
    and a file whose hash has already been seen is reported and skipped.

    Args:
        file_list: Paths of the files to load.

    Returns:
        The concatenated ``loader.load()`` results, in ``file_list`` order.
    """
    # docs holds the objects returned by the loaders
    docs: list = []
    # a set gives constant-time "already seen" checks
    seen_hashes: set[str] = set()
    repeated_files: list[str] = []
    # iterate over all target files
    for file in tqdm(file_list):
        lowered = file.lower()
        if lowered.endswith(".txt"):
            loader_cls = UnstructuredFileLoader
        elif lowered.endswith(".md"):
            loader_cls = UnstructuredMarkdownLoader
        elif lowered.endswith((".docx", ".doc")):
            loader_cls = UnstructuredWordDocumentLoader
        elif lowered.endswith(".pdf"):
            loader_cls = PyPDFLoader
        else:
            # never fall through with an unbound or stale loader
            print(f"file: `{file}` has an unsupported extension, ignore this file")
            continue

        # compute the file hash to detect duplicated content
        hashcode: str = hashfile(file)
        if hashcode in seen_hashes:
            print(f"file: `{file}` repeated, ignore this file")
            repeated_files.append(file)
            continue
        seen_hashes.add(hashcode)

        docs.extend(loader_cls(file).load())

    if repeated_files:
        print(f"repeated_files: {', '.join(repeated_files)}, please delete them.")
    return docs

In [7]:
# Target directory that is scanned recursively for source documents
tar_dirs = "./data"

In [8]:
# Call get_files (defined above) to collect the paths of the target files,
# then display how many files were found
file_list = get_files(tar_dirs)
len(file_list)

384

In [9]:
# Load the target files (get_text skips files whose content hash repeats)
# and preview the first five loaded documents
docs = get_text(file_list)
docs[:5]

 21%|██        | 81/384 [00:13<00:25, 12.08it/s]

file: `./data\FM docs 2024.3\JOM_1995_10_2_05_High_Dose_intravenous_Vitamin_C_and_Long_Time_Survival-.pdf` repeated, ignore this file
file: `./data\FM docs 2024.3\JOM_1996_11_2_04_Intravenous_Vitamin_C_in_A_Terminal_Cancer_Patient.pdf` repeated, ignore this file


 23%|██▎       | 87/384 [00:14<00:38,  7.69it/s]

file: `./data\FM docs 2024.3\JOM_1999_14_1_03_Treatment_of_Ambulant_Schizophrenics_with_Vitamin_B3-.pdf` repeated, ignore this file


 39%|███▉      | 151/384 [00:39<00:32,  7.27it/s]

file: `./data\HealthCareData_5.8\JOM_1996_11_1_08_Vitamin_C_and_Hot_Flashes_FACT_Use_in_Chronic-.pdf` repeated, ignore this file


 46%|████▌     | 177/384 [00:54<01:38,  2.10it/s]

file: `./data\HealthCareData_5.8\nutrients-14-03474-with-cover_副本.pdf` repeated, ignore this file


 49%|████▉     | 189/384 [01:01<01:25,  2.29it/s]

file: `./data\healthcare_data6.2\JOM_1981_10_4_03_Treatment_of_a_Mucopolysaccharide_Type_of_Storage-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1984_13_3_05_Nutrient_Pioneers_Alva_Rae_Patton_Conrad_Elvehjem.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1989_04_2_07_Ascorbic_acid_and_mental_depression.pdf` repeated, ignore this file


 50%|█████     | 193/384 [01:03<01:11,  2.68it/s]

file: `./data\healthcare_data6.2\JOM_1995_10_2_05_High_Dose_intravenous_Vitamin_C_and_Long_Time_Survival-.pdf` repeated, ignore this file


 54%|█████▍    | 209/384 [01:04<00:13, 12.83it/s]

file: `./data\healthcare_data6.2\JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1995_10_3-4_09_Orthomolecular_The_Optimum_Treatment_for_Schizophrenia.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1995_10_3-4_10_Minerals_and_Disease.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1996_11_1_02_Coronary_Artery_Occlusion_Chelation_and_Cholesterol-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1996_11_1_03_Hair_Trace_Element_Status_of_Appalachian_Head_Start-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1996_11_1_05_The_Serotonin_Connection.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1996_11_1_07_Cranial_Electrical_Stimulation.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1996_11_1_08_Vitamin_C_and_Hot_Flashes_FACT_Use_in_Chronic-.pdf` repeated, ignore this file


 57%|█████▋    | 217/384 [01:05<00:11, 15.05it/s]

file: `./data\healthcare_data6.2\JOM_1996_11_2_04_Intravenous_Vitamin_C_in_A_Terminal_Cancer_Patient.pdf` repeated, ignore this file


 68%|██████▊   | 262/384 [01:14<00:22,  5.52it/s]

file: `./data\healthcare_data6.2\JOM_1998_13_2_02_High-dose_intravenous_Vitamin_C_in_the_Treatment_of_A-.pdf` repeated, ignore this file


 72%|███████▏  | 275/384 [01:17<00:13,  7.95it/s]

file: `./data\healthcare_data6.2\JOM_1998_13_4_05_The_Health_of_the_NaturopathVitamin_Supplementation-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1998_13_4_06_The_Application_of_the_Hardin_Jones-Pauling-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_1999_14_1_03_Treatment_of_Ambulant_Schizophrenics_with_Vitamin_B3-.pdf` repeated, ignore this file


 75%|███████▍  | 287/384 [01:21<00:31,  3.04it/s]

file: `./data\healthcare_data6.2\JOM_2000_15_4_02_Vitamin_C_as_Cancer_Therapy_An_Overview.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2000_15_4_03_Vitamin_C_Case_History_of_an_Alternative_Cancer_Therapy.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2000_15_4_04_Clinical_Evaluation_of_Vitamin_C_and_other-.pdf` repeated, ignore this file


 79%|███████▊  | 302/384 [01:24<00:16,  4.98it/s]

file: `./data\healthcare_data6.2\JOM_2001_16_3_08_Vitamin_C_in_Cardiovascular_Disease.pdf` repeated, ignore this file


 80%|████████  | 309/384 [01:26<00:15,  4.89it/s]

file: `./data\healthcare_data6.2\JOM_2002_17_1_03_The_Role_of_Vitamins_B3_and_C_in_the_Treatment-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2002_17_1_04_Fatigue_and_Vitamin_C.pdf` repeated, ignore this file


 82%|████████▏ | 313/384 [01:31<01:05,  1.09it/s]

file: `./data\healthcare_data6.2\JOM_2002_17_2_06_Case_from_the_Center_Sixteen-Year_History_with_High-.pdf` repeated, ignore this file


 84%|████████▍ | 323/384 [01:36<00:22,  2.71it/s]

file: `./data\healthcare_data6.2\JOM_2002_17_4_07_Vitamin_C_and_Oxidative_DNA_Damage_Revisited.pdf` repeated, ignore this file


 86%|████████▌ | 330/384 [01:37<00:10,  4.92it/s]

file: `./data\healthcare_data6.2\JOM_2003_18_2_05_Effect_of_Vitamin_C_Supplementation_on_Ex_Vivo_Immune-.pdf` repeated, ignore this file


 88%|████████▊ | 339/384 [01:38<00:04,  9.28it/s]

file: `./data\healthcare_data6.2\JOM_2003_18_3-4_03_Vitamin_A_and_Beta-Carotene.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_04_Negative_and_Positive_Side_Effects_of_Vitamin_B3.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_05_Vitamin_B6_Extract_of_Submission_to_the_UK’s_Food-.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_08_The_Trials_and_Tribulations_of_Vitamin_C.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_09_The_Gift_of_Vitamin_C.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_10_Vitamin_D_Deficiency_Diversity_and_Dosage.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_11_Vitamin_E_A_Cure_in_Search_of_Recognition.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\JOM_2003_18_3-4_12_Can_Vitamin_Supplements_Take_the_Place_of_a_Bad_Diet.pdf` repeated, ignore this file


 90%|█████████ | 347/384 [01:39<00:03, 10.29it/s]

file: `./data\healthcare_data6.2\JOM_2004_19_1_04_Vitamin_D_Supplementation_in_the_Fight_Against_Multiple-.pdf` repeated, ignore this file


 92%|█████████▏| 352/384 [01:40<00:06,  4.92it/s]

file: `./data\healthcare_data6.2\JOM_2004_19_4_02_The_Use_of_Vitamin_C_with_Chemotherapy_in_Cancer-.pdf` repeated, ignore this file


 93%|█████████▎| 356/384 [01:41<00:04,  6.37it/s]

file: `./data\healthcare_data6.2\JOM_2005_20_1_03_Folic_Acid_Vitamin_D_and_Prehistoric_Polymorphisms-.pdf` repeated, ignore this file


 93%|█████████▎| 359/384 [01:42<00:05,  4.29it/s]

file: `./data\healthcare_data6.2\JOM_2005_20_2_06_Vitamin_C_as_an_Ergogenic_Aid.pdf` repeated, ignore this file


 96%|█████████▌| 367/384 [01:45<00:08,  1.96it/s]

file: `./data\healthcare_data6.2\JOM_2005_20_4_07_Screening_for_Vitamin_C_in_the_Urine_Is_it_Clinically-.pdf` repeated, ignore this file


 99%|█████████▉| 381/384 [01:49<00:00,  4.90it/s]

file: `./data\healthcare_data6.2\JOM_2006_21_4_03_Special_Report_False_Positive_Finger_Stick_Blood-.pdf` repeated, ignore this file


100%|██████████| 384/384 [01:49<00:00,  3.51it/s]

file: `./data\healthcare_data6.2\nutrients-11-02205-with-cover.pdf` repeated, ignore this file
file: `./data\healthcare_data6.2\nutrients-12-01181-with-cover.pdf` repeated, ignore this file
repeated_files: ./data\FM docs 2024.3\JOM_1995_10_2_05_High_Dose_intravenous_Vitamin_C_and_Long_Time_Survival-.pdf, ./data\FM docs 2024.3\JOM_1996_11_2_04_Intravenous_Vitamin_C_in_A_Terminal_Cancer_Patient.pdf, ./data\FM docs 2024.3\JOM_1999_14_1_03_Treatment_of_Ambulant_Schizophrenics_with_Vitamin_B3-.pdf, ./data\HealthCareData_5.8\JOM_1996_11_1_08_Vitamin_C_and_Hot_Flashes_FACT_Use_in_Chronic-.pdf, ./data\HealthCareData_5.8\nutrients-14-03474-with-cover_副本.pdf, ./data\healthcare_data6.2\JOM_1981_10_4_03_Treatment_of_a_Mucopolysaccharide_Type_of_Storage-.pdf, ./data\healthcare_data6.2\JOM_1984_13_3_05_Nutrient_Pioneers_Alva_Rae_Patton_Conrad_Elvehjem.pdf, ./data\healthcare_data6.2\JOM_1989_04_2_07_Ascorbic_acid_and_mental_depression.pdf, ./data\healthcare_data6.2\JOM_1995_10_2_05_High_Dose_intraven




[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to\nuse eye drops to lower the pressure below 20mm of mercury as they refused to take vita-\nmin C.\nConclusion\nIn this series of 30 patients there was no\noccasion in which the pressure was not low-ered w

# 文本分块

In [10]:
# Split the text into chunks (chunk_size=768, with an overlap of 32 between chunks)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=768,
    chunk_overlap=32,
)
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x1f59db3faf0>

In [11]:
# Apply the splitter to the loaded documents and preview the first five chunks
split_docs = text_splitter.split_documents(docs)
split_docs[:5]

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

# 向量化保存数据库

In [12]:
# Directory in which the FAISS index is persisted
persist_directory = './vector_db/faiss_reranker'
# Local path of the embedding model
embedding_model_path = "./models/bce-embedding-base_v1"

In [13]:
# Load the open-source embedding model from the local model directory
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_path,
    model_kwargs={'device': 'cuda'},
    # normalize the embeddings to keep similarity scores within 0~1 as far as possible
    encode_kwargs={'normalize_embeddings': True},
)
embeddings

06/03/2024 20:35:26 - [INFO] -datasets->>>    PyTorch version 2.2.2+cu121 available.
06/03/2024 20:35:27 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: ./models/bce-embedding-base_v1


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [14]:
# Replace the underlying SentenceTransformer client with the result of .half()
# (half precision); note that this mutates the embeddings object in place
embeddings.client = embeddings.client.half()
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='./models/bce-embedding-base_v1', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [15]:
# Show the signature and docstring of FAISS.from_documents
help(FAISS.from_documents)

Help on method from_documents in module langchain_core.vectorstores:

from_documents(documents: 'List[Document]', embedding: 'Embeddings', **kwargs: 'Any') -> 'VST' method of abc.ABCMeta instance
    Return VectorStore initialized from documents and embeddings.



In [16]:
# Build the vector database from the chunks using the embedding model
vectordb = FAISS.from_documents(documents=split_docs, embedding=embeddings)
vectordb

06/03/2024 20:39:02 - [INFO] -faiss.loader->>>    Loading faiss with AVX2 support.
06/03/2024 20:39:02 - [INFO] -faiss.loader->>>    Successfully loaded faiss with AVX2 support.


<langchain_community.vectorstores.faiss.FAISS at 0x1f5f9e090f0>

In [17]:
# Persist the vector database to persist_directory so it can be reloaded later
vectordb.save_local(folder_path = persist_directory)

# 加载数据库

In [18]:
# Load the persisted database.
# NOTE: allow_dangerous_deserialization opts in to reading a pickle file;
#       only load index files that you trust.
# NOTE(review): the index saved above was built by FAISS.from_documents without a
#       distance_strategy, while MAX_INNER_PRODUCT is passed only here at load time -
#       confirm this combination yields the intended scores.
vectordb = FAISS.load_local(
    folder_path=persist_directory,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
    # FAISS only supports EUCLIDEAN_DISTANCE, MAX_INNER_PRODUCT and COSINE
    # refer: https://github.com/InternLM/HuixiangDou/blob/main/huixiangdou/service/retriever.py
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
    normalize_L2=False,
)
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x1f64ab3a8c0>

In [19]:
# Example query used by the search and retriever demos below
query = "Eye Pressure Lowering Effect of Vitamin C"

# search

## search

In [20]:
# Show the signature and docstring of vectordb.search
help(vectordb.search)

Help on method search in module langchain_core.vectorstores:

search(query: 'str', search_type: 'str', **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query using specified search type.



In [21]:
# Generic search entry point; search_type: 'similarity' or 'mmr'.
# Requests k = 4 documents with fetch_k = 20.
similarity_documents = vectordb.search(query = query, search_type = 'similarity', k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

## similarity_search

In [22]:
# Show the signature and docstring of vectordb.similarity_search
help(vectordb.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.faiss:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the query.



In [23]:
# Plain similarity search: returns a list of Document objects (no scores)
similarity_documents = vectordb.similarity_search(query = query, k = 4, fetch_k = 20)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

In [24]:
# Collect the distinct source files of the retrieved documents.
# dict.fromkeys de-duplicates while keeping the retrieval order, whereas
# list(set(...)) would return the sources in an arbitrary order.
similarity_documents_reference = list(dict.fromkeys(doc.metadata['source'] for doc in similarity_documents))
similarity_documents_reference

['./data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf']

## similarity_search_with_score

In [25]:
# Show the signature and docstring of vectordb.similarity_search_with_score
help(vectordb.similarity_search_with_score)

Help on method similarity_search_with_score in module langchain_community.vectorstores.faiss:

similarity_search_with_score(query: 'str', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of documents most similar to the query text with
        L2 distance in float. Lower score represents more similarity.



In [26]:
# Similarity search that returns (Document, score) tuples.
# Per the help text above, a lower score represents more similarity.
similarity_documents = vectordb.similarity_search_with_score(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.31897402),
 (Document(page_content='use eye drops to lower the pressure below 20mm of merc

In [27]:
# Unzip the (document, score) pairs into two parallel tuples and show the scores
documents, scores = zip(*similarity_documents)
scores

(0.31897402, 0.5914958, 0.5934002, 0.5990593)

## similarity_search_with_relevance_scores

In [28]:
# Show the signature and docstring of vectordb.similarity_search_with_relevance_scores
help(vectordb.similarity_search_with_relevance_scores)

Help on method similarity_search_with_relevance_scores in module langchain_core.vectorstores:

similarity_search_with_relevance_scores(query: 'str', k: 'int' = 4, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs and relevance scores in the range [0, 1].
    
    0 is dissimilar, 1 is most similar.
    
    Args:
        query: input text
        k: Number of Documents to return. Defaults to 4.
        **kwargs: kwargs to be passed to similarity search. Should include:
            score_threshold: Optional, a floating point value between 0 to 1 to
                filter the resulting set of retrieved docs
    
    Returns:
        List of Tuples of (doc, similarity_score)



In [29]:
# Similarity search that returns (Document, relevance score) tuples.
# Per the help text above, 0 is dissimilar and 1 is most similar.
similarity_documents = vectordb.similarity_search_with_relevance_scores(query = query, k = 4, fetch_k = 20)
similarity_documents

[(Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
  0.6810259819030762),
 (Document(page_content='use eye drops to lower the pressure below 20mm

In [30]:
# Unzip the (document, relevance score) pairs and show the relevance scores
documents, scores = zip(*similarity_documents)
scores

(0.6810259819030762,
 0.4085041880607605,
 0.40659981966018677,
 0.4009407162666321)

## similarity_search_by_vector

In [31]:
# Show the signature and docstring of vectordb.similarity_search_by_vector
help(vectordb.similarity_search_by_vector)

Help on method similarity_search_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Dict[str, Any]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to embedding vector.
    
    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
    
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
    
    Returns:
        List of Documents most similar to the embedding.



## similarity_search_with_score_by_vector

In [32]:
# Show the signature and docstring of vectordb.similarity_search_with_score_by_vector
help(vectordb.similarity_search_with_score_by_vector)

Help on method similarity_search_with_score_by_vector in module langchain_community.vectorstores.faiss:

similarity_search_with_score_by_vector(embedding: 'List[float]', k: 'int' = 4, filter: 'Optional[Union[Callable, Dict[str, Any]]]' = None, fetch_k: 'int' = 20, **kwargs: 'Any') -> 'List[Tuple[Document, float]]' method of langchain_community.vectorstores.faiss.FAISS instance
    Return docs most similar to query.
    
    Args:
        embedding: Embedding vector to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Union[Callable, Dict[str, Any]]]): Filter by metadata.
            Defaults to None. If a callable, it must take as input the
            metadata dict of Document and return a bool.
        fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                  Defaults to 20.
        **kwargs: kwargs to be passed to similarity search. Can include:
            score_threshold: Optional, a floa

# retriever

In [33]:
# Show the signature and docstring of vectordb.as_retriever
help(vectordb.as_retriever)

Help on method as_retriever in module langchain_core.vectorstores:

as_retriever(**kwargs: 'Any') -> 'VectorStoreRetriever' method of langchain_community.vectorstores.faiss.FAISS instance
    Return VectorStoreRetriever initialized from this VectorStore.
    
    Args:
        search_type (Optional[str]): Defines the type of search that
            the Retriever should perform.
            Can be "similarity" (default), "mmr", or
            "similarity_score_threshold".
        search_kwargs (Optional[Dict]): Keyword arguments to pass to the
            search function. Can include things like:
                k: Amount of documents to return (Default: 4)
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum. (Default:

In [34]:
# Wrap the vector store in a retriever.
# search_type: 'similarity', 'similarity_score_threshold', 'mmr'
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 4,
        "score_threshold": 0.15,
        "fetch_k": 20,
    },
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F64AB3A8C0>, search_type='similarity_score_threshold', search_kwargs={'k': 4, 'score_threshold': 0.15, 'fetch_k': 20})

In [35]:
# Inspect the vector store object that backs the retriever
retriever.vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1f64ab3a8c0>

## invoke

In [36]:
# Show the signature and docstring of retriever.invoke
help(retriever.invoke)

Help on method invoke in module langchain_core.retrievers:

invoke(input: 'str', config: 'Optional[RunnableConfig]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Invoke the retriever to get relevant documents.
    
    Main entry point for synchronous retriever invocations.
    
    Args:
        input: The query string
        config: Configuration for the retriever
        **kwargs: Additional arguments to pass to the retriever
    
    Returns:
        List of relevant documents
    
    Examples:
    
    .. code-block:: python
    
        retriever.invoke("query")



In [37]:
# Run the retriever on the example query
similarity_documents = retriever.invoke(query)
similarity_documents

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='use eye drops to lower the pressure below 20mm of mercury as they refus

In [38]:
# Run the retriever on an unrelated small-talk query ("Have you eaten today?");
# in the recorded run this returned an empty list
similarity_documents = retriever.invoke("今天吃了吗")
similarity_documents



[]

# 重排序
参考： https://github.com/InternLM/HuixiangDou/blob/main/huixiangdou/service/retriever.py

In [39]:
# Base retriever for the reranking stage: request up to 30 candidates.
# search_type: 'similarity', 'similarity_score_threshold', 'mmr'
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 30,
        "score_threshold": 0.15,
    },
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F64AB3A8C0>, search_type='similarity_score_threshold', search_kwargs={'k': 30, 'score_threshold': 0.15})

In [40]:
# Local path of the reranker model
reranker_model_path = "./models/bce-reranker-base_v1"

In [41]:
# Create the BCE reranker (top_n = 4, running on CUDA with fp16 enabled)
reranker = BCERerank(
    top_n=4,
    model=reranker_model_path,
    device='cuda',
    use_fp16=True,
)
reranker

06/03/2024 20:39:30 - [INFO] -BCEmbedding.models.RerankerModel->>>    Loading from `./models/bce-reranker-base_v1`.
06/03/2024 20:39:30 - [INFO] -BCEmbedding.models.RerankerModel->>>    Execute device: cuda;	 gpu num: 1;	 use fp16: True


BCERerank(client='BCEmbedding.models.RerankerModel', top_n=4, model='./models/bce-reranker-base_v1')

In [42]:
# Combine the base retriever with the reranker: the retriever supplies the
# candidates and the reranker acts as the compressor on top of them
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=retriever,
)
compression_retriever

ContextualCompressionRetriever(base_compressor=BCERerank(client='BCEmbedding.models.RerankerModel', top_n=4, model='./models/bce-reranker-base_v1'), base_retriever=VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F64AB3A8C0>, search_type='similarity_score_threshold', search_kwargs={'k': 30, 'score_threshold': 0.15}))

## invoke

In [43]:
# Run the reranking retriever on the example query; in the recorded output the
# returned documents carry a 'relevance_score' entry in their metadata
similarity_documents = compression_retriever.invoke(query)
similarity_documents

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0, 'relevance_score': 0.6124255657196045}),
 Document(page_content='In 1969, Dr. Erich Linner explai

In [45]:
# Run the reranking retriever on an unrelated small-talk query ("Have you eaten today?");
# in the recorded run this returned an empty list
similarity_documents = compression_retriever.invoke("今天吃了吗")
similarity_documents



[]