In [7]:
# 首先导入所需第三方库
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
import os

# 遍历目录获取数据

In [None]:
# 获取文件路径函数
def get_files(dir_path):
    # args：dir_path，目标文件夹路径
    file_list = []
    for filepath, dirnames, filenames in os.walk(dir_path):
        # os.walk 函数将递归遍历指定文件夹
        for filename in filenames:
            # 通过后缀名判断文件类型是否满足要求
            if filename.endswith(".md"):
                # 如果满足要求，将其绝对路径加入到结果列表
                file_list.append(os.path.join(filepath, filename))
            elif filename.endswith(".txt"):
                file_list.append(os.path.join(filepath, filename))
            elif filename.endswith(".pdf"):
                file_list.append(os.path.join(filepath, filename))
    return file_list

In [None]:
def get_text(dir_path):
    # args：dir_path，目标文件夹路径
    # 首先调用上文定义的函数得到目标文件路径列表
    file_lst = get_files(dir_path)
    # docs 存放加载之后的纯文本对象
    docs = []
    # 遍历所有目标文件
    for one_file in tqdm(file_lst):
        print(one_file)
        file_type = one_file.split('.')[-1]
        if file_type == 'md':
            loader = UnstructuredMarkdownLoader(one_file)
        elif file_type == 'txt':
            loader = UnstructuredFileLoader(one_file)
        elif file_type == 'pdf':
            loader = PyPDFLoader(one_file)
        else:
            print("不符合条件的文件：", one_file)
            # 如果是不符合条件的文件，直接跳过
            continue
        docs.extend(loader.load())
    return docs

In [None]:
# 目标文件夹
tar_dirs = "./data"
dirs = os.listdir(tar_dirs)
dirs = [os.path.join(tar_dirs, dir) for dir in dirs]
dirs = [dir for dir in dirs if os.path.isdir(dir)]
dirs

In [None]:
# 加载目标文件
docs = []
for dir_path in dirs:
    docs.extend(get_text(dir_path))
docs[:5]

# 文本分块

In [None]:
# 对文本进行分块
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=150)
text_splitter

In [None]:
split_docs = text_splitter.split_documents(docs)
split_docs[:5]

# 向量化保存数据库

In [8]:
# 定义持久化路径
persist_directory = './vector_db/chroma'

In [9]:
# 加载开源词向量模型
embeddings = HuggingFaceEmbeddings(model_name="./models/sentence-transformer")
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='./models/sentence-transformer', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
help(Chroma.from_documents)

In [None]:
# 构建向量数据库
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
)
vectordb

In [None]:
# 将加载的向量数据库持久化到磁盘上
vectordb.persist()

# 加载数据库

In [10]:
# 加载数据库
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

## similarity_search

In [11]:
help(vectordb.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.chroma:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Dict[str, str]]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.chroma.Chroma instance
    Run similarity search with Chroma.
    
    Args:
        query (str): Query text to search for.
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
    
    Returns:
        List[Document]: List of documents most similar to the query text.



In [13]:
similarity_documents = vectordb.similarity_search(query='Vitamin C', k=5)
similarity_documents

[Document(page_content='of this discussion, vitamin C is utilized to \nshow that vitamin C intake modifies all of the \nperipheral lamellae.  \n \n                                                                              85', metadata={'page': 1, 'source': './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf'}),
 Document(page_content='before vitamin \nC  after vitamin C p-value\nHemglobin A1c (%)   5.46±0.38  4.88±0.33 0.000\nCortisol (μg/dL) 11.64±3.83  8.80±2.75 0.000\nAspartate aminotranferase (U/L) 28.09±19.92 23.85±7.65 0.000\nAlanine aminotranferase  (U/L) 28.45±20.66 25.12±17.75 0.011\nr-GTP (U/L) 32.59±28.92 25.93±18.05 0.000\nC-reactive protein(mg/L)   0.11±0.20   0.05±0.07 0.033\nvitamin C (μmol/L) 42.90±12.4 68.60±26.57 0.000Table 3. Blood test after vitamin C administration.', metadata={'page': 2, 'source': './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf'}),
 Document(page

In [15]:
similarity_documents_reference = list(set([doc.metadata['source'] for doc in similarity_documents]))
similarity_documents_reference


['./data\\FM docs 2024.3\\JOM_1998_13_4_06_The_Application_of_the_Hardin_Jones-Pauling-.pdf',
 './data\\FM docs 2024.3\\JOM_2005_20_2_06_Vitamin_C_as_an_Ergogenic_Aid.pdf',
 './data\\FM docs 2024.3\\JOM_1994_09_3_02_Pride_Prejudice_and_Vitamin_C.pdf',
 './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf',
 './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf']

In [21]:
import os
os.path.split('./data\\FM docs 2024.3\\JOM_1998_13_4_06_The_Application_of_the_Hardin_Jones-Pauling-.pdf')

('./data\\FM docs 2024.3',
 'JOM_1998_13_4_06_The_Application_of_the_Hardin_Jones-Pauling-.pdf')

In [10]:
similarity_documents = vectordb.asimilarity_search(query='Vitamin C', k=5)
similarity_documents

<coroutine object VectorStore.asimilarity_search at 0x0000020FBCB71930>

## search

In [11]:
help(vectordb.search)

Help on method search in module langchain_core.vectorstores:

search(query: 'str', search_type: 'str', **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.chroma.Chroma instance
    Return docs most similar to query using specified search type.



In [12]:
# search_type: 'similarity' or 'mmr'.
similarity_documents = vectordb.search(query='Vitamin C', search_type='similarity', k=5)
similarity_documents

[Document(page_content='of this discussion, vitamin C is utilized to \nshow that vitamin C intake modifies all of the \nperipheral lamellae.  \n \n                                                                              85', metadata={'page': 1, 'source': './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf'}),
 Document(page_content='before vitamin \nC  after vitamin C p-value\nHemglobin A1c (%)   5.46±0.38  4.88±0.33 0.000\nCortisol (μg/dL) 11.64±3.83  8.80±2.75 0.000\nAspartate aminotranferase (U/L) 28.09±19.92 23.85±7.65 0.000\nAlanine aminotranferase  (U/L) 28.45±20.66 25.12±17.75 0.011\nr-GTP (U/L) 32.59±28.92 25.93±18.05 0.000\nC-reactive protein(mg/L)   0.11±0.20   0.05±0.07 0.033\nvitamin C (μmol/L) 42.90±12.4 68.60±26.57 0.000Table 3. Blood test after vitamin C administration.', metadata={'page': 2, 'source': './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf'}),
 Document(page

In [13]:
similarity_documents = vectordb.asearch(query='Vitamin C', search_type='similarity', k=5)
similarity_documents

<coroutine object VectorStore.asearch at 0x0000020FBCB71E70>

## retriever

In [15]:
help(vectordb.as_retriever)

Help on method as_retriever in module langchain_core.vectorstores:

as_retriever(**kwargs: 'Any') -> 'VectorStoreRetriever' method of langchain_community.vectorstores.chroma.Chroma instance
    Return VectorStoreRetriever initialized from this VectorStore.
    
    Args:
        search_type (Optional[str]): Defines the type of search that
            the Retriever should perform.
            Can be "similarity" (default), "mmr", or
            "similarity_score_threshold".
        search_kwargs (Optional[Dict]): Keyword arguments to pass to the
            search function. Can include things like:
                k: Amount of documents to return (Default: 4)
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum. (Defaul

In [16]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000020F8BBCAC20>, search_kwargs={'k': 5})

In [17]:
retriever.vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x20f8bbcac20>

In [18]:
help(retriever.invoke)

Help on method invoke in module langchain_core.retrievers:

invoke(input: 'str', config: 'Optional[RunnableConfig]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Invoke the retriever to get relevant documents.
    
    Main entry point for synchronous retriever invocations.
    
    Args:
        input: The query string
        config: Configuration for the retriever
        **kwargs: Additional arguments to pass to the retriever
    
    Returns:
        List of relevant documents
    
    Examples:
    
    .. code-block:: python
    
        retriever.invoke("query")



In [19]:
similarity_documents = retriever.invoke('Vitamin C')
similarity_documents

[Document(page_content='of this discussion, vitamin C is utilized to \nshow that vitamin C intake modifies all of the \nperipheral lamellae.  \n \n                                                                              85', metadata={'page': 1, 'source': './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf'}),
 Document(page_content='before vitamin \nC  after vitamin C p-value\nHemglobin A1c (%)   5.46±0.38  4.88±0.33 0.000\nCortisol (μg/dL) 11.64±3.83  8.80±2.75 0.000\nAspartate aminotranferase (U/L) 28.09±19.92 23.85±7.65 0.000\nAlanine aminotranferase  (U/L) 28.45±20.66 25.12±17.75 0.011\nr-GTP (U/L) 32.59±28.92 25.93±18.05 0.000\nC-reactive protein(mg/L)   0.11±0.20   0.05±0.07 0.033\nvitamin C (μmol/L) 42.90±12.4 68.60±26.57 0.000Table 3. Blood test after vitamin C administration.', metadata={'page': 2, 'source': './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf'}),
 Document(page