In [1]:
# 首先导入所需第三方库
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
    PyPDFLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
import os

# 遍历目录获取数据

In [2]:
# Helper that collects the paths of all supported documents
def get_files(dir_path):
    """Recursively collect the paths of all Markdown, text and PDF files.

    Args:
        dir_path: Root folder to search.

    Returns:
        list[str]: One path per matching file, built by joining the folder
        reported by ``os.walk`` with the file name (so the paths are relative
        whenever ``dir_path`` is relative). Files appear in the order
        ``os.walk`` yields them.
    """
    # str.endswith accepts a tuple, so a single test covers every supported type
    supported_suffixes = (".md", ".txt", ".pdf")
    file_list = []
    # os.walk descends into every sub-folder of dir_path
    for filepath, _dirnames, filenames in os.walk(dir_path):
        for filename in filenames:
            # Compare in lower case so "REPORT.PDF" is treated like "report.pdf"
            if filename.lower().endswith(supported_suffixes):
                file_list.append(os.path.join(filepath, filename))
    return file_list

In [3]:
def get_text(dir_path):
    """Load every supported document found under ``dir_path``.

    Args:
        dir_path: Root folder to search; passed on to ``get_files``.

    Returns:
        list: Everything returned by each loader's ``load()`` call, gathered
        into one flat list in the order the files were visited.
    """
    # Extension (lower case, leading dot included) -> loader class for that type
    loader_by_suffix = {
        ".md": UnstructuredMarkdownLoader,
        ".txt": UnstructuredFileLoader,
        ".pdf": PyPDFLoader,
    }
    # docs accumulates the loaded document objects from every file
    docs = []
    for one_file in tqdm(get_files(dir_path)):
        # tqdm.write prints the message without corrupting the active progress bar
        tqdm.write(one_file)
        # splitext inspects only the last path component, so a dot in a folder
        # name can never be mistaken for a file extension
        suffix = os.path.splitext(one_file)[1].lower()
        loader_cls = loader_by_suffix.get(suffix)
        if loader_cls is None:
            # Unsupported file type: report it and move on to the next file
            tqdm.write(f"不符合条件的文件： {one_file}")
            continue
        docs.extend(loader_cls(one_file).load())
    return docs

In [4]:
# Root folder whose immediate sub-folders hold the source documents
tar_dirs = "./data"
# Join each entry onto the root and keep only the ones that are directories
dirs = [
    candidate
    for candidate in (os.path.join(tar_dirs, entry) for entry in os.listdir(tar_dirs))
    if os.path.isdir(candidate)
]
dirs

['./data\\FM docs 2024.3']

In [5]:
# Load the documents of every source folder into one flat list
docs = [doc for dir_path in dirs for doc in get_text(dir_path)]
# Preview the first five loaded documents
docs[:5]

  0%|          | 0/134 [00:00<?, ?it/s]

  1%|          | 1/134 [00:00<00:19,  6.65it/s]Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 57 0 (offset 0)


./data\FM docs 2024.3\Eye Pressure Lowering Effect of Vitamin C.pdf
./data\FM docs 2024.3\High-Dose-Intravenous-Vitamin-C-Treatment-for-Zika-Fever-31.1.pdf


  3%|▎         | 4/134 [00:00<00:12, 10.16it/s]

./data\FM docs 2024.3\JOM_1970_02_4_02_Vitamin_B3_and_Krebiozen_-_a_polemic.pdf
./data\FM docs 2024.3\JOM_1970_02_4_04_The_Parenteral_Use_of_Vitamins_in_the_Treatment-.pdf
./data\FM docs 2024.3\JOM_1971_03_1_03_A_Vitamin_B3_Dependent_Family.pdf


  4%|▍         | 6/134 [00:00<00:14,  8.60it/s]

./data\FM docs 2024.3\JOM_1971_03_1_05_The_Use_of_Mega_Vitamin_Therapy_in_Regulating_Severe-.pdf
./data\FM docs 2024.3\JOM_1971_03_2_07_Vitamin_B3_Dependent_Child.pdf


  5%|▌         | 7/134 [00:00<00:15,  8.28it/s]

./data\FM docs 2024.3\JOM_1972_01_1_05_The_Use_of_Vitamin_B12b_in_Psychiatric_Practice.pdf


  6%|▌         | 8/134 [00:01<00:19,  6.44it/s]

./data\FM docs 2024.3\JOM_1972_01_2-3_09_A_Study_of_Neurological_Organization_Procedures-.pdf


  7%|▋         | 9/134 [00:01<00:25,  4.94it/s]

./data\FM docs 2024.3\JOM_1973_02_3_03_Clinical_Observations_on_the_Treatment_of_Schizophrenic-.pdf


  8%|▊         | 11/134 [00:01<00:26,  4.64it/s]

./data\FM docs 2024.3\JOM_1973_02_3_05_An_Examination_of_the_Double-Blind_Method-.pdf
./data\FM docs 2024.3\JOM_1974_03_3_02_Early_Evidence_About_Vitamin_C_And_the_Common_Cold.pdf


 10%|█         | 14/134 [00:02<00:22,  5.33it/s]

./data\FM docs 2024.3\JOM_1974_03_3_11_To_the_Editor_The_Road_to_Shangri-La_is_Paved_with-.pdf
./data\FM docs 2024.3\JOM_1974_03_4_06_Megavitamins.pdf
./data\FM docs 2024.3\JOM_1975_04_1_11_Administration_of_Massive_Doses_of_Vitamin_E-.pdf
./data\FM docs 2024.3\JOM_1975_04_2_04_The_Vitamin_D-Problem_An_Important_Lesson-.pdf


 12%|█▏        | 16/134 [00:02<00:19,  5.99it/s]

./data\FM docs 2024.3\JOM_1975_04_3_06_Diet-Vitamin_Program_for_Jail_Inmates.pdf


 13%|█▎        | 17/134 [00:03<00:25,  4.62it/s]

./data\FM docs 2024.3\JOM_1975_04_4_07_The_Use_of_Megavitamin_Treatment_in_Children-.pdf


 13%|█▎        | 18/134 [00:03<00:25,  4.54it/s]

./data\FM docs 2024.3\JOM_1975_04_4_09_An_Update_of_Megavitamin_Therapy_in_Orthomolecular-.pdf


 15%|█▍        | 20/134 [00:03<00:24,  4.61it/s]

./data\FM docs 2024.3\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf
./data\FM docs 2024.3\JOM_1976_05_3_03_Megavitamin_Therapy_for_Different_Cases.pdf


 16%|█▋        | 22/134 [00:04<00:25,  4.41it/s]

./data\FM docs 2024.3\JOM_1976_05_3_05_Massive_Vitamin_C_as_an_Adjunct_in_Methadone_Maintenance-.pdf
./data\FM docs 2024.3\JOM_1976_05_3_08_Stomach_Acid_and_Megavitamins.pdf


 18%|█▊        | 24/134 [00:04<00:20,  5.32it/s]

./data\FM docs 2024.3\JOM_1976_05_4_02_A_Report_on_a_Twelve-Month_Period_of_Treating_Metabolic_Diseases-.pdf
./data\FM docs 2024.3\JOM_1977_06_1_01_Does_Ascorbic_Acid_Destroy_Vitamin_B12.pdf


 19%|█▉        | 26/134 [00:04<00:17,  6.23it/s]

./data\FM docs 2024.3\JOM_1977_06_1_06_Meditation_Protein_Diet_and_Megavitamins_in_the_Treatment-.pdf
./data\FM docs 2024.3\JOM_1978_07_2_03_Reduction_of_Blood_Lead_Levels_in_Battery_Workers-.pdf


 21%|██        | 28/134 [00:05<00:19,  5.33it/s]

./data\FM docs 2024.3\JOM_1978_07_4_06_A_Reply_to_the_American_Psychiatric_Association_Task-.pdf
./data\FM docs 2024.3\JOM_1979_08_2_02_Large_Amounts_of_Nicotinic_Acid_and_Vitamin_B12-.pdf
./data\FM docs 2024.3\JOM_1979_08_2_04_X-Linked_Dominant_Manic-Depressive_Illness_Linkage_with-.pdf


 22%|██▏       | 30/134 [00:05<00:16,  6.44it/s]

./data\FM docs 2024.3\JOM_1979_08_4_06_Resistance_To_Orthomolecular_Medicine_Or_Why_You_Dont-.pdf
./data\FM docs 2024.3\JOM_1979_08_4_08_Orthomolecular_Medicine_and_Megavitamin_Therapy-.pdf


 25%|██▍       | 33/134 [00:05<00:15,  6.44it/s]

./data\FM docs 2024.3\JOM_1980_09_1_06_Vitamins_B1_B6_and_B12_In_The_Adjunctive_Treatment-.pdf
./data\FM docs 2024.3\JOM_1980_09_1_10_Vitamins_The_Get-Smart_Pills.pdf


 25%|██▌       | 34/134 [00:06<00:17,  5.62it/s]

./data\FM docs 2024.3\JOM_1981_10_2_06_The_Method_of_Determining_Proper_Doses_of_Vitamin_C-.pdf


 27%|██▋       | 36/134 [00:06<00:18,  5.28it/s]

./data\FM docs 2024.3\JOM_1981_10_3_04_Psychiatric_Significance_of_the_Plasma_Concentrations-.pdf
./data\FM docs 2024.3\JOM_1982_11_1_06_Vitamins_Bl_B6_and_B12_in_the_Adjunctive_Treatment-.pdf
./data\FM docs 2024.3\JOM_1982_11_2_02_Vitamin_B6_Nutritional_Status_of_a_Psychiatric-.pdf


 28%|██▊       | 38/134 [00:06<00:16,  5.70it/s]

./data\FM docs 2024.3\JOM_1982_11_2_07_Vitamin_C_and_Tolerance_of_Heat_and_Cold_Human_Evidence.pdf
./data\FM docs 2024.3\JOM_1982_11_4_03_Vitamin_B15_A_Review_and_Update.pdf


 31%|███       | 41/134 [00:07<00:14,  6.22it/s]

./data\FM docs 2024.3\JOM_1983_12_4_08_Vitamin_B-12_Levels_of_Cerebrospinal_Fluid_in_Patients-.pdf
./data\FM docs 2024.3\JOM_1984_13_2_03_Alzheimers_Disease_Alcohol_Dementia_Association-.pdf


 33%|███▎      | 44/134 [00:07<00:13,  6.82it/s]

./data\FM docs 2024.3\JOM_1984_13_4_08_The_Effect_of_EDTA_Chelation_Therapy_With_Multivitamin-.pdf
./data\FM docs 2024.3\JOM_1985_14_1_08_The_Clinical_Change_in_Patients_Treated_with_EDTA-.pdf
./data\FM docs 2024.3\JOM_1985_14_4_01_National_Institute_of_Health_Promotes_Megavitamin_Therapy.pdf
./data\FM docs 2024.3\JOM_1986_01_1_05_The_Prevention_Of_Tardive_Dyskinesia_with_High_Dosage-.pdf


 35%|███▌      | 47/134 [00:08<00:12,  7.06it/s]

./data\FM docs 2024.3\JOM_1986_01_1_10_Vitamin_Therapy_for_Hyperactivity_and_Schizophreniae-.pdf


 37%|███▋      | 49/134 [00:08<00:11,  7.70it/s]

./data\FM docs 2024.3\JOM_1986_01_4_04_The_Ideal_Vitamin_C_Intake.pdf
./data\FM docs 2024.3\JOM_1986_01_4_06_Alzheimers_Dementia_Some_Possible_Mechanisms_Related_To_Vitamins-.pdf
./data\FM docs 2024.3\JOM_1987_02_2_02_Im_Schizophrenic_Doctor_Not_Stupid.pdf


 37%|███▋      | 50/134 [00:08<00:12,  6.80it/s]XRef object at 1067 can not be read, some object may be missing
XRef object at 969 can not be read, some object may be missing
 39%|███▉      | 52/134 [00:08<00:09,  8.78it/s]XRef object at 969 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1988_03_1_06_Around_The_World_International_Vitamin_Convention-.pdf
./data\FM docs 2024.3\JOM_1988_03_1_11_Correspondence.pdf
./data\FM docs 2024.3\JOM_1988_03_2_05_Around_The_World_AIDS_Vitamin_C_and_Egg_Lecithin.pdf
./data\FM docs 2024.3\JOM_1989_04_4_06_Megavitamin_Therapy_in_the_Reduction_of_Anxiety-.pdf


 40%|████      | 54/134 [00:08<00:08,  9.46it/s]

./data\FM docs 2024.3\JOM_1990_05_1_02_Case_study_High_Dose_intravenous_Vitamin_C_in_the-.pdf
./data\FM docs 2024.3\JOM_1990_05_1_04_Nutritional_Interrelationships_Minerals_Vitamins-.pdf


 42%|████▏     | 56/134 [00:09<00:09,  8.20it/s]XRef object at 1201 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1990_05_3_06_Hardin_Jones_Biostatistical_Analysis_of_Mortality_Data-.pdf


 43%|████▎     | 57/134 [00:09<00:18,  4.14it/s]XRef object at 1104 can not be read, some object may be missing
 43%|████▎     | 58/134 [00:10<00:17,  4.42it/s]XRef object at 1041 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1991_06_1_04_Cardiovascular_Dynamics_and_Edta_Chelation_with-.pdf
./data\FM docs 2024.3\JOM_1991_06_1_07_The_Nutritional_Relationships_of_Vitamin_A.pdf


 44%|████▍     | 59/134 [00:10<00:16,  4.68it/s]XRef object at 1009 can not be read, some object may be missing
XRef object at 1032 can not be read, some object may be missing
 46%|████▌     | 61/134 [00:10<00:12,  5.93it/s]

./data\FM docs 2024.3\JOM_1991_06_2_05_Treatment_of_Hypercholesterolemia_with_Vitamin_E_C-.pdf
./data\FM docs 2024.3\JOM_1991_06_2_08_The_Origin_of_the_42-year_Stonewall_of_Vitamin_C.pdf


XRef object at 946 can not be read, some object may be missing
XRef object at 974 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1991_06_3-4_02_Welcome_To_Second_World_Congress_on_Vitamin_C.pdf
./data\FM docs 2024.3\JOM_1991_06_3-4_07_Vitamin_C_and_Stomatology_A_Mouthful_of_Evidence.pdf


 47%|████▋     | 63/134 [00:10<00:11,  6.28it/s]XRef object at 973 can not be read, some object may be missing
 48%|████▊     | 64/134 [00:11<00:11,  5.95it/s]XRef object at 944 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1991_06_3-4_08_Clinical_Procedures_in_Treating_Terminally_Ill_Cancer-.pdf
./data\FM docs 2024.3\JOM_1991_06_3-4_09_Vitamin_C_and_Multifactorial_Disease.pdf


 49%|████▊     | 65/134 [00:11<00:11,  6.25it/s]XRef object at 1156 can not be read, some object may be missing
 49%|████▉     | 66/134 [00:11<00:11,  5.96it/s]

./data\FM docs 2024.3\JOM_1991_06_3-4_10_Vitamin_C_Deficiency_Cholesterol_Metabolism-.pdf


XRef object at 972 can not be read, some object may be missing
 50%|█████     | 67/134 [00:11<00:11,  6.02it/s]XRef object at 1094 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1991_06_3-4_12_Children_Vitamin_C_and_Medical_Progress.pdf
./data\FM docs 2024.3\JOM_1992_07_1_06_Vitamin_Mineral_Supplementation_and_the_intelligence-.pdf


 51%|█████     | 68/134 [00:11<00:12,  5.41it/s]XRef object at 1067 can not be read, some object may be missing
 51%|█████▏    | 69/134 [00:11<00:10,  5.99it/s]XRef object at 948 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1992_07_4_02_The_Third_Face_of_Vitamin_C.pdf
./data\FM docs 2024.3\JOM_1992_07_4_04_Cancer_Immunology_and_Aging_The_Nutritional_influence.pdf


 52%|█████▏    | 70/134 [00:11<00:09,  6.53it/s]XRef object at 1158 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1993_08_2_09_Megavitamins_and_Psychotherapy_Effective_Economical-.pdf


 53%|█████▎    | 71/134 [00:12<00:17,  3.58it/s]XRef object at 1132 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1993_08_3_09_Hardin_Jones_Biostatistical_Analysis_of_Mortality_Data-.pdf


 54%|█████▎    | 72/134 [00:13<00:21,  2.89it/s]XRef object at 1080 can not be read, some object may be missing


./data\FM docs 2024.3\JOM_1994_09_1_04_Vitamin_C_and_Fatigue.pdf


 54%|█████▍    | 73/134 [00:13<00:19,  3.17it/s]XRef object at 1079 can not be read, some object may be missing
XRef object at 941 can not be read, some object may be missing
 56%|█████▌    | 75/134 [00:13<00:12,  4.70it/s]

./data\FM docs 2024.3\JOM_1994_09_1_06_Vitamin_B6_and_Carpal_Tunnel_Syndrome_A_Case_Report.pdf
./data\FM docs 2024.3\JOM_1994_09_3_02_Pride_Prejudice_and_Vitamin_C.pdf
./data\FM docs 2024.3\JOM_1995_10_2_05_High_Dose_intravenous_Vitamin_C_and_Long_Time_Survival- (1).pdf


 58%|█████▊    | 78/134 [00:13<00:07,  7.52it/s]

./data\FM docs 2024.3\JOM_1995_10_2_05_High_Dose_intravenous_Vitamin_C_and_Long_Time_Survival-.pdf
./data\FM docs 2024.3\JOM_1995_10_2_07_Treatment_of_Iritis_and_Herpes_Zoster_with_Vitamin_C.pdf
./data\FM docs 2024.3\JOM_1996_11_2_04_Intravenous_Vitamin_C_in_A_Terminal_Cancer_Patient (1).pdf
./data\FM docs 2024.3\JOM_1996_11_2_04_Intravenous_Vitamin_C_in_A_Terminal_Cancer_Patient.pdf
./data\FM docs 2024.3\JOM_1998_13_2_02_High-dose_intravenous_Vitamin_C_in_the_Treatment_of_A-.pdf


 60%|██████    | 81/134 [00:13<00:05, 10.15it/s]

./data\FM docs 2024.3\JOM_1998_13_4_02_Observations_On_the_Dose_and_Administration_of_Ascorbic-.pdf


 62%|██████▏   | 83/134 [00:14<00:07,  6.53it/s]

./data\FM docs 2024.3\JOM_1998_13_4_05_The_Health_of_the_NaturopathVitamin_Supplementation-.pdf
./data\FM docs 2024.3\JOM_1998_13_4_06_The_Application_of_the_Hardin_Jones-Pauling-.pdf


 63%|██████▎   | 85/134 [00:14<00:08,  5.76it/s]

./data\FM docs 2024.3\JOM_1999_14_1_03_Treatment_of_Ambulant_Schizophrenics_with_Vitamin_B3- (1).pdf
./data\FM docs 2024.3\JOM_1999_14_1_03_Treatment_of_Ambulant_Schizophrenics_with_Vitamin_B3-.pdf


 65%|██████▍   | 87/134 [00:15<00:07,  6.16it/s]

./data\FM docs 2024.3\JOM_2000_15_4_01_Vitamin_C_and_Cancer_-_A_Workshop.pdf
./data\FM docs 2024.3\JOM_2000_15_4_02_Vitamin_C_as_Cancer_Therapy_An_Overview.pdf


 66%|██████▌   | 88/134 [00:15<00:08,  5.69it/s]

./data\FM docs 2024.3\JOM_2000_15_4_03_Vitamin_C_Case_History_of_an_Alternative_Cancer_Therapy.pdf


 67%|██████▋   | 90/134 [00:15<00:08,  5.13it/s]

./data\FM docs 2024.3\JOM_2000_15_4_04_Clinical_Evaluation_of_Vitamin_C_and_other-.pdf
./data\FM docs 2024.3\JOM_2001_16_3_08_Vitamin_C_in_Cardiovascular_Disease.pdf


 68%|██████▊   | 91/134 [00:15<00:07,  5.50it/s]

./data\FM docs 2024.3\JOM_2001_16_3_10_The_Effect_of_Alternating_Magnetic_Field_Exposure_and-.pdf


 69%|██████▉   | 93/134 [00:17<00:15,  2.69it/s]

./data\FM docs 2024.3\JOM_2001_16_4_05_Vitamin_C_Symptoms_and_Respiratory_Symptoms.pdf
./data\FM docs 2024.3\JOM_2002_17_1_03_The_Role_of_Vitamins_B3_and_C_in_the_Treatment-.pdf


 70%|███████   | 94/134 [00:17<00:12,  3.14it/s]

./data\FM docs 2024.3\JOM_2002_17_1_04_Fatigue_and_Vitamin_C.pdf
./data\FM docs 2024.3\JOM_2002_17_2_06_Case_from_the_Center_Sixteen-Year_History_with_High-.pdf


 72%|███████▏  | 96/134 [00:18<00:12,  3.01it/s]

./data\FM docs 2024.3\JOM_2002_17_4_07_Vitamin_C_and_Oxidative_DNA_Damage_Revisited.pdf


 72%|███████▏  | 97/134 [00:18<00:13,  2.65it/s]

./data\FM docs 2024.3\JOM_2003_18_2_05_Effect_of_Vitamin_C_Supplementation_on_Ex_Vivo_Immune-.pdf


 73%|███████▎  | 98/134 [00:18<00:12,  2.90it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_03_Vitamin_A_and_Beta-Carotene.pdf


 74%|███████▍  | 99/134 [00:19<00:13,  2.56it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_04_Negative_and_Positive_Side_Effects_of_Vitamin_B3.pdf


 75%|███████▌  | 101/134 [00:20<00:11,  2.86it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_05_Vitamin_B6_Extract_of_Submission_to_the_UK’s_Food-.pdf
./data\FM docs 2024.3\JOM_2003_18_3-4_08_The_Trials_and_Tribulations_of_Vitamin_C.pdf


 76%|███████▌  | 102/134 [00:20<00:13,  2.44it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_09_The_Gift_of_Vitamin_C.pdf


 77%|███████▋  | 103/134 [00:20<00:11,  2.79it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_10_Vitamin_D_Deficiency_Diversity_and_Dosage.pdf


 78%|███████▊  | 104/134 [00:21<00:11,  2.66it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_11_Vitamin_E_A_Cure_in_Search_of_Recognition.pdf


 79%|███████▉  | 106/134 [00:21<00:07,  3.62it/s]

./data\FM docs 2024.3\JOM_2003_18_3-4_12_Can_Vitamin_Supplements_Take_the_Place_of_a_Bad_Diet.pdf
./data\FM docs 2024.3\JOM_2004_19_1_04_Vitamin_D_Supplementation_in_the_Fight_Against_Multiple-.pdf


 81%|████████  | 108/134 [00:22<00:07,  3.70it/s]

./data\FM docs 2024.3\JOM_2004_19_4_01_The_Use_of_Vitamin_C_and_Other_Antioxidants_with-.pdf
./data\FM docs 2024.3\JOM_2004_19_4_02_The_Use_of_Vitamin_C_with_Chemotherapy_in_Cancer-.pdf


 82%|████████▏ | 110/134 [00:24<00:13,  1.81it/s]

./data\FM docs 2024.3\JOM_2005_20_1_03_Folic_Acid_Vitamin_D_and_Prehistoric_Polymorphisms-.pdf
./data\FM docs 2024.3\JOM_2005_20_2_06_Vitamin_C_as_an_Ergogenic_Aid.pdf


 84%|████████▍ | 113/134 [00:24<00:05,  3.61it/s]

./data\FM docs 2024.3\JOM_2005_20_2_11_Vitamin_C_and_Osteoporosis_Is_There_a_Connection.pdf
./data\FM docs 2024.3\JOM_2005_20_3_01_Can_One_Vitamin_Overcome_the_General_Nutrient-.pdf


 85%|████████▌ | 114/134 [00:24<00:04,  4.08it/s]

./data\FM docs 2024.3\JOM_2005_20_4_07_Screening_for_Vitamin_C_in_the_Urine_Is_it_Clinically-.pdf
./data\FM docs 2024.3\JOM_2006_21_1_05_Vitamin_D_and_Health_Implications_for_High-latitude-.pdf


 87%|████████▋ | 116/134 [00:25<00:05,  3.03it/s]

./data\FM docs 2024.3\JOM_2006_21_4_03_Special_Report_False_Positive_Finger_Stick_Blood-.pdf
./data\FM docs 2024.3\JOM_2006_21_4_07_Clinical_Experiences_with_a_Vitamin_B3_Dependent_Family.pdf


 88%|████████▊ | 118/134 [00:26<00:04,  3.56it/s]

./data\FM docs 2024.3\JOM_2007_22_1_03_Poor_Methodology_in_Meta-Analysis_of_Vitamins.pdf
./data\FM docs 2024.3\JOM_2007_22_1_05_Schedule-Dependence_in_Cancer_Therapy_What_is_the_True-.pdf


 89%|████████▉ | 119/134 [00:27<00:06,  2.30it/s]

./data\FM docs 2024.3\JOM_2007_22_3_02_Safety_and_Effectiveness_of_Vitamins.pdf


 90%|████████▉ | 120/134 [00:27<00:06,  2.00it/s]

./data\FM docs 2024.3\JOM_2007_22_3_08_The_Effect_of_High_Dose_IV_Vitamin_C_on_Plasma-.pdf


 90%|█████████ | 121/134 [00:27<00:05,  2.31it/s]

./data\FM docs 2024.3\JOM_2008_23_1_04_Vitamin_K_Deficiency_Disease.pdf


 91%|█████████ | 122/134 [00:28<00:04,  2.57it/s]

./data\FM docs 2024.3\JOM_2008_23_2_03_Vitamin_C_and_the_Common_Cold.pdf


 92%|█████████▏| 123/134 [00:28<00:04,  2.53it/s]

./data\FM docs 2024.3\JOM_2008_23_3_04_The_Real_Story_of_Vitamin_C_and_Cancer.pdf


 93%|█████████▎| 124/134 [00:28<00:03,  2.90it/s]

./data\FM docs 2024.3\JOM_2008_23_4_03_Vitamin_C_and_Chemotherapy.pdf


 94%|█████████▍| 126/134 [00:29<00:02,  3.37it/s]

./data\FM docs 2024.3\JOM_2008_23_4_05_The_Proper_Treatment_of_Schizophrenia_Requires_Optimal-.pdf


 95%|█████████▍| 127/134 [00:29<00:01,  3.83it/s]

./data\FM docs 2024.3\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf
./data\FM docs 2024.3\JOM_2008_23_4_10_Correspondence VE.pdf


 96%|█████████▌| 128/134 [00:29<00:01,  3.44it/s]

./data\FM docs 2024.3\JOM_2009_24_1_04_Antioxidant_Vitamins_Reduce_the_Risk_for_Cancer_Part_One.pdf


 97%|█████████▋| 130/134 [00:30<00:01,  3.05it/s]

./data\FM docs 2024.3\JOM_2009_24_1_07_Correspondence.pdf
./data\FM docs 2024.3\JOM_2009_24_2_02_Vitamin_D_25-OH-D3_Status_of_200_Chronically_Ill-.pdf
./data\FM docs 2024.3\JOM_2009_24_2_04_Antioxidant_Vitamins_Reduce_the_Risk_for_Cancer_Part_Two.pdf


 99%|█████████▊| 132/134 [00:32<00:01,  1.68it/s]

./data\FM docs 2024.3\Role of Fat-Soluble Vitamins A and D in the Pathogenesis of Influenza.pdf


100%|██████████| 134/134 [00:35<00:00,  3.81it/s]

./data\FM docs 2024.3\Vitamin C and Hot Flashes.pdf





[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forced to\nuse eye drops to lower the pressure below 20mm of mercury as they refused to take vita-\nmin C.\nConclusion\nIn this series of 30 patients there was no\noccasion in which the pressure was not low-ered w

# 文本分块

In [6]:
# Configure the splitter that cuts the loaded documents into chunks:
# chunk_size is the maximum chunk length and chunk_overlap is how much
# consecutive chunks share (presumably both counted in characters — confirm).
# NOTE(review): the embedding model loaded further down reports
# max_seq_length=128; confirm chunks of this size are not truncated when embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=150)
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x2197ae2a530>

In [7]:
# Apply the splitter to the loaded documents, then preview the first five chunks
split_docs = text_splitter.split_documents(docs)
split_docs[:5]

[Document(page_content='Eye Pressure Lowering Effect of Vitamin C\nHerschell H. Boyd, M.D.1\nPurpose\nTo document the pressure before the use\nof vitamin C and after the daily intake of\nmaximum amounts of vitamin C, three times\na day.\nMethods\nThirty patients (16 men and 14 women)\nwere advised to take three divided doses of\nvitamin C in capsule form each day until\nloose stools occured and then back downslightly from this amount (bowel dosage) for\na daily intake. Average daily intake for all\npatients was 10 grams per day.', metadata={'source': './data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'page': 0}),
 Document(page_content='a daily intake. Average daily intake for all\npatients was 10 grams per day.\nResults\nThe greatest lowering of pressure was 13\nmm as measured with a Goldmann tonom-\neter. The least lowering of pressure was 1\nmm. The average for 30 patients was 10 mm.Thirty patients were controlled only with\nvitamin C. Twenty patients were forc

# 向量化保存数据库

In [8]:
# Directory in which the Chroma vector store is persisted
persist_directory = './vector_db/chroma'

In [9]:
# Load the open-source sentence-embedding model from the local
# ./sentence-transformer folder, then display its configuration
embeddings = HuggingFaceEmbeddings(model_name="./sentence-transformer")
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='./sentence-transformer', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
# Exploration aid: print the signature and docstring of Chroma.from_documents
help(Chroma.from_documents)

Help on method from_documents in module langchain_community.vectorstores.chroma:

from_documents(documents: 'List[Document]', embedding: 'Optional[Embeddings]' = None, ids: 'Optional[List[str]]' = None, collection_name: 'str' = 'langchain', persist_directory: 'Optional[str]' = None, client_settings: 'Optional[chromadb.config.Settings]' = None, client: 'Optional[chromadb.Client]' = None, collection_metadata: 'Optional[Dict]' = None, **kwargs: 'Any') -> 'Chroma' method of abc.ABCMeta instance
    Create a Chroma vectorstore from a list of documents.
    
    If a persist_directory is specified, the collection will be persisted there.
    Otherwise, the data will be ephemeral in-memory.
    
    Args:
        collection_name (str): Name of the collection to create.
        persist_directory (Optional[str]): Directory to persist the collection.
        ids (Optional[List[str]]): List of document IDs. Defaults to None.
        documents (List[Document]): List of documents to add to the vect

In [11]:
# Build the vector database: embed every chunk and store it in Chroma
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm, and clear persist_directory before rebuilding
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory=persist_directory  # directory in which the collection is persisted on disk
)
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x2197ef192d0>

In [12]:
# Write the vector database built above to disk
# NOTE(review): newer Chroma releases reportedly persist automatically — confirm
# whether this explicit call is still needed for the installed version
vectordb.persist()

# 加载数据库

In [13]:
# Re-open the vector database from disk, replacing the in-memory object built above
vectordb = Chroma(
    persist_directory=persist_directory,  # directory the persisted collection is read from
    embedding_function=embeddings
)

In [14]:
# Wrap the vector store in a retriever object (displays a VectorStoreRetriever)
vectordb.as_retriever()

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002192CD9C1F0>)

In [15]:
# The retriever keeps a reference to the underlying Chroma store
vectordb.as_retriever().vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x2192cd9c1f0>

In [29]:
# Exploration aid: print the signature and docstring of similarity_search
help(vectordb.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.chroma:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Dict[str, str]]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.chroma.Chroma instance
    Run similarity search with Chroma.
    
    Args:
        query (str): Query text to search for.
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
    
    Returns:
        List[Document]: List of documents most similar to the query text.



In [16]:
# Retrieve the 5 stored chunks most similar to the query text
vectordb.similarity_search(query='Vitamin C', k=5)

[Document(page_content='of this discussion, vitamin C is utilized to \nshow that vitamin C intake modifies all of the \nperipheral lamellae.  \n \n                                                                              85', metadata={'page': 1, 'source': './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf'}),
 Document(page_content='before vitamin \nC  after vitamin C p-value\nHemglobin A1c (%)   5.46±0.38  4.88±0.33 0.000\nCortisol (μg/dL) 11.64±3.83  8.80±2.75 0.000\nAspartate aminotranferase (U/L) 28.09±19.92 23.85±7.65 0.000\nAlanine aminotranferase  (U/L) 28.45±20.66 25.12±17.75 0.011\nr-GTP (U/L) 32.59±28.92 25.93±18.05 0.000\nC-reactive protein(mg/L)   0.11±0.20   0.05±0.07 0.033\nvitamin C (μmol/L) 42.90±12.4 68.60±26.57 0.000Table 3. Blood test after vitamin C administration.', metadata={'page': 2, 'source': './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf'}),
 Document(page

In [17]:
vectordb.asimilarity_search(query='Vitamin C', k=5)

<coroutine object VectorStore.asimilarity_search at 0x000002192D2F8DD0>

In [18]:
# Exploration aid: print the signature and docstring of the generic search method
help(vectordb.search)

Help on method search in module langchain_core.vectorstores:

search(query: 'str', search_type: 'str', **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.chroma.Chroma instance
    Return docs most similar to query using specified search type.



In [20]:
# Generic search entry point; search_type selects the strategy: 'similarity' or 'mmr'.
# Here it returns the 5 chunks most similar to the query text.
vectordb.search(query='Vitamin C', search_type='similarity', k=5)

[Document(page_content='of this discussion, vitamin C is utilized to \nshow that vitamin C intake modifies all of the \nperipheral lamellae.  \n \n                                                                              85', metadata={'page': 1, 'source': './data\\FM docs 2024.3\\JOM_1976_05_2_02_The_Eating_Habits_of_High_and_Low_Vitamin_C_Users.pdf'}),
 Document(page_content='before vitamin \nC  after vitamin C p-value\nHemglobin A1c (%)   5.46±0.38  4.88±0.33 0.000\nCortisol (μg/dL) 11.64±3.83  8.80±2.75 0.000\nAspartate aminotranferase (U/L) 28.09±19.92 23.85±7.65 0.000\nAlanine aminotranferase  (U/L) 28.45±20.66 25.12±17.75 0.011\nr-GTP (U/L) 32.59±28.92 25.93±18.05 0.000\nC-reactive protein(mg/L)   0.11±0.20   0.05±0.07 0.033\nvitamin C (μmol/L) 42.90±12.4 68.60±26.57 0.000Table 3. Blood test after vitamin C administration.', metadata={'page': 2, 'source': './data\\FM docs 2024.3\\JOM_2008_23_4_07_Changes_in_Worker_Fatigue_after_Vitamin_C_Administration.pdf'}),
 Document(page

In [21]:
vectordb.asearch(query='Vitamin C', search_type='similarity', k=5)

<coroutine object VectorStore.asearch at 0x000002192D2F8F90>