import pandas as pd
from langchain.document_loaders.csv_loader import CSVLoader

# Read the raw table with pandas so it can be previewed.
df = pd.read_csv('data/agriculture_dataset.csv')

# Bare expression: shows the DataFrame when this runs as a notebook cell.
df

# Load the same CSV through LangChain's CSVLoader; load() returns the
# collection of loaded documents that the rest of the script works on.
loader = CSVLoader(file_path="data/agriculture_dataset.csv")
data = loader.load()
data

# Report the container type and how many entries were loaded.
loaded_type = type(data)
row_count = len(data)
print(f"载入后的变量类型为：{loaded_type}，",  f"该 CSV 一共包含 {row_count} 行")

# Pick the second loaded document as a sample and show its type,
# metadata and content, separated by a divider line.
one_data = data[1]
report_lines = [
    f"每一个元素的类型：{type(one_data)}.",
    f"该文档的描述性数据：{one_data.metadata}",
    f"查看该文档的内容：\n{one_data.page_content}",
]
print("\n------\n".join(report_lines))

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Length of a single text chunk in the knowledge base
CHUNK_SIZE = 50

# Overlap length between adjacent chunks in the knowledge base
OVERLAP_SIZE = 5

# Use the recursive character text splitter
splitter_options = {"chunk_size": CHUNK_SIZE, "chunk_overlap": OVERLAP_SIZE}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Try the splitter on (at most) the first 1000 characters of the sample document.
sample_text = one_data.page_content[:1000]
text_splitter.split_text(sample_text)

# Split every loaded document and report how many chunks were produced.
split_docs = text_splitter.split_documents(data)
chunk_count = len(split_docs)
print(f"切分后的文件数量：{chunk_count}")

# Total character count over all chunks (a rough proxy for the token count).
total_chars = sum(len(chunk.page_content) for chunk in split_docs)
print(f"切分后的字符数（可以用来大致评估 token 数）：{total_chars}")

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders.csv_loader import CSVLoader

# Load local/project environment variables.
# find_dotenv() searches for the .env file and returns its path;
# load_dotenv() reads that file and loads its variables into the current runtime environment.
# If the variables are already set globally, this line has no effect.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# If you need to go through a proxy port, configure it as follows
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
# os.environ["HTTP_PROXY"] = 'http://127.0.0.1:7890'

# Collect the path of every file under folder_path (walking sub-folders too)
# and store them in file_paths.
folder_path = r'D:\My_Files\实验室学习相关\大模型部署\data'
file_paths = []
for root, _subdirs, names in os.walk(folder_path):
    file_paths.extend(os.path.join(root, name) for name in names)
# Show at most the first three collected paths as a sanity check.
print(file_paths[:3])

['D:\\My_Files\\实验室学习相关\\大模型部署\\data\\agriculture_dataset.csv']


In [2]:
# One CSVLoader per collected file path.
loaders = [CSVLoader(path) for path in file_paths]

In [3]:
# Run every loader and flatten the loaded documents into a single list.
texts = [doc for csv_loader in loaders for doc in csv_loader.load()]

In [4]:
# Inspect the second loaded document: its type, metadata and full content.
text = texts[1]
content = text.page_content[0:]
summary_parts = (
    f"每一个元素的类型：{type(text)}.",
    f"该文档的描述性数据：{text.metadata}",
    f"查看该文档的内容:\n{content}",
)
print(*summary_parts, sep="\n------\n")

每一个元素的类型：<class 'langchain_core.documents.base.Document'>.
------
该文档的描述性数据：{'source': 'D:\\My_Files\\实验室学习相关\\大模型部署\\data\\agriculture_dataset.csv', 'row': 1}
------
查看该文档的内容:
Farm_ID: F002
Crop_Type: Carrot
Farm_Area(acres): 18.67
Irrigation_Type: Manual
Fertilizer_Used(tons): 4.77
Pesticide_Used(kg): 4.36
Yield(tons): 42.91
Soil_Type: Peaty
Season: Kharif
Water_Usage(cubic meters): 68725.54


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# Apply the splitter to every loaded document and report the chunk count.
split_docs = text_splitter.split_documents(texts)
num_chunks = len(split_docs)
print(f"切分后的文件数量：{num_chunks}")

切分后的文件数量：50


In [6]:
import os

from langchain_community.embeddings import ZhipuAIEmbeddings

# SECURITY: the API key is a secret and must not be hard-coded in source.
# It is read from the ZHIPUAI_API_KEY environment variable instead (which can
# be supplied via the .env file loaded by load_dotenv earlier in this script).
# NOTE(review): the key that was previously committed here should be treated
# as leaked and revoked/rotated.
zhipuai_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not zhipuai_api_key:
    raise RuntimeError(
        "ZHIPUAI_API_KEY is not set; export it or add it to your .env file."
    )

# Embedding model used to vectorize the split documents.
embedding = ZhipuAIEmbeddings(
    model="embedding-3",
    api_key=zhipuai_api_key,
)

# Directory handed to the vector store as its persist_directory.
persist_directory = r'D:\My_Files\实验室学习相关\大模型部署\chroma'

In [7]:
from langchain.vectorstores.chroma import Chroma

# For speed, only the first 50 split docs are used to build the store;
# when using Qianfan, QPS limits make the first 5 docs a better choice.
docs_to_index = split_docs[:50]

vectordb = Chroma.from_documents(
    documents=docs_to_index,
    embedding=embedding,
    # Lets the vector store be saved to disk under persist_directory.
    persist_directory=persist_directory,
)


In [8]:
# Report how many entries the underlying collection currently holds.
stored_count = vectordb._collection.count()
print(f"向量库中存储的数量：{stored_count}")

向量库中存储的数量：50


In [9]:
# Query used for the retrieval examples that follow.
question = "哪个农场种棉花"

In [10]:
# Similarity search: request the 10 closest documents for the question.
sim_docs = vectordb.similarity_search(question, k=10)
hit_count = len(sim_docs)
print(f"检索到的内容数：{hit_count}")

检索到的内容数：10


In [11]:
# Print the first 200 characters of each similarity-search hit, followed by a divider.
for rank, hit in enumerate(sim_docs):
    snippet = hit.page_content[:200]
    print(f"检索到的第{rank}个内容: \n{snippet}", end="\n--------------\n")

检索到的第0个内容: 
Farm_ID: F039
Crop_Type: Cotton
Farm_Area(acres): 220.48
Irrigation_Type: Flood
Fertilizer_Used(tons): 9.96
Pesticide_Used(kg): 2.91
Yield(tons): 10.53
Soil_Type: Clay
Season: Zaid
Water_Usage(cubic m
--------------
检索到的第1个内容: 
Farm_ID: F043
Crop_Type: Cotton
Farm_Area(acres): 78.79
Irrigation_Type: Flood
Fertilizer_Used(tons): 1.35
Pesticide_Used(kg): 3.0
Yield(tons): 11.45
Soil_Type: Sandy
Season: Zaid
Water_Usage(cubic me
--------------
检索到的第2个内容: 
Farm_ID: F036
Crop_Type: Cotton
Farm_Area(acres): 446.16
Irrigation_Type: Manual
Fertilizer_Used(tons): 4.35
Pesticide_Used(kg): 3.47
Yield(tons): 12.53
Soil_Type: Loamy
Season: Zaid
Water_Usage(cubic
--------------
检索到的第3个内容: 
Farm_ID: F021
Crop_Type: Cotton
Farm_Area(acres): 377.05
Irrigation_Type: Drip
Fertilizer_Used(tons): 5.95
Pesticide_Used(kg): 0.91
Yield(tons): 29.17
Soil_Type: Clay
Season: Rabi
Water_Usage(cubic me
--------------
检索到的第4个内容: 
Farm_ID: F027
Crop_Type: Cotton
Farm_Area(acres): 375.1
Irrigation_Type: Rai

In [19]:
# Max-marginal-relevance search: request 3 documents for the same question.
mmr_docs = vectordb.max_marginal_relevance_search(question, k=3)

In [20]:
# Print the first 200 characters of each MMR hit, followed by a divider.
for rank, hit in enumerate(mmr_docs):
    snippet = hit.page_content[:200]
    print(f"MMR 检索到的第{rank}个内容: \n{snippet}", end="\n--------------\n")

MMR 检索到的第0个内容: 
Farm_ID: F039
Crop_Type: Cotton
Farm_Area(acres): 220.48
Irrigation_Type: Flood
Fertilizer_Used(tons): 9.96
Pesticide_Used(kg): 2.91
Yield(tons): 10.53
Soil_Type: Clay
Season: Zaid
Water_Usage(cubic m
--------------
MMR 检索到的第1个内容: 
Farm_ID: F001
Crop_Type: Cotton
Farm_Area(acres): 329.4
Irrigation_Type: Sprinkler
Fertilizer_Used(tons): 8.14
Pesticide_Used(kg): 2.21
Yield(tons): 14.44
Soil_Type: Loamy
Season: Kharif
Water_Usage(c
--------------
MMR 检索到的第2个内容: 
Farm_ID: F044
Crop_Type: Soybean
Farm_Area(acres): 84.12
Irrigation_Type: Manual
Fertilizer_Used(tons): 4.64
Pesticide_Used(kg): 2.53
Yield(tons): 24.77
Soil_Type: Sandy
Season: Rabi
Water_Usage(cubic
--------------
