In [11]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_chroma import Chroma

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import pandas as pd
df = pd.read_csv("data/loan_data_cleaned.csv")
df.head()

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected,0,0 I need a loan to pay for an international va...
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected,1,1 I want to make home improvements like instal...
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected,2,"2 I need a loan for home renovation, including..."
3,I need funds to buy new furniture and applianc...,190363,729,118757,10.22,unemployed,Rejected,3,3 I need funds to buy new furniture and applia...
4,I need a loan to start a small business.,61853,732,19210,44.13,employed,Approved,4,4 I need a loan to start a small business.


In [3]:
# 显示tagged_text前5行数据
df['tagged_text'].head(7)

0    0 I need a loan to pay for an international va...
1    1 I want to make home improvements like instal...
2    2 I need a loan for home renovation, including...
3    3 I need funds to buy new furniture and applia...
4           4 I need a loan to start a small business.
5    5 I need a loan to repair my car after an acci...
6    6 I need financial help to cover maternity and...
Name: tagged_text, dtype: object

#### 用户通过输入 text来搜索与自己相同的需求，观察需要的其他特征（如：收入，是否就业等情况），从而判断是否可以贷款，当然我也会根据客户的信息进行一个建议性预测。


In [4]:
# 将tagged_text列保存为txt文件
df["tagged_text"].to_csv("tagged_text.txt",
                                   sep = "\n",
                                   index = False,
                                   header = False)

In [5]:
# 导入tagged_text.txt文件
raw_documents = TextLoader("tagged_text.txt").load()

# 使用CharacterTextSplitter将文档分割成更小的块,chunk_size为0是为了不让文档被混乱分隔，overlap为0表示不重叠
# 不重叠，不让文档被混乱分隔，都是为了保证文档以 \n 为分隔符，每一行对应一个人的请求，不会混乱
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 68, which is longer than the specified 0
Created a chunk of size 64, which is longer than the specified 0
Created a chunk of size 65, which is longer than the specified 0
Created a chunk of size 64, which is longer than the specified 0
Created a chunk of size 42, which is longer than the specified 0
Created a chunk of size 51, which is longer than the specified 0
Created a chunk of size 65, which is longer than the specified 0
Created a chunk of size 50, which is longer than the specified 0
Created a chunk of size 53, which is longer than the specified 0
Created a chunk of size 69, which is longer than the specified 0
Created a chunk of size 73, which is longer than the specified 0
Created a chunk of size 74, which is longer than the specified 0
Created a chunk of size 76, which is longer than the specified 0
Created a chunk of size 75, which is longer than the specified 0
Created a chunk of size 70, which is longer than the specified 0
Created a chunk of size 7

In [7]:
documents[0]

Document(metadata={'source': 'tagged_text.txt'}, page_content='0 I need a loan to pay for an international vacation with my family.')

In [None]:
# 创建向量数据库
# 使用ZhipuAIEmbeddings创建一个嵌入模型
import os
from dotenv import load_dotenv
load_dotenv()

# 验证是否成功获取变量
test_var = os.getenv("TEST_VAR")
print(f"TEST_VAR = {test_var}")  # 如果输出 TEST_VAR = hello，说明加载成功

TEST_VAR = hello


In [None]:
import numpy as np  # 确保你已经安装了 numpy

embeddings_zp = ZhipuAIEmbeddings(
    model="embedding-3",
    api_key=os.getenv("ZHIPUAI_API_KEY"),
)

def chunk_list(input_list, chunk_size):
    """将列表分割成指定大小的块。"""
    return np.array_split(input_list, np.ceil(len(input_list) / chunk_size))

# 将 documents 分割成大小为 64 的块
document_chunks = chunk_list(documents, 64)

# for chunk in document_chunks:
#     db_loan = Chroma.from_documents(
#         documents=chunk,
#         embedding=embeddings_zp,
#         persist_directory="db/vector_db",
#     )

In [55]:
# 导入向量数据库
# 这里的 db/vector_db 是你之前保存向量数据库的路径
# embeddings_zp 是你使用的嵌入模型，需要保持一致
db_load = Chroma(
    persist_directory="db/vector_db",
    embedding_function=embeddings_zp
)
query = 'medical'
# 使用向量数据库搜索与查询最相似的文档
results = db_load.similarity_search(query)
results

[]