In [56]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_chroma import Chroma

from dotenv import load_dotenv
load_dotenv()

True

In [57]:
import pandas as pd
df = pd.read_csv("data/loan_data_cleaned.csv")
df.head()

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected,0,0 I need a loan to pay for an international va...
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected,1,1 I want to make home improvements like instal...
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected,2,"2 I need a loan for home renovation, including..."
3,I need funds to buy new furniture and applianc...,190363,729,118757,10.22,unemployed,Rejected,3,3 I need funds to buy new furniture and applia...
4,I need a loan to start a small business.,61853,732,19210,44.13,employed,Approved,4,4 I need a loan to start a small business.


In [58]:
# 显示tagged_text前5行数据
df['tagged_text'].head(7)

0    0 I need a loan to pay for an international va...
1    1 I want to make home improvements like instal...
2    2 I need a loan for home renovation, including...
3    3 I need funds to buy new furniture and applia...
4           4 I need a loan to start a small business.
5    5 I need a loan to repair my car after an acci...
6    6 I need financial help to cover maternity and...
Name: tagged_text, dtype: object

#### 用户通过输入 text来搜索与自己相同的需求，观察需要的其他特征（如：收入，是否就业等情况），从而判断是否可以贷款，当然我也会根据客户的信息进行一个建议性预测。


In [None]:
# 将tagged_text列保存为txt文件
df["tagged_text"].to_csv("data/tagged_text.txt",
                                   sep = "\n",
                                   index = False,
                                   header = False)

In [None]:
# 导入tagged_text.txt文件
raw_documents = TextLoader("data/tagged_text.txt").load()

# 使用CharacterTextSplitter将文档分割成更小的块,chunk_size为0是为了不让文档被混乱分隔，overlap为0表示不重叠
# 不重叠，不让文档被混乱分隔，都是为了保证文档以 \n 为分隔符，每一行对应一个人的请求，不会混乱

# 文本分块

# text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
# documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 68, which is longer than the specified 0
Created a chunk of size 64, which is longer than the specified 0
Created a chunk of size 65, which is longer than the specified 0
Created a chunk of size 64, which is longer than the specified 0
Created a chunk of size 42, which is longer than the specified 0
Created a chunk of size 51, which is longer than the specified 0
Created a chunk of size 65, which is longer than the specified 0
Created a chunk of size 50, which is longer than the specified 0
Created a chunk of size 53, which is longer than the specified 0
Created a chunk of size 69, which is longer than the specified 0
Created a chunk of size 73, which is longer than the specified 0
Created a chunk of size 74, which is longer than the specified 0
Created a chunk of size 76, which is longer than the specified 0
Created a chunk of size 75, which is longer than the specified 0
Created a chunk of size 70, which is longer than the specified 0
Created a chunk of size 7

In [61]:
documents[0]

Document(metadata={'source': 'tagged_text.txt'}, page_content='0 I need a loan to pay for an international vacation with my family.')

In [62]:
# 创建向量数据库
# 使用ZhipuAIEmbeddings创建一个嵌入模型
import os
from dotenv import load_dotenv
load_dotenv()

# 验证是否成功获取变量
test_var = os.getenv("TEST_VAR")
print(f"TEST_VAR = {test_var}")  # 如果输出 TEST_VAR = hello，说明加载成功

TEST_VAR = hello


In [None]:
import numpy as np  # 确保你已经安装了 numpy

embeddings_zp = ZhipuAIEmbeddings(
    model="embedding-3",
    api_key=os.getenv("ZHIPUAI_API_KEY"),
)

# 创建一个 Chroma 向量数据库

# def chunk_list(input_list, chunk_size):
#     """将列表分割成指定大小的块。"""
#     return np.array_split(input_list, np.ceil(len(input_list) / chunk_size))

# # 将 documents 分割成大小为 64 的块
# document_chunks = chunk_list(documents, 64)

# for chunk in document_chunks:
#     db_loan = Chroma.from_documents(
#         documents=chunk,
#         embedding=embeddings_zp,
#         persist_directory="db/vectorstore_loan",
#     )

In [75]:
# 导入向量数据库
# 这里的 db/vector_db 是你之前保存向量数据库的路径
# embeddings_zp 是你使用的嵌入模型，需要保持一致
db_load = Chroma(
    persist_directory="db/vectorstore_loan",
    embedding_function=embeddings_zp
)
query = 'house'
# 使用向量数据库搜索与查询最相似的文档
results = db_load.similarity_search(query)
results

[Document(id='272340f2-aa0a-420b-99f4-1143757957e5', metadata={'source': 'tagged_text.txt'}, page_content='8528 I need funds to buy new furniture and appliances for my house.'),
 Document(id='3e0e5db1-422f-42b2-9268-29de3a192a0a', metadata={'source': 'tagged_text.txt'}, page_content='15173 I need funds to buy new furniture and appliances for my house.'),
 Document(id='982f1a99-0005-45a8-a4df-b4867eafcf81', metadata={'source': 'tagged_text.txt'}, page_content='18421 I need funds to buy new furniture and appliances for my house.'),
 Document(id='a6acb56e-4a7a-499e-8966-cc76a3d1cb8d', metadata={'source': 'tagged_text.txt'}, page_content='17193 I need funds to buy new furniture and appliances for my house.')]

In [76]:
# 通过匹配到的 贷款申请文本的id，找到对应的 贷款申请的其他参数
df[df["id"] == int(results[0].page_content.split()[0].strip())]

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text
8528,I need funds to buy new furniture and applianc...,90243,352,65463,42.33,employed,Rejected,8528,8528 I need funds to buy new furniture and app...


In [90]:
# 构造函数来实现 多结果的现实
def retrieve_similar_applications(
        query: str,
        top_k: int = 5,
) -> pd.DataFrame:
    recs = db_load.similarity_search(query, k = 10)

    text_list = []

    for i in range(0, len(recs)):
        text_list += [int(recs[i].page_content.strip('"').split()[0])]

    return df[df["id"].isin(text_list)]

In [93]:
retrieve_similar_applications('buy car')

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text
9185,I want to buy a car for my rideshare business ...,86954,661,15602,41.61,employed,Approved,9185,9185 I want to buy a car for my rideshare busi...
9523,I want to buy a car for my rideshare business ...,123084,613,91729,16.76,unemployed,Rejected,9523,9523 I want to buy a car for my rideshare busi...
12186,I want to buy a car for my rideshare business ...,69015,639,39380,9.71,unemployed,Rejected,12186,12186 I want to buy a car for my rideshare bus...
12313,I want to buy a car for my rideshare business ...,80388,300,2080,45.4,unemployed,Rejected,12313,12313 I want to buy a car for my rideshare bus...
12347,I want to buy a car for my rideshare business ...,22381,375,3677,138.95,unemployed,Rejected,12347,12347 I want to buy a car for my rideshare bus...
12377,I want to buy a car for my rideshare business ...,116124,532,21759,21.97,employed,Rejected,12377,12377 I want to buy a car for my rideshare bus...
14365,I want to buy a car for my rideshare business ...,68059,676,15636,24.89,employed,Approved,14365,14365 I want to buy a car for my rideshare bus...
15005,I want to buy a car for my rideshare business ...,47480,489,11901,85.15,unemployed,Rejected,15005,15005 I want to buy a car for my rideshare bus...
16345,I want to buy a car for my rideshare business ...,180519,603,130339,16.15,unemployed,Rejected,16345,16345 I want to buy a car for my rideshare bus...
16357,I want to buy a car for my rideshare business ...,93326,759,53351,42.96,employed,Approved,16357,16357 I want to buy a car for my rideshare bus...
