In [2]:
# 由于在data- exploration的关系热力图显示贷款审批与否和 收入（-0.14），信用评分（-0.35），DTI率（0.17）
# 我想还是通过数据来预测贷款审批与否，文本可以通过大模型的相似度匹配，在确认相似的文本中，
# 贷款的用途（分类时的预测类别），来缩小样本后，在对相关性的数据进行模型预测
import pandas

df = pandas.read_csv('data/loan_data_with_sentiment_analysis.csv')

In [3]:
df.head(3)

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text,predicted_category,anger,disgust,fear,joy,sadness,surprise,neutral
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected,0,0 I need a loan to pay for an international va...,Travel,0.03262,0.009153,0.066997,0.023172,0.834439,0.003592,0.030028
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected,1,1 I want to make home improvements like instal...,Other,0.028483,0.019443,0.01155,0.083611,0.383042,0.026668,0.447203
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected,2,"2 I need a loan for home renovation, including...",Renovation,0.026961,0.032717,0.055367,0.033183,0.299088,0.033726,0.518959


In [7]:
# “批准”列是我们的目标，目前是一个字符串。让我们把它编码为一个数字二进制变量。
# 1表示批准，0表示拒绝。
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Approval_Label'] = le.fit_transform(df['Approval'])
df['Approval_Label'] = df['Approval_Label'].replace({0: 1, 1: 0})

In [8]:
df.head(3)

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text,predicted_category,anger,disgust,fear,joy,sadness,surprise,neutral,Approval_Label
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected,0,0 I need a loan to pay for an international va...,Travel,0.03262,0.009153,0.066997,0.023172,0.834439,0.003592,0.030028,0
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected,1,1 I want to make home improvements like instal...,Other,0.028483,0.019443,0.01155,0.083611,0.383042,0.026668,0.447203,0
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected,2,"2 I need a loan for home renovation, including...",Renovation,0.026961,0.032717,0.055367,0.033183,0.299088,0.033726,0.518959,0


* 我想是通过text的输入，匹配到类似文本，再在类似文本中选择predicted_category，
* 然后通过(income, credit_score, loan_amount, DTI_Ratio)来预测贷款审批与否的(approval_label)

In [10]:
# 1. 首先通过text的输入，匹配到类似文本，再在类似文本中选择predicted_category
# 使用大模型进行相似度匹配
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_chroma import Chroma

from dotenv import load_dotenv
load_dotenv()

embeddings_zp = ZhipuAIEmbeddings(
    model="embedding-3",
    api_key=os.getenv("ZHIPUAI_API_KEY"),
)

In [11]:
# 加载已经训练好的向量数据库
db_load = Chroma(
    persist_directory="db/vectorstore_loan",
    embedding_function=embeddings_zp
)

In [None]:
# 实验，通过text的输入，匹配到类似文本
query = "I need a loan to buy a car."
results_with_scores = db_load.similarity_search_with_score(query, k=10000)  # 获取更多结果

# 阈值也是需要用户自己设定，这里我们设定为0.7
threshold = 0.7  # 设定阈值
filtered_results = [doc for doc, score in results_with_scores if score > threshold]

print(f"阈值: {threshold}")
print(f"剩余结果数量: {len(filtered_results)}")

阈值: 0.7
剩余结果数量: 6522


In [41]:
# 利用tagged_text字段，获取到其他所有值，形成新表格

# 提取匹配到的文档的 ID
matched_ids = [int(doc.page_content.split(" ", 1)[0]) for doc in filtered_results]

# 根据 ID 从原始 DataFrame 中获取所有信息
matched_data = df[df['id'].isin(matched_ids)]

# 打印新表格
matched_data.head(3)

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text,predicted_category,anger,disgust,fear,joy,sadness,surprise,neutral,Approval_Label
9,I need a loan to cover emergency expenses afte...,81024,403,19217,36.92,unemployed,Rejected,9,9 I need a loan to cover emergency expenses af...,Other,0.011158,0.00902,0.66412,0.006822,0.274832,0.003864,0.030184,0
11,I need money to purchase updated equipment for...,179680,475,65175,22.26,employed,Rejected,11,11 I need money to purchase updated equipment ...,Other,0.047688,0.015485,0.042921,0.047834,0.327524,0.041253,0.477295,0
21,I need a loan to support my side business sell...,49242,600,11365,68.02,unemployed,Rejected,21,21 I need a loan to support my side business s...,Other,0.051657,0.048936,0.111023,0.069538,0.62542,0.004411,0.089014,0


In [43]:
matched_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6522 entries, 9 to 23998
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Text                6522 non-null   object 
 1   Income              6522 non-null   int64  
 2   Credit_Score        6522 non-null   int64  
 3   Loan_Amount         6522 non-null   int64  
 4   DTI_Ratio           6522 non-null   float64
 5   Employment_Status   6522 non-null   object 
 6   Approval            6522 non-null   object 
 7   id                  6522 non-null   int64  
 8   tagged_text         6522 non-null   object 
 9   predicted_category  6522 non-null   object 
 10  anger               6522 non-null   float64
 11  disgust             6522 non-null   float64
 12  fear                6522 non-null   float64
 13  joy                 6522 non-null   float64
 14  sadness             6522 non-null   float64
 15  surprise            6522 non-null   float64
 16  neutral   

In [45]:
# 显示所有predicted_category的类别
# 从而得知买车的用途
matched_data['predicted_category'].unique()

array(['Other', 'Entrepreneurship', 'Medical', 'Travel', 'Car Purchase',
       'Renovation', 'Education', 'House Purchase'], dtype=object)