In [6]:
# Load the cleaned loan dataset.
# Original planning note: classify the request texts by employment status.
# NOTE(review): the recorded preview below shows an "Unnamed: 0" column, which
# looks like a row index saved by an earlier to_csv call — confirm whether it
# should be dropped upstream.
import pandas as pd
df = pd.read_csv(filepath_or_buffer='data/loan_data_cleaned.csv')
# Preview the first three rows as the cell's output.
df.head(n=3)

Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval,id,tagged_text
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected,0,0 I need a loan to pay for an international va...
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected,1,1 I want to make home improvements like instal...
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected,2,"2 I need a loan for home renovation, including..."


In [8]:
# Determine the purpose of each loan by classifying its request text.
# `pipeline` (from transformers) loads a pretrained model wrapped for a task;
# torch is imported only to detect which accelerator is available.
import torch
from transformers import pipeline

# Candidate labels the zero-shot classifier chooses between.
# They are supplied on each call to `pipe`, not when the model is loaded.
candidate_labels = ["House Purchase", "Car Purchase", "Education", "Medical", "Travel", "Entrepreneurship", "Renovation", "Other"]

# Pick the best available device instead of hard-coding "mps", so the notebook
# also runs on machines without Apple-Silicon GPU support.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# facebook/bart-large-mnli is noted as quite accurate but too large, so a
# smaller NLI model is used to get the task done first.
# Load the zero-shot-classification pipeline on the selected device.
pipe = pipeline("zero-shot-classification",
                model="cross-encoder/nli-distilroberta-base",
                device=device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use mps


In [9]:
# Smoke test: classify a single row before running the whole dataset.
# Take the first row's "tagged_text" value by position (.iloc) rather than by
# label, so this still works if the index is not 0-based (e.g. after filtering).
# NOTE(review): the recorded output shows "tagged_text" starting with a row id
# ("0 I need a loan ...") — confirm that prefix is meant to be fed to the
# classifier, or switch to the raw "Text" column.
text = df["tagged_text"].iloc[0]
result = pipe(text, candidate_labels, multi_label=False)  # single-label zero-shot classification
# The first entry of result['labels'] is taken as the predicted label.
predicted_label = result['labels'][0]

# Print the input text and the predicted label
print(f"文本：{text}")
print(f"预测的标签：{predicted_label}")

文本：0 I need a loan to pay for an international vacation with my family.
预测的标签：Travel


In [11]:
# Batch size forwarded to the pipeline for model inference.
batch_size = 32

# All texts to classify, as a plain Python list.
texts = df["tagged_text"].tolist()

# Let the pipeline do the batching: its own batch size defaults to 1, so
# slicing the list into chunks by hand does not batch the model calls.
# Passing batch_size here does. An empty frame skips the model call entirely.
if texts:
    results = pipe(texts, candidate_labels, multi_label=False, batch_size=batch_size)
else:
    results = []

# One prediction per text: the first label in each result is the top-ranked one.
predicted_categories = [result['labels'][0] for result in results]

# Fail loudly on a length mismatch instead of silently truncating the list.
if len(predicted_categories) != len(df):
    raise ValueError(
        f"expected {len(df)} predictions, got {len(predicted_categories)}"
    )

# Attach the predicted labels to the DataFrame.
df["predicted_category"] = predicted_categories

# Show the first few predictions next to their texts.
print(df[["tagged_text", "predicted_category"]].head())

                                         tagged_text predicted_category
0  0 I need a loan to pay for an international va...             Travel
1  1 I want to make home improvements like instal...              Other
2  2 I need a loan for home renovation, including...         Renovation
3  3 I need funds to buy new furniture and applia...     House Purchase
4         4 I need a loan to start a small business.              Other


In [13]:
# Write the DataFrame, including the new "predicted_category" column, to a CSV
# file; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="data/loan_data_with_predicted_categories.csv", index=False)