# Install Dependencies

In [None]:
!pip install gdown
!pip install scikit-learn
!pip install joblib

# Import Libraries

In [1]:
from utilities.data_processor import DataPreProcessor, DataLoader
from utilities.XGBoost_model import train_and_evaluate_xgboost
from utilities.XGBoost_Predictor import XGBoostPredictor
import os
import json

ModuleNotFoundError: No module named 'langchain_openai'

# Load Raw Data

In [None]:
# Create the preprocessing helper used by every step in this cell.
dpp = DataPreProcessor()

# Load the raw dataset (download=True asks the loader to download it).
dpp.load_data(download=True)

# Summarise the raw data before any cleaning is applied.
dpp.show_data_info()

# Clean the data. fill_blank/fill_mode presumably fill missing values with the
# per-column mode -- TODO confirm against DataPreProcessor.preprocess_data.
dpp.preprocess_data(fill_mode='mode', fill_blank=True)

# Summarise again so the result can be compared with the raw summary above.
dpp.show_data_info()

# Plot the features that remain after preprocessing.
dpp.show_feature_plt()

# DataLoader

In [None]:
# Initialise the DataLoader from the preprocessed data.
# NOTE(review): mode='catboost' is passed although the model trained in this
# notebook is XGBoost -- confirm this is the intended mode.
preprocessed_data = dpp.get_data()
dl = DataLoader(preprocessed_data, mode='catboost')

# Encode the features.
dl.encoder()

# Split the dataset.
dl.split_data()

# Filter (select) the features.
dl.data_filter()

# Fetch the filtered train/test features together with their labels.
(x_train_filtered, x_test_filtered, y_train, y_test) = dl.get_filtered_data()

# Train and Evaluate XGBoost

In [None]:
# Where the trained model and the per-feature medians are written.
model_path = "model_folder/xgboost_model.json"
stats_path = "model_folder/feature_medians.json"

# Train XGBoost on the filtered training split and evaluate it on the test split.
bst, accuracy = train_and_evaluate_xgboost(
    x_train_filtered,
    x_test_filtered,
    y_train,
    y_test,
)

# Make sure the output directory exists, then save the model.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
bst.save_model(model_path)

# Save the median of every training feature as JSON alongside the model.
feature_medians = x_train_filtered.median().to_dict()
with open(stats_path, "w") as stats_file:
    json.dump(feature_medians, stats_file)

print("Model and feature medians saved.")

# Predict

In [None]:
# XGBoostPredictor is also imported in the import cell at the top; the import is
# repeated here so this cell can be run without re-running that cell.
from utilities.XGBoost_Predictor import XGBoostPredictor

# Create the predictor.
predictor = XGBoostPredictor()

# Load the model and the feature statistics.
# These paths match the files written by the training cell
# ("model_folder/..."); the cell previously read from "models/...", which the
# training cell never writes.
predictor.load_model_and_stats(
    "model_folder/xgboost_model.json",
    "model_folder/feature_medians.json",
)

# User input, given as a dict of feature name -> value.
# Sample record kept for reference:
#{"out_prncp_inv": 0.18216371428571426, "out_prncp": 0.18217342857142854, "total_rec_prncp": 0.09111911682310947, "last_pymnt_amnt": 0.012824397809318436, "tot_cur_bal": 0.01825904535694721, "recoveries": 0.0, "int_rate": 0.32403886776510354, "total_pymnt": 0.08719915918528463, "total_rec_int": 0.049596851095705635, "funded_amnt_inv": 0.3704257496028571, "total_pymnt_inv": 0.08667971237517338, "inq_last_6mths": 0.0, "sub_grade": 0.7058823529411764, "total_rev_hi_lim": 0.019150633921446936, "revol_bal": 0.011354970973119001, "annual_inc": 0.0067807914628723875, "collections_12_mths_ex_med": 0.0, "delinq_2yrs": 0.0, "dti": 0.4403600900225056, "funded_amnt": 0.3604651162790698, "pub_rec": 0.0, "term": 0.0, "acc_now_delinq": 0.0, "tot_coll_amt": 0.0}
# NOTE(review): the sample record above holds values in the 0-1 range while the
# input below uses raw magnitudes (e.g. term=36) -- confirm which scale
# XGBoostPredictor.predict expects.
input_data = {
    "out_prncp_inv": 500000,  # Remaining outstanding principal for the portion funded by investors.
    "out_prncp": 490000,  # Remaining outstanding principal for the total amount funded.
    "total_rec_prncp": 0,  # Principal received to date.
    "last_pymnt_amnt": 100,  # Most recent total payment amount received.
    "tot_cur_bal": 10000,  # Total current balance of all accounts.
    "recoveries": 0,  # Total amount recovered after the loan was charged off.
    "int_rate": 0.1,  # Interest rate on the loan.
    "total_pymnt": 100,  # Total payments to date.
    "total_rec_int": 0,  # Interest received to date.
    "funded_amnt_inv": 1000,  # Total amount funded by investors.
    "total_pymnt_inv": 100,  # Total payments to date.
    "inq_last_6mths": 0,  # Number of inquiries in the past 6 months.
    "sub_grade": 0.1,  # Loan sub-grade.
    "total_rev_hi_lim": 100,  # Total high credit limit.
    "revol_bal": 100,  # Revolving credit balance.
    "annual_inc": 100,  # Annual income.
    "collections_12_mths_ex_med": 0,  # Collections in the past 12 months, excluding medical collections.
    "delinq_2yrs": 0,  # Number of delinquencies in the past 2 years.
    "dti": 0.1,  # Debt-to-income ratio.
    "funded_amnt": 1000,  # Loan amount.
    "pub_rec": 0,  # Number of derogatory public records.
    "term": 36,  # Number of payments on the loan, in months; either 36 or 60.
    "acc_now_delinq": 0,  # Number of accounts currently delinquent.
    "tot_coll_amt": 0,  # Total collection amounts ever owed.
}

# Predict; a ValueError from the predictor is reported instead of raised.
try:
    prediction = predictor.predict(input_data)
    print(f"Predicted class: {prediction}")
except ValueError as e:
    print(f"Error: {e}")

Predicted class: Good credit
