In [124]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, \
    cohen_kappa_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [125]:
# File-name constants used throughout the notebook.
# Every artifact of this worksheet is stored under the directory named by PATH.
# NOTE(review): an earlier comment referred to ./v2_credit, but the code uses "credit/" - confirm.
PATH = "credit/"
PREFIX = PATH

# Complete raw data set
all_name = f"{PREFIX}all.csv"
# Preprocessed training data
train_name = f"{PREFIX}train.csv"
# Preprocessed data awaiting prediction
predict_name = f"{PREFIX}predict.csv"

# Intermediate report / snapshot files
describe_name = f"{PREFIX}describe.csv"
missing_name = f"{PREFIX}missing.csv"
zero_name = f"{PREFIX}zero.csv"
trains_name = f"{PREFIX}trans.csv"
std_name = f"{PREFIX}std.csv"
corr_name = f"{PREFIX}corr.csv"


In [None]:
# Load the raw CSV file into a DataFrame.
all_data = pd.read_csv(all_name)

# Data inventory: compute the summary statistics once, persist them to
# describe.csv, and show the same table as the cell output.
all_data_summary = all_data.describe()
all_data_summary.to_csv(describe_name)
all_data_summary



In [None]:
# 2.1 Missing values and zero values

# Fraction of missing values per column; computed once, saved to missing.csv
# and reused for the column filter below.
missing_ratio = all_data.isnull().mean()
missing_ratio.to_csv(missing_name)
# Keep only the columns whose missing ratio is below 0.7.
# NOTE: this changes the set of columns in all_data.
# .copy() yields an independent frame, so the fills below never operate on a
# slice of the raw data (avoids a possible SettingWithCopyWarning).
all_data = all_data.loc[:, missing_ratio < 0.7].copy()

# NOTE(review): the median/mode below are computed over every row of all_data,
# so rows that are later held out for testing also influence the fill values.

# Numeric columns: fill missing values with the column median.
# Plain reassignment is used instead of inplace=True so the cell stays re-runnable.
all_data = all_data.fillna(all_data.median(numeric_only=True))
# Remaining (categorical) columns: fill missing values with the most frequent value.
column_modes = all_data.mode()
if not column_modes.empty:  # mode() has no first row when the frame has no rows
    all_data = all_data.fillna(column_modes.iloc[0])

# Fraction of zero values per column; saved to zero.csv and reused below.
zero_ratio = all_data.isin([0]).mean()
zero_ratio.to_csv(zero_name)
# Drop the columns that consist solely of zeros.
all_data = all_data.loc[:, zero_ratio != 1]

# Show the remaining column names
print(all_data.columns)
# Show the dtypes of the remaining columns
print(all_data.dtypes)



In [None]:
# Build the lists of columns by type.

# Feature columns: every column except the id (uid) and the target (credit_level).
columns = all_data.columns.to_list()
columns.remove('uid')
columns.remove('credit_level')

# Feature columns stored with object dtype (treated as categorical).
catelist = [col for col in columns if all_data[col].dtype == 'object']

# Numeric feature columns.
# select_dtypes(include=np.number) matches every integer and float dtype.
# Comparing a column dtype to the abstract np.number with `==` is not a
# subclass test, so it does not reliably match e.g. int64 columns.
numlist = all_data[columns].select_dtypes(include=np.number).columns.to_list()

print("columns: len ", len(columns), columns)
print("catelist: len", len(catelist), catelist)
print("numlist: len", len(numlist), numlist)



In [None]:
# 2.3 Data transformation
# One-hot encode the categorical columns listed in catelist.

all_data = pd.get_dummies(data=all_data, columns=catelist)
encoded_preview = all_data.head()
print(encoded_preview)
# Persist the encoded frame (without the index) and show its summary statistics.
all_data.to_csv(path_or_buf=trains_name, index=False)
all_data.describe()



In [None]:
# One-hot encoding changed the column set, so rebuild the feature list:
# every column except the id (uid) and the target (credit_level).
columns = list(all_data.columns)
for non_feature in ('uid', 'credit_level'):
    columns.remove(non_feature)

print("columns: len", len(columns), columns)


In [None]:
# Split the rows into training data and data awaiting prediction.
# Rows whose credit_level equals -1 are the ones to predict; all others are training rows.
needs_prediction = all_data.credit_level == -1
train_data = all_data[~needs_prediction]
predict_data = all_data[needs_prediction]

# index=False keeps the row index out of the files, so reading them back with
# pd.read_csv does not produce an extra unnamed index column.
train_data.to_csv(train_name, index=False)
predict_data.to_csv(predict_name, index=False)

In [None]:
# 4 Model prediction
#
# Preparation: split the labelled rows into a training set and a test set.
# 40% of the rows are held out; the fixed random_state makes the split repeatable.
features = train_data[columns]
target = train_data['credit_level']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.4, random_state=42
)
# Disabled: further split of the hold-out into a validation half and a test half.
# X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
# 4.4 xgboost
# Encode the training labels as integers with a LabelEncoder.
# The raw labels are looked up from train_data via X_train's index rather than
# read from y_train, because y_train is overwritten with the encoded values here.
# Re-running the cell therefore fits the encoder on the raw labels again instead
# of on already-encoded ones (which would break inverse_transform later).
# Assumes train_data has a unique index - TODO confirm.
xgb_le = LabelEncoder()
y_train = xgb_le.fit_transform(train_data.loc[X_train.index, 'credit_level'])

# File the fitted model is persisted to.
xgb_name = PREFIX + 'xgboost.joblib'

In [None]:
# Create an XGBoost classifier with a fixed seed and fit it on the training split.
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)
# Persist the fitted model to disk.
dump(value=xgb, filename=xgb_name)


In [None]:
# Reload the persisted model from disk.
# NOTE: joblib files are pickle-based; only load files this notebook produced.
model = load(filename=xgb_name)
# Predict on the test split. The predictions are encoded labels, so they are
# mapped back to the original labels with the fitted encoder.
y_pred = xgb_le.inverse_transform(model.predict(X_test))


In [None]:
# 5 Model evaluation
# 5.1 Accuracy
accuracy = accuracy_score(y_test, y_pred)
print('准确率为：', accuracy)

# 5.2 Confusion matrix
# Computed over the fixed label list below, rendered, and saved as a PNG.
save_path = f"{PREFIX}confusion_matrix_xgb.png"
labels = [35, 50, 60, 70, 85]
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
disp.figure_.savefig(save_path)

# 5.3 Macro-averaged precision and recall
macro = {'average': 'macro'}
precision = precision_score(y_test, y_pred, **macro)
recall = recall_score(y_test, y_pred, **macro)
for caption, value in (("精确率为: ", precision), ("召回率为: ", recall)):
    print(caption, value)

# 5.4 Macro-averaged F1 score
f1 = f1_score(y_test, y_pred, **macro)
print("F1分数为: ", f1)
# Cohen's kappa coefficient
kappa = cohen_kappa_score(y_test, y_pred)
print("Cohen's Kappa系数为: ", kappa)

# Per-class report and the confusion matrix over the labels present in the data.
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))



In [127]:
# 6 Model application
# Load the rows awaiting prediction from disk.
result = pd.read_csv(predict_name)
# Predict on the feature columns and map the encoded predictions back to the original labels.
y_pred = xgb_le.inverse_transform(model.predict(result[columns]))
# Keep only the id and the predicted credit level, then write them to result.csv.
result = result.assign(credit_level=y_pred)[['uid', 'credit_level']]
result.to_csv(f"{PREFIX}result.csv", index=False)
