In [1]:
# Load the credit-scorecard workbook into a DataFrame and preview its first rows.
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')
df.head(n=5)

Unnamed: 0,月收入,年龄,性别,历史授信额度,历史违约次数,信用评分
0,7783,29,0,32274,3,73
1,7836,40,1,6681,4,72
2,6398,25,0,26038,2,74
3,6483,23,1,24584,4,65
4,5167,23,1,6710,3,73


In [2]:
# Separate the feature variables from the target variable:
# X keeps every column except the credit score, Y is the credit score column itself.
X = df.drop(columns=['信用评分'])
Y = df.loc[:, '信用评分']

In [3]:
# Import LinearRegression from Scikit-Learn and train it on the full feature/target data.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X, Y)  # fit() returns the fitted estimator itself
model

In [4]:
# 4. Build the linear regression equation: show the fitted coefficients and the intercept (k0).
print('各系数为:', model.coef_, sep='')
print('常数项系数k0为:', model.intercept_, sep='')

各系数为:[ 5.58658996e-04  1.62842002e-01  2.18430276e-01  6.69996665e-05
 -1.51063940e+00]
常数项系数k0为:67.1668660385318


In [5]:
# Evaluate the multiple linear regression with the approach from section 3.2:
# fit an ordinary-least-squares model in statsmodels and display its summary table.
import statsmodels.api as sm

X2 = sm.add_constant(X)  # prepend the constant column so OLS estimates an intercept
est = sm.OLS(endog=Y, exog=X2).fit()
est.summary()

0,1,2,3
Dep. Variable:,信用评分,R-squared:,0.629
Model:,OLS,Adj. R-squared:,0.628
Method:,Least Squares,F-statistic:,337.6
Date:,"Thu, 28 Nov 2024",Prob (F-statistic):,2.32e-211
Time:,15:14:48,Log-Likelihood:,-2969.8
No. Observations:,1000,AIC:,5952.0
Df Residuals:,994,BIC:,5981.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,67.1669,1.121,59.906,0.000,64.967,69.367
月收入,0.0006,8.29e-05,6.735,0.000,0.000,0.001
年龄,0.1628,0.022,7.420,0.000,0.120,0.206
性别,0.2184,0.299,0.730,0.466,-0.369,0.806
历史授信额度,6.7e-05,7.78e-06,8.609,0.000,5.17e-05,8.23e-05
历史违约次数,-1.5106,0.140,-10.811,0.000,-1.785,-1.236

0,1,2,3
Omnibus:,13.18,Durbin-Watson:,1.996
Prob(Omnibus):,0.001,Jarque-Bera (JB):,12.534
Skew:,-0.236,Prob(JB):,0.0019
Kurtosis:,2.721,Cond. No.,427000.0


In [6]:
# Repeat the regression analysis with the GBDT regressor covered in chapter 9.
# The data loading and the feature/target extraction mirror the linear regression cells above.

# 1. Read the data
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')

# 2. Extract the feature variables and the target variable
X = df.drop(columns=['信用评分'])
y = df.loc[:, '信用评分']

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# With the train/test split in place, import the GBDT regressor from Scikit-Learn and train it.
from sklearn.ensemble import GradientBoostingRegressor

# All hyperparameters stay at their defaults; only the seed is pinned (same value as
# the train/test split) so that re-running the notebook reproduces the same model.
model = GradientBoostingRegressor(random_state=123)
model.fit(X_train, y_train)

In [9]:
# With the model trained, predict the test set and show the first ten predictions.
y_pred = model.predict(X_test)
print(y_pred[:10])

[70.77631652 71.40032104 73.73465155 84.52533945 71.09188294 84.9327599
 73.72232388 83.44560704 82.61221486 84.86927209]


In [10]:
# As in earlier chapters, put predicted and actual values side by side for comparison.
a = pd.DataFrame({
    '预测值': list(y_pred),
    '实际值': list(y_test),
})
a.head(n=5)

Unnamed: 0,预测值,实际值
0,70.776317,79
1,71.400321,80
2,73.734652,62
3,84.525339,89
4,71.091883,80


In [11]:
# GradientBoostingRegressor() is a regression model, so its fit is judged by the
# R-squared value computed on the test set.
from sklearn.metrics import r2_score

r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
print(r2)

0.6765017936599459


In [12]:
# The estimator's built-in score() method reports the same test-set result.
model.score(X=X_test, y=y_test)

0.6765017936599459

In [13]:
# The first three steps (read the data, extract features/target, split train/test) are
# the same as for the GBDT model, so they are repeated here without further explanation;
# the walkthrough resumes at step 4, the model itself.

# 1. Read the data
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')

# 2. Extract the feature variables and the target variable
X = df.drop(columns=['信用评分'])
y = df.loc[:, '信用评分']

# 3. Split into test and training sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [14]:
# With the train/test split in place, import XGBRegressor from the xgboost package
# and train it with its default parameters.
from xgboost import XGBRegressor

model = XGBRegressor().fit(X_train, y_train)  # fit() returns the fitted estimator itself
model

In [15]:
# With the model trained, predict the test set and show the first ten predictions.
y_pred = model.predict(X_test)
print(y_pred[:10])

[68.67499  70.25757  70.777275 83.4361   70.37347  84.74148  79.75432
 83.773285 83.42506  84.90951 ]


In [16]:
# As in earlier chapters, put predicted and actual values side by side for comparison.
a = pd.DataFrame({
    '预测值': list(y_pred),
    '实际值': list(y_test),
})
a.head(n=5)

Unnamed: 0,预测值,实际值
0,68.674988,79
1,70.257568,80
2,70.777275,62
3,83.436096,89
4,70.373466,80


In [17]:
# XGBRegressor() is a regression model, so its fit is judged by the R-squared value
# computed on the test set.
from sklearn.metrics import r2_score

r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
print(r2)

0.5706755793386162


In [18]:
# The estimator's built-in score() method reports the same test-set result.
model.score(X=X_test, y=y_test)

0.5706755793386162

In [19]:
# Inspect the model's feature importances via the feature_importances_ attribute
# introduced in section 10.2.3.
features = X.columns  # feature names
importances = model.feature_importances_  # importance value for each feature

# Present them as a two-column table, ordered from most to least important
importances_df = pd.DataFrame({
    '特征名称': features,
    '特征重要性': importances,
})
importances_df.sort_values(by='特征重要性', ascending=False)

Unnamed: 0,特征名称,特征重要性
4,历史违约次数,0.542783
0,月收入,0.243293
3,历史授信额度,0.110344
1,年龄,0.058815
2,性别,0.044765


In [20]:
# Tune the XGBoost regressor's hyperparameters with a grid search, using code
# similar to section 10.2.4.
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
parameters = {'max_depth': [1, 3, 5], 'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1, 0.2]}
clf = XGBRegressor()  # fresh, unfitted regressor used as the search's base estimator
# Search over `clf` (not the previously fitted `model`), scoring each candidate by
# R-squared with 5-fold cross-validation
grid_search = GridSearchCV(clf, parameters, scoring='r2', cv=5)

In [21]:
grid_search.fit(X_train, y_train)  # run the search on the training data
# A bare expression in the middle of a cell is not displayed, so print the best
# parameter combination explicitly.
print(grid_search.best_params_)
# Rebuild the model from the parameters the search actually selected, rather than
# from hand-copied values that can drift out of sync with the search result.
model = XGBRegressor(**grid_search.best_params_)
model.fit(X_train, y_train)

In [22]:
# 此时再通过r2_score()函数进行模型评估，代码如下（也可以用model.score(X_test, y_test)进行评分，效果一样）：
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print(r2)

0.6957048168460183


In [23]:
# Standardize every feature column with StandardScaler.
# NOTE: the scaler here is fit on all rows of X, not only on a training subset.
from sklearn.preprocessing import StandardScaler

X_new = StandardScaler().fit(X).transform(X)

X_new  # display the standardized data

array([[-0.88269208, -1.04890243, -1.01409939, -0.60873764,  0.63591822],
       [-0.86319167,  0.09630122,  0.98609664, -1.55243002,  1.27956013],
       [-1.39227834, -1.46534013, -1.01409939, -0.83867808, -0.0077237 ],
       ...,
       [ 1.44337605,  0.61684833,  0.98609664,  1.01172301, -0.0077237 ],
       [ 0.63723633, -0.21602705,  0.98609664, -0.32732239, -0.0077237 ],
       [ 1.57656755,  0.61684833, -1.01409939,  1.30047599, -0.0077237 ]])

In [24]:
# 3. Split into test and training sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting would leak test-set
# statistics (mean / standard deviation) into the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training-set statistics on the test rows

# 4. Build the model
# With the split done, import XGBRegressor from the xgboost package and train it.
from xgboost import XGBRegressor
model = XGBRegressor()  # default parameters
model.fit(X_train, y_train)

# XGBRegressor() is a regression model, so its fit is judged by the R-squared value
# computed on the test set.
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print(r2)

0.5706755793386162
