In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


In [4]:

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
# Load the loan-default training set from the working directory and show the
# resulting frame as the cell output.
csv_path = 'Loan_defaults_training.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,homeOwnership,annualIncome,...,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,isDefault
0,0,35000,5,19.52,917.97,E,E2,320,2,110000.0,...,7,0,0.0,24178,48.9,27,0,0,1,1
1,1,18000,5,18.49,461.90,D,D2,219843,0,46000.0,...,13,0,0.0,15096,38.9,18,1,0,1723,0
2,2,12000,5,16.99,298.17,D,D3,31698,0,74000.0,...,11,0,0.0,4606,51.8,27,0,0,0,0
3,3,11000,3,7.26,340.96,A,A4,46854,1,118000.0,...,9,0,0.0,9948,52.6,28,1,0,4,0
4,4,3000,3,12.99,101.07,C,C2,54,1,29000.0,...,12,0,0.0,2942,32.0,27,0,0,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,10000,3,17.09,356.98,D,D1,7484,1,24000.0,...,6,0,0.0,7858,81.0,15,1,0,0,0
99996,99996,11225,5,17.57,282.43,D,D4,72305,0,65000.0,...,13,0,0.0,13321,69.4,28,0,0,6,0
99997,99997,20000,3,11.99,664.20,B,B3,213814,1,63000.0,...,10,0,0.0,12171,55.8,28,0,0,0,0
99998,99998,10000,3,11.55,330.00,B,B3,219259,1,52000.0,...,7,0,0.0,3680,35.7,14,1,0,4,0


In [7]:

# --- Preprocessing: turn `data` into train/test feature matrices -------------

# Drop row-identifier columns. 'Unnamed: 0' is the index column stored in the
# CSV (it appears in the frame displayed after loading and, like 'id', merely
# numbers the rows), so neither may be used as a feature.
# errors='ignore' keeps this cell re-runnable once the columns are gone.
data = data.drop(columns=['id', 'Unnamed: 0'], errors='ignore')

# Columns treated as categorical and integer-encoded below.
categorical_cols = ['grade', 'subGrade', 'employmentTitle', 'purpose', 'postCode', 'regionCode',
                    'initialListStatus', 'applicationType', 'title']

# Label-encode every categorical column exactly once and keep the fitted
# encoder per column. Encoding a column a second time would re-fit the encoder
# on the integer codes, so `classes_` would no longer hold the original labels
# and `inverse_transform` could not recover them.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    data[col] = label_encoders[col].fit_transform(data[col])

# Target and feature frame.
y = data['isDefault']
X = data.drop(columns=['isDefault'])

# Split BEFORE fitting any statistics-based transformer so that the medians,
# means and standard deviations are learned from the training rows only and no
# test-set information leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Median imputation followed by standardisation, both fit on the training rows.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))

# The test rows are only transformed with the train-fitted statistics.
X_test = scaler.transform(imputer.transform(X_test_raw))

# Whole-dataset matrices produced with the same train-fitted transformers
# (for inspection only; model fitting uses X_train / X_test).
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

In [8]:


# --- Model selection: grid-search five classifiers and score them -----------

# Seed shared by every stochastic step in this cell so reruns are comparable.
RANDOM_STATE = 42
# Number of cross-validation folds used by each grid search.
CV_FOLDS = 10
# Maximum number of training rows used for the hyper-parameter search.
# Searching on the full training set is impractically slow (SVC fit time grows
# at least quadratically with the number of samples). Set to None to search on
# the full training set.
TUNING_SAMPLE_SIZE = 10_000

# Candidate estimators. Tree ensembles get a fixed seed for reproducibility;
# LogisticRegression gets a larger iteration budget than the default 100 so the
# solver is less likely to stop before converging for large C.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier()
}

# Hyper-parameter grid searched for each estimator (keys match `models`).
param_grids = {
    'LogisticRegression': {'C': [0.01, 0.1, 1, 10, 100]},
    'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
    'SVM': {'C': [0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]},
    'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'KNN': {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}
}

# Pick the rows used for tuning: a stratified subsample of the training set
# when a cap is configured and actually smaller than the training set,
# otherwise the full training set.
tune_on_subsample = TUNING_SAMPLE_SIZE is not None and TUNING_SAMPLE_SIZE < len(y_train)
if tune_on_subsample:
    X_tune, _X_rest, y_tune, _y_rest = train_test_split(
        X_train, y_train,
        train_size=TUNING_SAMPLE_SIZE,
        stratify=y_train,
        random_state=RANDOM_STATE,
    )
else:
    X_tune, y_tune = X_train, y_train

# Cross-validated search, final fit and test-set evaluation, one model at a
# time. Results are stored and printed as soon as each model is finished, so an
# interrupted run still keeps everything completed up to that point.
best_models = {}
best_params = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grids[name], cv=CV_FOLDS, scoring='f1', n_jobs=-1)
    grid_search.fit(X_tune, y_tune)

    # best_estimator_ is a fresh estimator carrying the winning parameters,
    # fitted on the tuning rows; when those were a subsample, retrain it on
    # the complete training set before evaluating.
    best_model = grid_search.best_estimator_
    if tune_on_subsample:
        best_model.fit(X_train, y_train)

    best_models[name] = best_model
    best_params[name] = grid_search.best_params_
    print(f"{name} 最佳参数: {best_params[name]}")

    # Held-out evaluation on the test split.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")


KeyboardInterrupt: 