In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint 

# 导入数据集
file_path = r'E:\python练习\2week\data_副本.csv'# 替换为实际文件路径
data = pd.read_csv(file_path)

#随机抽取数据的 30%
data_sample = data.sample(frac=0.3, random_state=42)  # frac=0.3 表示抽取 30% 的数据

# 去除日期列
if 'earliesCreditLine' in data.columns:
    data = data.drop(columns=['earliesCreditLine'])

# 处理非数值数据
# 将所有非数值列转换为数值列（例如，将类别数据转换为数值编码）
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = pd.Categorical(data[column]).codes

print("\n缺失值检查：")
print(data.isnull().sum())

## 区分特征和标签
labels = data.iloc[:, 2]       # 选择除 'interestRate' 之外的所有列作为特征
features = data.drop(data.columns[2], axis=1)    # 选择 'interestRate' 作为标签

print("\n标签列名：")
print(data.columns[2])

# 检查标签的唯一值
print("\n标签的唯一值：")
print(labels.unique())


# 将标签转换为分类类型
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
# 分割数据集，80%作为训练集，20%作为测试集
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.2, random_state=42)

# 标准化特征
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

# 特征选择
model = ExtraTreesClassifier()
model.fit(X_train_standardized, y_train)

# 输出特征重要度
print("特征重要度：")
print(model.feature_importances_)

# 训练 ExtraTreesClassifier 并获取特征重要性
model = ExtraTreesClassifier()
model.fit(X_train_standardized, y_train)

# 保留 95% 的方差
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_standardized)
X_test_pca = pca.transform(X_test_standardized)

from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 定义 Random Forest 模型
rf = RandomForestClassifier(random_state=42)

# 定义超参数分布
param_distributions_rf = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# 定义交叉验证策略
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 定义随机搜索
random_search_rf = RandomizedSearchCV(estimator=rf, 
                                      param_distributions=param_distributions_rf, 
                                      n_iter=20, 
                                      cv=cv, 
                                      scoring='accuracy', 
                                      n_jobs=-1, 
                                      random_state=42, 
                                      verbose=2)

# 执行搜索
random_search_rf.fit(X_train_standardized, y_train)

# 输出最佳参数和最佳得分
print("Best parameters found for Random Forest: ", random_search_rf.best_params_)
print("Best accuracy score for Random Forest: ", random_search_rf.best_score_)

# 使用最佳参数训练模型并在测试集上评估
best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_standardized)

# 计算模型的不同评分指标
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print(f"Random Forest Test set accuracy: {accuracy_rf}")
print(f"Random Forest Test set precision: {precision_rf}")
print(f"Random Forest Test set recall: {recall_rf}")
print(f"Random Forest Test set F1-score: {f1_rf}")



缺失值检查：
id                 0
loanAmnt           0
term               0
interestRate       0
installment        0
grade              0
subGrade           0
employmentTitle    0
annualIncome       0
isDefault          0
purpose            0
dti                0
ficoRangeLow       0
openAcc            0
revolBal           0
totalAcc           0
title              0
policyCode         0
dtype: int64

标签列名：
term

标签的唯一值：
[5 3]
特征重要度：
[0.0309835  0.21890375 0.0750225  0.17967589 0.0831763  0.08889562
 0.03331684 0.04148373 0.01383893 0.01839741 0.03259685 0.05809709
 0.03052688 0.0387336  0.03596618 0.02038492 0.        ]
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters found for Random Forest:  {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 31, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
Best accuracy score for Random Forest:  0.9342133752322695
Random Forest Test set accuracy: 0.935350165722917
Random Fo

n_estimators: 在 100 到 500 之间随机选择树的数量。
max_depth: 随机选择树的最大深度，以控制模型的复杂性。
min_samples_split 和 min_samples_leaf: 控制分裂节点时的样本数，从而减少过拟合。
max_features: 控制每个分裂时可选择的特征数量，可以是 'auto'（默认，使用所有特征）、'sqrt'（平方根特征数）或 'log2'（对数特征数）。
bootstrap: 决定是否使用自助采样，True 为使用，False 为不使用。
通过这种随机搜索方法，可以找到性能最佳的参数组合，并且避免手动调整参数的繁琐过程。