In [4]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.special import boxcox1p
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score,StratifiedKFold

In [5]:
x_train=np.load("Training data/X_train.npy")
y_train=np.load("Training data/y_train.npy")
x_test=np.load("Testing data/X_test.npy")

x_train = pd.DataFrame(x_train).rename(columns={i: f"x_{i}" for i in range(111)})
y_train = pd.DataFrame(y_train).rename(columns={i: f"x_{i}" for i in range(111)})
x_test = pd.DataFrame(x_test).rename(columns={i: f"x_{i}" for i in range(111)})

In [6]:
#中位数填补
for column in x_train.columns:
    x_train[column].fillna(x_train[column].median(), inplace=True)
    x_test[column].fillna(x_test[column].median(), inplace=True)

In [7]:
threshold = 0.01  # 或者任何认为合适的值

# 找出非零值少于1%的列
cols_to_drop = [col for col in x_train.columns if (x_train[col] != 0).mean() < threshold]

# 删除这些列
x_train.drop(columns=cols_to_drop, inplace=True)
x_test.drop(columns=cols_to_drop, inplace=True)

In [8]:
y = y_train.iloc[:, 1]  # 选择第几个目标特征

In [9]:
# 拆分数据为训练集和验证集
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y, test_size=0.2, random_state=42)

# 初始化随机森林模型
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# 训练模型
rf_classifier.fit(X_train, Y_train)

# 在验证集上进行预测
y_valid_pred = rf_classifier.predict(X_valid)

# 打印分类报告
print(classification_report(Y_valid, y_valid_pred))
print("Accuracy:", accuracy_score(Y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       157
           1       0.00      0.00      0.00        43

    accuracy                           0.79       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.62      0.79      0.69       200

Accuracy: 0.785


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


直接用随机森林预测无法预测任何一个1，所以需要对少量类别样本过采样

In [10]:
smote = SMOTE(random_state=42)
# 使用SMOTE过采样处理不平衡的数据
X_train_smote, Y_train_smote = smote.fit_resample(x_train, y)
# 拆分数据为训练集和验证集
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_smote, Y_train_smote, test_size=0.2, random_state=42)
# 初始化随机森林模型
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# 训练模型
rf_classifier.fit(X_train, Y_train)
# 预测
y_valid_pred = rf_classifier.predict(X_valid)

conf_matrix = confusion_matrix(Y_valid, y_valid_pred)

# 打印分类报告
print(classification_report(Y_valid, y_valid_pred))
print("Confusion Matrix\n",conf_matrix)
print("Accuracy:", accuracy_score(Y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.82      0.98      0.89       168
           1       0.97      0.76      0.85       147

    accuracy                           0.88       315
   macro avg       0.89      0.87      0.87       315
weighted avg       0.89      0.88      0.87       315

Confusion Matrix
 [[164   4]
 [ 35 112]]
Accuracy: 0.8761904761904762


以上是用过采样后的数据进行验证的，可能在真实的数据不平衡的数据集上表现不好，而且从结果看预测标签0时候表现更好，预测标签1时有33个被错误预测为负例。

**交叉验证，用过采样的数据训练，用原始的数据验证**

In [11]:
# 初始化SMOTE
smote = SMOTE(random_state=42)
# 初始化随机森林模型
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# 初始化交叉验证方法
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# 准备保存分数
scores = []

# StratifiedKFold保持每个折的类别比例
for train_index, test_index in kf.split(x_train, y):
    # 分割数据
    X_train_fold, X_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
    Y_train_fold, Y_test_fold = y.iloc[train_index], y.iloc[test_index]
    # 在每个训练折上应用SMOTE
    X_train_fold_smote, Y_train_fold_smote = smote.fit_resample(X_train_fold, Y_train_fold)
    # 在SMOTE处理过的训练数据上训练模型
    rf_classifier.fit(X_train_fold_smote, Y_train_fold_smote)
    # 验证模型在原始未处理的验证折上的性能
    Y_pred_fold = rf_classifier.predict(X_test_fold)
    scores.append(accuracy_score(Y_test_fold, Y_pred_fold))

# 输出每一折的准确率
print(scores)
# 输出平均准确率
print("Average Cross-Validated Accuracy: %0.2f" % np.mean(scores))

[0.79, 0.79, 0.785, 0.785, 0.78]
Average Cross-Validated Accuracy: 0.79


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 初始化SMOTE
smote = SMOTE(random_state=42)
# 初始化随机森林模型
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# 初始化交叉验证方法
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 准备保存所有特征的分数
all_scores = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'roc_auc': []
}

# 对y_train的每一列（即每一个标签）进行操作
for i in range(y_train.shape[1]):
    y = y_train.iloc[:, i]  # 选择第i个目标特征
    scores = []  # 用于保存当前目标特征的分数
    roc_auc_scores = []

    # StratifiedKFold保持每个折的类别比例
    for train_index, test_index in kf.split(x_train, y):
        # 分割数据
        X_train_fold, X_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
        Y_train_fold, Y_test_fold = y.iloc[train_index], y.iloc[test_index]
        # 在每个训练折上应用SMOTE
        X_train_fold_smote, Y_train_fold_smote = smote.fit_resample(X_train_fold, Y_train_fold)
        # 在SMOTE处理过的训练数据上训练模型
        rf_classifier.fit(X_train_fold_smote, Y_train_fold_smote)
        # 验证模型在原始未处理的验证折上的性能
        Y_pred_fold = rf_classifier.predict(X_test_fold)

        # 保存当前目标特征的分数
        scores.append(accuracy_score(Y_test_fold, Y_pred_fold))
        all_scores['accuracy'].append(accuracy_score(Y_test_fold, Y_pred_fold))
        all_scores['precision'].append(precision_score(Y_test_fold, Y_pred_fold, zero_division=0))
        all_scores['recall'].append(recall_score(Y_test_fold, Y_pred_fold, zero_division=0))
        all_scores['f1'].append(f1_score(Y_test_fold, Y_pred_fold, zero_division=0))
        
        # 计算AUC分数，需要预测概率，这里只在标签平衡时计算
        if len(np.unique(Y_test_fold)) > 1:
            Y_pred_probs_fold = rf_classifier.predict_proba(X_test_fold)[:, 1]
            roc_auc_scores.append(roc_auc_score(Y_test_fold, Y_pred_probs_fold))
        else:
            roc_auc_scores.append(np.nan)  # 当某一类别没有出现在测试集中时，跳过AUC计算

    # 输出当前目标特征的平均分数
    print(f"Scores for target {i}:")
    print(f"Accuracy: {np.mean(scores)}")
    print(f"ROC AUC: {np.nanmean(roc_auc_scores)}")

# 输出所有目标特征的平均分数
print("Average Scores for all targets:")
print(f"Average Accuracy: {np.mean(all_scores['accuracy'])}")
print(f"Average Precision: {np.mean(all_scores['precision'])}")
print(f"Average Recall: {np.mean(all_scores['recall'])}")
print(f"Average F1 Score: {np.mean(all_scores['f1'])}")
print(f"Average ROC AUC Score: {np.nanmean(roc_auc_scores)}")


Scores for target 0:
Accuracy: 0.7
ROC AUC: 0.4912202380952381
Scores for target 1:
Accuracy: 0.7860000000000001
ROC AUC: 0.48887747212946103
Scores for target 2:
Accuracy: 0.777
ROC AUC: 0.5063878048474824
Scores for target 3:
Accuracy: 0.76
ROC AUC: 0.5216344231186153
Scores for target 4:
Accuracy: 0.766
ROC AUC: 0.5109657096937393
Scores for target 5:
Accuracy: 0.711
ROC AUC: 0.4938102486047692
Scores for target 6:
Accuracy: 0.781
ROC AUC: 0.5222173659673659
Scores for target 7:
Accuracy: 0.744
ROC AUC: 0.5022554075438123
Scores for target 8:
Accuracy: 0.5959999999999999
ROC AUC: 0.5378711474468798
Scores for target 9:
Accuracy: 0.7140000000000001
ROC AUC: 0.5132447351804934
Scores for target 10:
Accuracy: 0.7190000000000001
ROC AUC: 0.549972786647315
Average Scores for all targets:
Average Accuracy: 0.7321818181818182
Average Precision: 0.21765729378169574
Average Recall: 0.03669239827224443
Average F1 Score: 0.05605584602792293
Average ROC AUC Score: 0.549972786647315
