In [67]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.special import boxcox1p
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score,StratifiedKFold

In [68]:
# Load the raw arrays from disk.
x_train = np.load("Training data/X_train.npy")
y_train = np.load("Training data/y_train.npy")
x_test = np.load("Testing data/X_test.npy")

# Wrap the arrays in DataFrames with readable column names. A callable is
# passed to rename() so every column is labelled regardless of the array
# width (no hard-coded column count). Feature columns get the "x_" prefix
# and label columns get "y_", so the two cannot be confused by name.
x_train = pd.DataFrame(x_train).rename(columns=lambda i: f"x_{i}")
y_train = pd.DataFrame(y_train).rename(columns=lambda i: f"y_{i}")
x_test = pd.DataFrame(x_test).rename(columns=lambda i: f"x_{i}")

In [69]:
# Median imputation.
# The per-column medians are computed on the training features only and the
# same values are used to fill the test features, so both sets go through an
# identical transformation and no test-set statistic enters preprocessing.
# fillna() is applied to the whole frame and the result reassigned: the
# chained form `frame[col].fillna(..., inplace=True)` works on a temporary
# object and does not update the frame under pandas Copy-on-Write.
feature_medians = x_train.median()
x_train = x_train.fillna(feature_medians)
x_test = x_test.fillna(feature_medians)

In [70]:
# Minimum share of non-zero entries a feature needs in order to be kept
# (tunable; pick whatever value is appropriate for the data).
threshold = 0.01

# Share of non-zero values per training column, computed in one pass.
nonzero_share = (x_train != 0).mean()

# Columns whose non-zero share is below the threshold.
cols_to_drop = nonzero_share.index[nonzero_share < threshold].tolist()

# Remove those columns from both the training and the test features.
x_train = x_train.drop(columns=cols_to_drop)
x_test = x_test.drop(columns=cols_to_drop)

In [71]:
# Positional index of the label column used as the prediction target.
TARGET_POSITION = 1
y = y_train.iloc[:, TARGET_POSITION]

In [72]:
# Hold out 20% of the samples for validation. The split is stratified on the
# target so the training and validation parts keep the class ratio of `y`;
# with an imbalanced target an unstratified split can shift that ratio.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline: a random forest fitted on the data as-is (no resampling).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, Y_train)

# Predict on the validation part.
y_valid_pred = rf_classifier.predict(X_valid)

# Print the classification report and the accuracy.
# zero_division=0 reports precision/F1 as 0 for a class that is never
# predicted instead of emitting an undefined-metric warning per average.
print(classification_report(Y_valid, y_valid_pred, zero_division=0))
print("Accuracy:", accuracy_score(Y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88       157
           1       0.00      0.00      0.00        43

    accuracy                           0.79       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.62      0.79      0.69       200

Accuracy: 0.785


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


直接用随机森林预测时，模型无法预测出任何一个标签1，所以需要对少数类别样本进行过采样。

In [73]:
# Balance the classes by oversampling the complete data set with SMOTE.
# NOTE: the resampling happens before the split below, so the validation
# part is drawn from the resampled data and can contain synthetic samples.
smote = SMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(x_train, y)

# 80/20 split of the resampled data into training and validation parts.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_smote,
    Y_train_smote,
    test_size=0.2,
    random_state=42,
)

# Build and fit the random forest (fit() returns the estimator itself),
# then predict on the validation part.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, Y_train
)
y_valid_pred = rf_classifier.predict(X_valid)

# Report: per-class metrics, confusion matrix and overall accuracy.
conf_matrix = confusion_matrix(Y_valid, y_valid_pred)
print(classification_report(Y_valid, y_valid_pred))
print("Confusion Matrix\n", conf_matrix)
print("Accuracy:", accuracy_score(Y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.82      0.98      0.89       168
           1       0.97      0.76      0.85       147

    accuracy                           0.88       315
   macro avg       0.89      0.87      0.87       315
weighted avg       0.89      0.88      0.87       315

Confusion Matrix
 [[164   4]
 [ 35 112]]
Accuracy: 0.8761904761904762


以上是用过采样后的数据进行验证的，可能在真实的不平衡数据集上表现不好；而且从结果看，预测标签0时表现更好，预测标签1时有35个正例被错误预测为负例。

**交叉验证，用过采样的数据训练，用原始的数据验证**

In [74]:
# Components shared by all folds: the SMOTE oversampler, the random forest,
# and a 5-fold stratified splitter (each fold keeps the class ratio of `y`).
smote = SMOTE(random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of each fold, in fold order.
scores = []

for train_index, test_index in kf.split(x_train, y):
    # Training part of this fold.
    X_train_fold = x_train.iloc[train_index]
    Y_train_fold = y.iloc[train_index]
    # Held-out part of this fold; it is left untouched (no resampling).
    X_test_fold = x_train.iloc[test_index]
    Y_test_fold = y.iloc[test_index]

    # Oversample only the training part, then fit the model on it.
    X_train_fold_smote, Y_train_fold_smote = smote.fit_resample(X_train_fold, Y_train_fold)
    rf_classifier.fit(X_train_fold_smote, Y_train_fold_smote)

    # Score the model on the original, unresampled held-out part.
    Y_pred_fold = rf_classifier.predict(X_test_fold)
    fold_accuracy = accuracy_score(Y_test_fold, Y_pred_fold)
    scores.append(fold_accuracy)

# Per-fold accuracies, followed by their mean.
print(scores)
print("Average Cross-Validated Accuracy: %0.2f" % np.mean(scores))

[0.79, 0.79, 0.785, 0.785, 0.78]
Average Cross-Validated Accuracy: 0.79
