In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix,f1_score, roc_curve, auc,roc_auc_score,log_loss,make_scorer
from Include.MLSMOTE import get_tail_label,get_index,get_minority_instace,MLSMOTE

In [13]:
# 加载数据
x_train = np.load('../Training data/X_train.npy')  # 特征 (1000, 111)
y_train = np.load('../Training data/y_train.npy')  # 目标 (1000, 11)
x_test=np.load("../Testing data/X_test.npy")


x_train = pd.DataFrame(x_train).rename(columns={i: f"x_{i}" for i in range(111)})
y_train = pd.DataFrame(y_train).rename(columns={i: f"y_{i}" for i in range(11)})
x_test = pd.DataFrame(x_test).rename(columns={i: f"x_{i}" for i in range(111)})

#中位数填补
for column in x_train.columns:
    x_train.fillna({column: x_train[column].median()}, inplace=True)
    x_test.fillna({column: x_test[column].median()}, inplace=True)


# 找出非零值少于1%的列
threshold = 0.01  # 或者任何认为合适的值
cols_to_drop = [col for col in x_train.columns if (x_train[col] != 0).mean() < threshold]

# 删除这些列
x_train.drop(columns=cols_to_drop, inplace=True)
x_test.drop(columns=cols_to_drop, inplace=True)

# 特征缩放
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns=x_test.columns)

print(x_train.shape)
print(x_test.shape)

(1000, 72)
(700, 72)


In [14]:
# 寻找尾部标签
tail_labels = get_tail_label(y_train)

# 获取尾部标签的索引
indices = get_index(y_train)

# 获取尾部标签的样本
X_sub, y_sub = get_minority_instace(x_train, y_train)

# 应用MLSMOTE来增强数据集
X_resampled, y_resampled = MLSMOTE(X_sub, y_sub, 1)  # 假设生成新样本

# 合并原始数据和生成的数据
X_train_final = pd.concat([x_train, X_resampled], ignore_index=True)
y_train_final = pd.concat([y_train, y_resampled], ignore_index=True)

# 划分出来百分之二十的测试集
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_final, y_train_final, test_size=0.2, random_state=42)
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)

(1415, 72)
(1415, 11)
(354, 72)


In [15]:
def loss(y_true, y_pred):
    # 确保 y_true 和 y_pred 都是 DataFrame
    if not isinstance(y_true, pd.DataFrame):
        y_true = pd.DataFrame(y_true)
    if not isinstance(y_pred, pd.DataFrame):
        y_pred = pd.DataFrame(y_pred)
    
    logloss = 0
    # 对每个标签计算log_loss并累加
    for i in range(y_true.shape[1]):  
        logloss += log_loss(y_true.iloc[:, i], y_pred.iloc[:, i], labels=[0,1])
    # 计算平均log_loss
    return logloss / y_true.shape[1]

In [16]:
# 创建SVM模型，使用循环遍历每个y的维度
num_labels = y_train_final.shape[1]  # y_train_final有11个标签
random_forest_models = []
average_scores = {
    'accuracy': [],
    'f1_score': [],
    'roc_auc': []
}
predictions = pd.DataFrame()
predictions_proba = np.zeros((Y_valid.shape[0], num_labels * 2))
for i in range(num_labels):
    # 提取目标标签列
    Y_train_label = Y_train.iloc[:, i]
    Y_valid_label = Y_valid.iloc[:, i]

    # 训练SVM模型
    model = RandomForestClassifier(n_estimators=100, random_state=42)  # 使用100棵树
    model.fit(X_train, Y_train_label)

    random_forest_models.append(model)

    # 预测
    Y_pred = model.predict(X_valid)
    Y_pred_prob = model.predict_proba(X_valid)
    predictions_proba[:, 2*i:2*i+2] = Y_pred_prob

    # 计算指标
    acc = accuracy_score(Y_valid_label, Y_pred)
    f1 = f1_score(Y_valid_label, Y_pred)
    roc_auc = roc_auc_score(Y_valid_label, Y_pred_prob[:, 1])

    average_scores['accuracy'].append(acc)
    average_scores['f1_score'].append(f1)
    average_scores['roc_auc'].append(roc_auc)

    Y_pred_test=model.predict(x_test)
    predictions[f'y_pred_{i}'] = Y_pred_test

# 计算平均指标
average_loss = loss(Y_valid, predictions_proba)
mean_accuracy = np.mean(average_scores['accuracy'])
mean_f1 = np.mean(average_scores['f1_score'])
mean_roc_auc = np.mean(average_scores['roc_auc'])

print(f"Average Accuracy: {mean_accuracy}")
print(f"Average F1-Score: {mean_f1}")
print(f"Average ROC-AUC: {mean_roc_auc}")
print(f"Average Loss: {average_loss}")

Average Accuracy: 0.9178222907036466
Average F1-Score: 0.8352802125607202
Average ROC-AUC: 0.9521518298442261
Average Loss: 1.131949291998175


In [17]:
print(predictions)

     y_pred_0  y_pred_1  y_pred_2  y_pred_3  y_pred_4  y_pred_5  y_pred_6  \
0         0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1         0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2         0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3         0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4         0.0       0.0       0.0       0.0       0.0       0.0       0.0   
..        ...       ...       ...       ...       ...       ...       ...   
695       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
696       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
697       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
698       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
699       0.0       0.0       0.0       0.0       0.0       0.0       0.0   

     y_pred_7  y_pred_8  y_pred_9  y_pred_10  
0         0.0       0.0     