In [68]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from imblearn.over_sampling import SMOTE

找出适合OneHotEncoder的数据

https://zhuanlan.zhihu.com/p/686452650

In [69]:
# 加载数据
X_train = np.load('../Training data/X_train.npy')  # 特征 (1000, 111)
y_train = np.load('../Training data/y_train.npy')  # 目标 (1000, 11)
x_test=np.load("../Testing data/X_test.npy")


x_train = pd.DataFrame(X_train).rename(columns={i: f"x_{i}" for i in range(111)})
y_train = pd.DataFrame(y_train).rename(columns={i: f"y_{i}" for i in range(11)})
x_test = pd.DataFrame(x_test).rename(columns={i: f"x_{i}" for i in range(111)})

#中位数填补
for column in x_train.columns:
    x_train.fillna({column: x_train[column].median()}, inplace=True)
    x_test.fillna({column: x_test[column].median()}, inplace=True)


# 找出非零值少于1%的列
threshold = 0.01  # 或者任何认为合适的值
cols_to_drop = [col for col in x_train.columns if (x_train[col] != 0).mean() < threshold]

# 删除这些列
x_train.drop(columns=cols_to_drop, inplace=True)
x_test.drop(columns=cols_to_drop, inplace=True)

# 特征缩放
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns=x_test.columns)


进行重采样

In [70]:
smote = SMOTE()
resampled_data = [];    max_size = 0
for i in range(y_train.shape[1]):
    x_resampled, y_resampled = smote.fit_resample(x_train, y_train.iloc[:, i])
    resampled_data.append((x_resampled, y_resampled))
    if len(y_resampled.values) > max_size:
        max_size = len(x_resampled)

In [71]:
# 使用SMOTE过采样处理不平衡的数据
X_train_smote, Y_train_smote = [], []

for x_resampled, y_resampled in resampled_data:
    smote = SMOTE(sampling_strategy={1: max_size - np.sum(y_resampled), 0: len(y_resampled) - np.sum(y_resampled)})
    x_resampled, y_resampled = smote.fit_resample(x_resampled, y_resampled)
    X_train_smote.append(x_resampled)
    Y_train_smote.append(y_resampled.values.reshape(-1, 1))


In [72]:
av_X_train_smote = np.array(X_train_smote[0])
for i in range(1, 11):
    av_X_train_smote += X_train_smote[i]
av_X_train_smote = av_X_train_smote / 11


In [73]:

X_train_smote = av_X_train_smote
Y_train_smote = np.hstack(Y_train_smote)


# 划分出来百分之二十的测试集
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_smote, Y_train_smote, test_size=0.2, random_state=42)

# 将处理后的DataFrame转换回NumPy数组


NN部分

In [74]:
# 定义模型结构
shared_input = Input(shape=(X_train.shape[1], ), name='shared_input')
shared_layer = Dense(128, activation='sigmoid')(shared_input)
shared_layer = Dense(64, activation='relu')(shared_layer)
shared_layer = Dense(32, activation='relu')(shared_layer)

定义Specific部分

In [75]:
outputs = []
for i in range(11):
    task_output = Dense(16,  activation='sigmoid', name=f'task_{i}_hidden')(shared_layer)
    task_output = Dense(1,  activation='sigmoid', name=f'task_{i}_output')(task_output)
    outputs.append(task_output)
outputs = Concatenate(axis=-1)(outputs)

训练模型并输出

In [76]:
model = Model(inputs=shared_input, outputs=outputs)
# 编译模型，添加精确率和召回率指标
model.compile(
    optimizer='adam',
    loss='binary_crossentropy', #将NN的值传入最终的separate models
    metrics=[
        tf.keras.metrics.Precision(name='precision', thresholds=[0.2, 0.5, 0.7, 0.8, 0.9, 0.95]),
        tf.keras.metrics.Recall(name='recall', thresholds=[0.2, 0.5, 0.7, 0.8, 0.9, 0.95]),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy_score')
    ])


# 训练模型
history = model.fit(X_train, Y_train, epochs=100, validation_data=(X_valid, Y_valid))
# 计算模型在训练集上的精确率和召回率
train_precision = history.history['precision'] 
train_recall = history.history['recall']
train_accuracy = history.history['accuracy_score']

# 计算模型在验证集上的精确率和召回率
val_precision = history.history['val_precision']
val_recall = history.history['val_recall']
val_accuracy = history.history['val_accuracy_score']

# 打印训练集和验证集上的精确率和召回率
print(f'Train Precision: {train_precision[-1]}')
print(f'Train Recall: {train_recall[-1]}')
print(f'Train Accuracy: {train_accuracy[-1]}')
print(f'Validation Precision: {val_precision[-1]}')
print(f'Validation Recall: {val_recall[-1]}')
print(f'Validation Accuracy: {val_accuracy[-1]}')

Epoch 1/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy_score: 0.0175 - loss: 0.7178 - precision: 0.2588 - recall: 0.2765 - val_accuracy_score: 0.0476 - val_loss: 0.6865 - val_precision: 0.1795 - val_recall: 0.3090
Epoch 2/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy_score: 0.0429 - loss: 0.6823 - precision: 0.1848 - recall: 0.3232 - val_accuracy_score: 0.0476 - val_loss: 0.6731 - val_precision: 0.1869 - val_recall: 0.3166
Epoch 3/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy_score: 0.0281 - loss: 0.6683 - precision: 0.1883 - recall: 0.2859 - val_accuracy_score: 0.0603 - val_loss: 0.6467 - val_precision: 0.1893 - val_recall: 0.3129
Epoch 4/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy_score: 0.0582 - loss: 0.6270 - precision: 0.2480 - recall: 0.3071 - val_accuracy_score: 0.2127 - val_loss: 0.6027 - val_precision: 0.3509 -