# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# 1. Load the training data
# Columns that are coerced to strict 0/1 integers right after loading.
binary_columns = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                  'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost', 'DiffWalk', 'Sex']
data = pd.read_csv("/kaggle/input/datawhale/糖尿病风险预测挑战赛公开数据/train.csv")  # replace with the path to your training data file

# Threshold all listed columns at 0.5 in a single pass: values above 0.5 become 1,
# everything else becomes 0.
data[binary_columns] = (data[binary_columns] > 0.5).astype(int)
# 2. Data preprocessing
X = data.drop(columns=['id', 'target'])  # feature columns: everything except the id and the label
y = data['target']  # label column

# Hold out 20% of the rows as a validation set.
# The split is stratified on the label so the training and validation parts keep the
# same class proportions; the class weighting applied during training indicates the
# classes are imbalanced, and an unstratified split can leave a rare class
# under-represented in the validation set and distort the validation F1-score.
# NOTE: stratification requires every class to have at least 2 rows.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features. The scaler is fitted on the training part only and then
# re-used as-is for the validation part, so no validation statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# 3. Build the DNN model
# Fully connected network: one input per feature column, three ReLU hidden layers of
# decreasing width, and a 3-unit softmax output layer for multi-class classification.
hidden_units = (128, 64, 32)
model = tf.keras.Sequential(
    [tf.keras.layers.Input(shape=(X_train.shape[1],))]
    + [tf.keras.layers.Dense(units, activation='relu') for units in hidden_units]
    + [tf.keras.layers.Dense(3, activation='softmax')]
)

# Compile the model. The sparse variant of categorical cross-entropy is used, which
# takes integer class labels rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 4. Compute class weights
# Each class is weighted by (row count of the most frequent class) / (its own row count),
# so the most frequent class gets weight 1 and rarer classes get proportionally more.
class_counts = y_train.value_counts()
class_weights = {
    class_label: max(class_counts.values) / count
    for class_label, count in class_counts.items()
}

# 5. Train the model, passing the class weights
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
)

# 6. Predict on the validation set
val_predictions = model.predict(X_val)
# Pick, for every row, the index of the class with the highest predicted probability.
val_predicted_labels = val_predictions.argmax(axis=-1)

# 7. Compute the F1-score (averaged over classes, weighted by class support)
f1 = f1_score(y_val, val_predicted_labels, average='weighted')
print("Validation F1-score:", f1)

# 8. Predict on the test set
test_data = pd.read_csv("/kaggle/input/datawhale/糖尿病风险预测挑战赛公开数据/test.csv")  # replace with the path to your test data file

# Apply the same 0.5 threshold to the binary columns as was applied to the training data.
test_data[binary_columns] = (test_data[binary_columns] > 0.5).astype(int)

# Scale with the scaler fitted on the training split (it is only applied here, not re-fitted).
test_features = test_data.drop(columns=['id'])
X_test = scaler.transform(test_features)

test_predictions = model.predict(X_test)
# Pick, for every row, the index of the class with the highest predicted probability.
test_predicted_labels = test_predictions.argmax(axis=-1)

# 9. Save the predictions to a CSV file
test_data['target'] = test_predicted_labels
submission = test_data[['id', 'target']]
submission.to_csv("submission.csv", index=False)  # write the predictions to submission.csv


