In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np
# End-to-end script: load the diabetes-risk data, train a LightGBM multiclass
# model with early stopping on a hold-out split, report macro-F1 on that split,
# and write test-set predictions to a submission CSV.

# Load the competition train / test tables.
train_data = pd.read_csv('/kaggle/input/datawhale/糖尿病风险预测挑战赛公开数据/train.csv')
test_data = pd.read_csv('/kaggle/input/datawhale/糖尿病风险预测挑战赛公开数据/test.csv')

# Feature selection / preprocessing is assumed to have been done already.
# The feature column names must be identical in the train and test tables.

# Separate features from the label ('id' is an identifier, not a feature).
X_train = train_data.drop(columns=['id', 'target'])
y_train = train_data['target']

# Remember the exact feature columns (names and order) used for fitting so the
# test set can be aligned to them at prediction time.
feature_columns = list(X_train.columns)

# Hold out 20% of the training rows for validation / early stopping.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Wrap the splits in LightGBM datasets; the validation set reuses the
# training set's binning via `reference`.
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# LightGBM parameters.
# NOTE: the number of boosting rounds is controlled only by `num_boost_round`
# below. The former 'n_estimators' entry was an alias of `num_iterations` and
# silently overrode `num_boost_round`, so it has been removed.
# NOTE: 'is_unbalance' was removed because LightGBM documents it as used only
# by the 'binary' and 'multiclassova' objectives; it had no effect here.
params = {
    'objective': 'multiclass',  # multi-class classification
    'num_class': 3,  # three target classes
    'metric': 'multi_logloss',  # evaluation metric: multi-class log loss
    'max_depth': 6,  # maximum tree depth
    'min_child_weight': 1,  # minimum sum of hessians in a leaf
    'subsample': 0.8,  # fraction of rows sampled for bagging
    'subsample_freq': 1,  # re-sample rows every iteration; without this 'subsample' is ignored
    'colsample_bytree': 0.8,  # fraction of features sampled per tree
    'learning_rate': 0.1,  # shrinkage rate
    'seed': 42,  # random seed for reproducibility
}

# Train with early stopping on the validation set. Early stopping and periodic
# logging are passed as callbacks because the `early_stopping_rounds` and
# `verbose_eval` keyword arguments were removed from `lgb.train` in LightGBM 4.
model = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# Predict class probabilities on the validation set and take the most
# probable class per row.
y_pred = model.predict(X_val, num_iteration=model.best_iteration)
y_pred_class = np.argmax(y_pred, axis=1)

# Macro-averaged F1 on the validation split.
f1 = f1_score(y_val, y_pred_class, average='macro')
print(f'Validation F1 Score: {f1}')

# Predict on the test set, using exactly the columns the model was trained on.
test_pred = model.predict(test_data[feature_columns], num_iteration=model.best_iteration)
test_pred_class = np.argmax(test_pred, axis=1)

# Assemble the submission table.
results = pd.DataFrame({'id': test_data['id'], 'target': test_pred_class})

# Write the submission CSV.
results.to_csv('submission_lgbm.csv', index=False)

