In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# 导入数据
data = pd.read_csv("sph6004_assignment1_data.csv")

# 数据处理
# 删除缺失值大于5%的列
missing_percentage = data.isnull().mean() * 100
drop_columns = missing_percentage[missing_percentage >= 5].index
data = data.drop(columns=drop_columns)
# id跟预测无关，删除
data = data.drop(columns = "id")
# 删除人种列
data = data.drop(columns = "race")
# 删除包含缺失值的行
data = data.dropna()
# 性别用0/1表示
gender_mapping = {'F': 0, 'M': 1}
data['gender'] = data['gender'].map(gender_mapping)
# aki列进行二分
#data["aki"].replace({2: 1, 3: 1}, inplace=True)
data["aki"].replace({1: 0, 2: 1, 3: 1}, inplace=True)
#data["aki"].replace({1: 0, 2: 0, 3: 1}, inplace=True)
data.head()

Unnamed: 0,aki,gender,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,...,sodium_min.1,sodium_max.1,potassium_min.1,potassium_max.1,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,weight_admit
0,1,0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,...,131.0,138.0,4.8,6.7,15.0,6.0,5.0,4.0,0.0,110.0
1,0,0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,...,126.0,132.0,2.9,4.5,15.0,6.0,5.0,4.0,0.0,82.0
2,1,0,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,...,137.0,137.0,3.8,3.8,15.0,6.0,5.0,4.0,0.0,62.1
3,1,0,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,...,140.0,141.0,4.3,4.5,15.0,1.0,0.0,1.0,1.0,113.1
5,1,1,62.002429,78.0,105.0,90.0,80.0,154.0,114.44,42.0,...,139.0,140.0,3.6,5.2,15.0,6.0,5.0,4.0,0.0,137.9


In [3]:
# 特征重要性评分
X = data.drop('aki', axis=1)  # 特征
y = data['aki']               # 目标变量
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练随机森林模型
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 获取特征重要性
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
# 排序特征重要性
sorted_features = feature_importances.sort_values(ascending=False)
# 选取前20个最重要的特征
top_20_features = sorted_features[:20]
print(top_20_features)

weight_admit        0.052849
admission_age       0.036417
bun_max             0.030932
bun_min             0.029226
sbp_min             0.025389
resp_rate_mean      0.023210
spo2_mean           0.021941
dbp_mean            0.021924
sbp_mean            0.021604
heart_rate_mean     0.021566
glucose_max.2       0.021329
temperature_mean    0.021056
platelets_min       0.020969
glucose_max         0.020896
wbc_max             0.020804
mbp_min             0.020449
dbp_min             0.020394
platelets_max       0.020308
mbp_mean            0.020235
heart_rate_max      0.019805
dtype: float64


In [4]:
# 重新分割数据集，只使用顶部20个特征
X_top_features = X[top_20_features.index]
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=42)

In [5]:
###### 1 随机森林模型

# 训练模型
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_top, y_train)

# 预测测试集
y_pred = model.predict(X_test_top)

# 模型评估
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(f"Accuracy: {accuracy}\nROC-AUC: {roc_auc}")
print(classification_report(y_test, y_pred))

# 交叉验证
cv_scores = cross_val_score(model, X_top_features, y, cv=5, scoring='roc_auc')
print(f"CV ROC-AUC Score: {cv_scores.mean()}")

Accuracy: 0.6596739130434782
ROC-AUC: 0.6579881639433622
              precision    recall  f1-score   support

           0       0.67      0.70      0.68      4802
           1       0.65      0.62      0.64      4398

    accuracy                           0.66      9200
   macro avg       0.66      0.66      0.66      9200
weighted avg       0.66      0.66      0.66      9200

CV ROC-AUC Score: 0.7173333779601635


In [7]:
###### 2 SVM模型

# 特征标准化
scaler = StandardScaler()
X_train_top_scaled = scaler.fit_transform(X_train_top)
X_test_top_scaled = scaler.transform(X_test_top)

# 训练SVM模型
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_top_scaled, y_train)

# 预测测试集
y_pred = svm_model.predict(X_test_top_scaled)

# 模型评估
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, svm_model.predict_proba(X_test_top_scaled)[:, 1])
print(f"Accuracy: {accuracy}\nROC-AUC: {roc_auc}")
print(classification_report(y_test, y_pred))

# 交叉验证（可选）
cv_scores = cross_val_score(svm_model, scaler.transform(X_top_features), y, cv=5, scoring='roc_auc')
print(f"CV ROC-AUC Score: {cv_scores.mean()}")


Accuracy: 0.676304347826087
ROC-AUC: 0.73564391371717
              precision    recall  f1-score   support

           0       0.68      0.73      0.70      4802
           1       0.68      0.62      0.64      4398

    accuracy                           0.68      9200
   macro avg       0.68      0.67      0.67      9200
weighted avg       0.68      0.68      0.68      9200

CV ROC-AUC Score: 0.7305172768032417


In [6]:
##### 3 GBM模型

# 训练GBM模型
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train_top, y_train)

# 预测测试集
y_pred = gbm_model.predict(X_test_top)

# 模型评估
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, gbm_model.predict_proba(X_test_top)[:, 1])
print(f"Accuracy: {accuracy}\nROC-AUC: {roc_auc}")
print(classification_report(y_test, y_pred))

# 交叉验证（可选）
cv_scores = cross_val_score(gbm_model, X_top_features, y, cv=5, scoring='roc_auc')
print(f"CV ROC-AUC Score: {cv_scores.mean()}")


Accuracy: 0.6632608695652173
ROC-AUC: 0.7301819633664084
              precision    recall  f1-score   support

           0       0.67      0.71      0.69      4802
           1       0.66      0.61      0.63      4398

    accuracy                           0.66      9200
   macro avg       0.66      0.66      0.66      9200
weighted avg       0.66      0.66      0.66      9200

CV ROC-AUC Score: 0.7253650699191198
