In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, f1_score

# Load the preprocessed diabetes dataset from the CSV file.
data = pd.read_csv('Processed_Dataset_of_Diabetes_Version3.csv')

# Features are the first 11 columns; the target is the last column.
X = data.iloc[:, :11]
y = data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and fit a random forest classifier (default hyperparameters, fixed seed).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out rows.
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)

# Compute the evaluation metrics.
acc = accuracy_score(y_test, y_pred)
# The columns of y_pred_proba are ordered by rf_classifier.classes_. Passing the
# labels explicitly keeps log_loss aligned with those columns even when a class
# seen during training does not occur in y_test; without it the label set is
# inferred from y_test alone and the call can fail or mis-score.
loss = log_loss(y_test, y_pred_proba, labels=rf_classifier.classes_)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics.
print(f"准确率 (ACC): {acc}")
print(f"对数损失 (Loss): {loss}")
print(f"F1分数 (F1-score): {f1}")
    

准确率 (ACC): 0.985
对数损失 (Loss): 0.062105235850144686
F1分数 (F1-score): 0.9851526794742164


In [13]:
from joblib import dump, load

In [17]:
# Serialize the fitted classifier to 'model.joblib' in the working directory.
dump(value=rf_classifier, filename='model.joblib')

['model.joblib']