In [None]:
# ライブラリのインポート
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score
from sklearn.utils import class_weight
from sklearn.ensemble import BaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

import shap

# Load the dataset from the Excel workbook.
df = pd.read_excel('lightGBM_data_standard.xlsx')

# Columns treated as categorical: each one is cast to string and then
# label-encoded into integer codes.
cat_cols = ['FBS', 'HbA1c', 'US', 'HbA1c_NGSP']
le = LabelEncoder()
for col in cat_cols:
    # The encoder is refit for every column, so codes are independent per column.
    df[col] = le.fit_transform(df[col].astype(str))

# Target column.
y = df['T2db']

# Optional: uncomment to drop these columns before the feature matrix is built.
# df = df.drop(['TG', 'HDL', 'LDL', 'GOT', 'GPT', 'γ_GT', 'Ht', 'Hb', 'RBC', 'chewing', 'Medication1_Blood Pressure', 'Time of blood collection _after meal', 'Medication2_Blood Sugar', 'Eating style3_midnight snack', 'One-year weight change', 'HbA1c', 'FBS', 'US', 'HbA1c_NGSP'], axis=1)

# Feature matrix: every remaining column except the target.
X = df.drop(columns='T2db')

# Undersample with imblearn's RandomUnderSampler (default strategy, fixed seed).
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Repeated hold-out training and evaluation of a LightGBM binary classifier.
#
# Every run uses its own seed for the data split and for the model. With one
# fixed seed, all runs would yield the same split, the same model and the same
# scores, so averaging over runs would be meaningless.
#
# NOTE(review): the split operates on X_resampled / y_resampled, i.e. on data
# that was undersampled before splitting, so the test metrics describe the
# resampled class distribution rather than the raw one - confirm this is intended.


def _save_confusion_matrix(cm, path):
    """Render ``cm`` as an annotated heatmap and write it to ``path``."""
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 20})
    plt.xlabel('pred')
    plt.ylabel('label')
    plt.savefig(path, dpi=300)
    plt.close()


accuracy_scores = []
f1_scores = []
precision_scores = []  # precision of each run
confusion_matrices = []

n_estimators = 10  # number of repeated runs
valid_fraction = 0.2  # share of the training split held out for early stopping

# Hyperparameters shared by all runs; only the seed changes per run.
base_params = {
    'objective': 'binary',
    'num_leaves': 5,
    'verbose': -1,
}

for run in range(n_estimators):
    # Split into training and test data with a run-specific seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled,
        test_size=0.3, shuffle=True, stratify=y_resampled, random_state=run)
    print('X_trainの形状：', X_train.shape, ' y_trainの形状：', y_train.shape, ' X_testの形状：', X_test.shape, ' y_testの形状：', y_test.shape)

    # Hold out part of the training data for early stopping. Monitoring only
    # the training set does not work: LightGBM ignores it for early stopping.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train, y_train,
        test_size=valid_fraction, shuffle=True, stratify=y_train, random_state=run)

    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(X_fit, y_fit)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    params = {**base_params, 'seed': run}

    # Train the model. Early stopping is passed as a callback because the
    # ``early_stopping_rounds`` keyword was removed from ``lgb.train`` in
    # LightGBM 4.0; the callback form works on older releases as well.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=500,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=20)],
    )

    # Predict on the test data and evaluate.
    y_test_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)  # probability of label 1
    y_test_pred = (y_test_pred_proba > 0.5).astype(int)  # probability -> label 0 or 1

    ac_score = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)

    # Confusion matrix of this run, saved under a run-specific file name so
    # later runs do not overwrite earlier ones.
    cm = confusion_matrix(y_test, y_test_pred)
    _save_confusion_matrix(cm, f'confusion_matrix_{run}.png')

    accuracy_scores.append(ac_score)
    f1_scores.append(f1)
    precision_scores.append(precision)
    confusion_matrices.append(cm)

# Confusion matrix summed over all runs, written to the original file name.
_save_confusion_matrix(np.sum(confusion_matrices, axis=0), 'confusion_matrix.png')

# Report the mean of each metric over all runs.
print('accuracy = %.2f' % np.mean(accuracy_scores))
print('F1-score = %.2f' % np.mean(f1_scores))
print('Precision = %.2f' % np.mean(precision_scores))