<a href="https://colab.research.google.com/github/MO230101/The-codes-for-hydrogel-study-/blob/main/Selection_of_RFE_model_for_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#250502-2
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset; the first CSV column becomes the row index.
df = pd.read_csv('file name.csv', index_col=0)

# 'CA1' is the target column; every other column is a candidate feature.
y = df['CA1']
X = df.drop(columns=['CA1'])

# Number of features RFE keeps for each model.
n_features_to_select = 10

# Candidate estimators, keyed by the label used in the printed reports and
# in the output file name. All use a fixed random_state for reproducibility.
models = dict([
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression(random_state=42, solver='liblinear')),
    ('RandomForest', RandomForestClassifier(random_state=42)),
])

# Accumulators filled in by the evaluation loop:
#   results               -> per-model dict of metric name -> value
#   selected_features_all -> per-model list of selected feature names
results, selected_features_all = {}, {}

# Running "best so far" tracker; -1.0 is below any possible accuracy, so the
# first evaluated model always becomes the initial best.
best_model_name, best_accuracy = None, -1.0

# Hold out the test rows BEFORE any feature selection. Fitting RFE on the
# full data set (as was done previously) lets the test rows influence which
# features are kept, which leaks information and inflates the test metrics.
# The split does not depend on the model, so it is done once, outside the loop.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# For each candidate model: select features with RFE on the training rows,
# refit the model on the selected features, and score it on the held-out rows.
for name, model in models.items():
    print(f"\n--- {name} ---")

    # Recursive feature elimination, removing one feature per iteration,
    # fitted on the training partition only.
    rfe = RFE(estimator=model, n_features_to_select=n_features_to_select, step=1)
    rfe.fit(X_train_all, y_train)

    # Names of the features RFE kept (rfe.support_ is a boolean column mask).
    selected_feature_names = X.columns[rfe.support_]
    print(f"選択された {n_features_to_select} 個の特徴量: {selected_feature_names.tolist()}")
    selected_features_all[name] = selected_feature_names.tolist()

    # Full data restricted to the selected features. Retained so that every
    # name the original loop left in the namespace is still defined.
    X_selected = X[selected_feature_names]

    # Restrict both partitions to the selected features.
    X_train = X_train_all[selected_feature_names]
    X_test = X_test_all[selected_feature_names]

    # Fit the model on the reduced training data.
    model.fit(X_train, y_train)

    # Predict on the held-out rows. Column 1 of predict_proba is used as the
    # score for AUC; models without predict_proba get no AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Compute the evaluation metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    # Report the metrics for this model.
    print("予測性能:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    if auc is not None:
        print(f"AUC: {auc:.4f}")
    else:
        print("AUC: Not applicable")

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'AUC': auc
    }

    # Track the model with the highest test accuracy; the strict comparison
    # means that on a tie the earlier model in `models` is kept.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name

# Export the feature subset chosen by the most accurate model, together with
# the 'CA1' target column, as a CSV file.
if not best_model_name:
    # No model was recorded as best (e.g. the model registry was empty).
    print("\nモデルの評価中にエラーが発生しました。")
else:
    best_features = selected_features_all[best_model_name]

    # Selected feature columns first, target column last.
    output_columns = [*best_features, 'CA1']
    output_df = df[output_columns]

    output_filename = f'HNCO_CMSECPMGdry_Top{n_features_to_select}_Features_with_CA1_{best_model_name}.csv'

    # index=False: the row index read from the input CSV is not written out.
    output_df.to_csv(output_filename, index=False)
    print(f"\n最も予測性能の高かったモデル ({best_model_name}) で選択された {n_features_to_select} 個の特徴量と目的変数を含むデータフレームを '{output_filename}' に保存しました。")

# Summary 1: the features selected for each model.
print("\n各モデルで選択された特徴量:")
for model_label, feature_list in selected_features_all.items():
    print(f"{model_label}: {feature_list}")

# Summary 2: the metrics recorded for each model. A metric stored as None
# (AUC for a model without predict_proba) is shown as "Not applicable".
print("\n各モデルの予測性能:")
for model_label, scores in results.items():
    print(f"{model_label}:")
    for metric_name, score in scores.items():
        shown = "Not applicable" if score is None else f"{score:.4f}"
        print(f"  {metric_name}: {shown}")

print("\n処理が完了しました。")


--- GradientBoosting ---
選択された 10 個の特徴量: ['MinEStateIndex', 'MinPartialCharge', 'PEOE_VSA12', 'PEOE_VSA14', 'VSA_EState3', 'fr_Al_OH', 'DSC_area', 'DSC_MC_start', 'DSC_MC_end', 'DSC_peak_height']
予測性能:
Accuracy: 0.6250
Precision: 0.7500
Recall: 0.6000
F1-score: 0.6667
AUC: 0.5667

--- LogisticRegression ---
選択された 10 個の特徴量: ['MaxEStateIndex', 'MinEStateIndex', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'SlogP_VSA1', 'SlogP_VSA12', 'fr_Al_OH', 'DSC_MC_end', 'DSC_peak_height']
予測性能:
Accuracy: 0.8750
Precision: 0.8333
Recall: 1.0000
F1-score: 0.9091
AUC: 0.8667

--- RandomForest ---
選択された 10 個の特徴量: ['MinEStateIndex', 'MinPartialCharge', 'PEOE_VSA14', 'EState_VSA9', 'VSA_EState3', 'fr_Al_OH', 'DSC_area', 'DSC_MC_start', 'DSC_MC_end', 'DSC_peak_height']
予測性能:
Accuracy: 0.5000
Precision: 0.6000
Recall: 0.6000
F1-score: 0.6000
AUC: 0.6667

最も予測性能の高かったモデル (LogisticRegression) で選択された 10 個の特徴量と目的変数を含むデータフレームを 'HNCO_CMSECPMGdry_Top10_Features_with_CA1_LogisticRegression.csv' に保存しました。

各モデルで選択された特徴