In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 

from pycaret.classification import *
import os
import joblib
import numpy as np
import shutil

In [2]:
selected_descriptor = pd.read_csv('../data/descriptor_selection.csv')

# Build a lookup from each column header of the selection table to the list of
# non-null entries found in that column. Headers that are empty, and columns
# that contain no values at all, are left out of the lookup.
file_md_list = {}
for filename, column_values in selected_descriptor.items():
    selected_columns = column_values.dropna().tolist()
    if not (filename and selected_columns):
        continue
    file_md_list[filename] = selected_columns

In [None]:
# Plot id (passed as `plot=` to plot_model) -> the PNG file name that the
# training loop below looks for in the working directory after calling
# plot_model(..., save=True), before moving it into the result folder.
default_filenames = dict(
    auc='AUC.png',
    confusion_matrix='Confusion Matrix.png',
    learning='Learning Curve.png',
    feature='Feature Importance.png',
    error='Prediction Error.png',
    calibration='Calibration Curve.png',
)

# Train, tune and export the top-5 PyCaret classifiers for every
# (ratio, ignore3D) dataset variant.
#
# For each variant the following is written below ../result/<base_path>/:
#   models/<ratio>_<Estimator>_model.pkl    tuned estimator (joblib dump)
#   <ratio>_<Estimator>_evaluation.csv      CV score grid pulled after tuning
#   plots/<ratio>_<Estimator>_<plot>.png    one file per entry of default_filenames
#   <ratio>_summary_evaluation.csv          mean / std of the fold metrics per model

# Labels of the aggregate rows that PyCaret appends below the per-fold rows of
# a CV score grid ('SD' is the label used by older releases). These rows must
# not be averaged together with the folds when building the summary.
AGGREGATE_ROW_LABELS = {'Mean', 'Std', 'SD'}

# The X1..X1024 column names do not depend on the dataset, so build them once.
# (presumably the fingerprint bits, given the "FP" in the result path -- TODO confirm)
fp_cols = [f'X{i+1}' for i in range(1024)]

for ratio in ['5x','10x','20x']:
    for TF in ['True','False']:
        file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_{TF}.csv'
        base_path = f'FTO_FP+weka_MD/{ratio}_md{TF}'.replace('True', 'T').replace('False', 'F')
        df = pd.read_csv(f"../data/preprocessed/filtered_FTO_training_{ratio}_ignore3D_{TF}.csv")
        print(f"{'='*50}{base_path} start{'='*50}")

        # Create the output folders
        result_dir = os.path.join("../result", base_path)
        model_save_dir = os.path.join(result_dir, "models")
        plot_save_dir = os.path.join(result_dir, "plots")
        os.makedirs(result_dir, exist_ok=True)
        os.makedirs(model_save_dir, exist_ok=True)
        os.makedirs(plot_save_dir, exist_ok=True)

        # Keep only the target, the X1..X1024 columns and the descriptor
        # columns selected for this particular file.
        md_cols = file_md_list[file_name]
        filtered_df = df[['potency']+fp_cols+md_cols]

        X = filtered_df.drop('potency', axis=1)
        Y = filtered_df['potency']

        # NOTE(review): x_test / y_test are not used anywhere in this cell, so
        # the 10% hold-out is never scored here. setup() below additionally
        # keeps back 10% of df_train (train_size=0.9) as its own hold-out.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y,
            test_size=0.1,
            random_state=42,
            stratify=Y
        )

        # Build the models
        all_comparison_df = {}  # NOTE(review): never populated in this cell
        df_train = pd.concat([x_train, y_train], axis=1)

        setup(
            data=df_train,
            target='potency',
            session_id=42,
            train_size=0.9,
            fold=10,
            normalize=True,
            fix_imbalance=True,
            remove_outliers=True,
            n_jobs=-1, 
            verbose=False
        )
        compare_models(fold=5, sort='F1', n_select=5, turbo=True) 
        results_df = pull()

        # Tune the top 5 models (the comparison grid is already sorted by F1)
        top_models = results_df.index[:5].tolist()
        summary_data = []

        for model_id in top_models:
            print(f"Tuning {model_id}...")
            
            # Create and tune the model
            model = create_model(model_id, verbose=False)
            tuned_model = tune_model(
                model, 
                optimize='F1',
                n_iter=50,
                fold=5,
                choose_better=True,
                verbose=False
            )
            
            # Fetch the score grid of the tuning run.
            # NOTE(review): with choose_better=True the returned estimator may
            # be the untuned one while this grid still describes the tuned
            # candidate -- confirm against the PyCaret version in use.
            results = pull()
            model_name = tuned_model.__class__.__name__
            
            # Save the tuned model (.pkl)
            # NOTE(review): joblib.dump stores only the object returned by
            # tune_model; whether the preprocessing configured in setup()
            # (normalize, fix_imbalance, remove_outliers) is included should be
            # verified before the file is used for inference.
            model_path = os.path.join(model_save_dir, f"{ratio}_{model_name}_model.pkl")
            joblib.dump(tuned_model, model_path)
            
            # Save the evaluation grid (.csv). The index is written as well,
            # because it holds the fold number / 'Mean' / 'Std' labels that are
            # the only way to tell the rows apart.
            eval_path = os.path.join(result_dir, f"{ratio}_{model_name}_evaluation.csv")
            results.to_csv(eval_path, index=True)
            
            # Create every plot for this model
            for plot_type, default_name in default_filenames.items():
                try:
                    plot_model(tuned_model, plot=plot_type, save=True, verbose=False)
                    
                    final_filename = f"{ratio}_{model_name}_{plot_type}.png"
                    final_save_path = os.path.join(plot_save_dir, final_filename)
                    
                    if os.path.exists(default_name):
                        shutil.move(default_name, final_save_path)
                    else:
                        # Report the missing file instead of skipping it silently.
                        print(f"     - [{plot_type}] Plot 파일을 찾을 수 없음: {default_name}")
                        
                except Exception as e:
                    # Best effort: some estimators cannot produce every plot
                    # (e.g. no predict_proba), so report and continue.
                    print(f"     - [{plot_type}] Plot 생성 실패: {e}")
            
            # Add the summary rows. Only the per-fold rows are aggregated; the
            # 'Mean' / 'Std' rows already present in the grid would otherwise be
            # treated as two extra folds and distort both statistics.
            is_fold_row = np.array(
                [str(label) not in AGGREGATE_ROW_LABELS for label in results.index],
                dtype=bool,
            )
            fold_results = results.loc[is_fold_row]
            if fold_results.empty:
                # Unexpected grid layout: fall back to using every row.
                fold_results = results

            numeric_cols = fold_results.select_dtypes(include=[np.number]).columns
            avg_row = fold_results[numeric_cols].mean().to_dict()
            std_row = fold_results[numeric_cols].std().to_dict()
            
            avg_row.update({'Model': model_name, 'Type': 'Mean'})
            std_row.update({'Model': model_name, 'Type': 'Std'})
            
            summary_data.extend([avg_row, std_row])

        combined_summary = pd.DataFrame(summary_data)
        summary_path = os.path.join(result_dir, f"{ratio}_summary_evaluation.csv")
        combined_summary.to_csv(summary_path, index=False)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9483,0.9643,0.8009,0.8836,0.8387,0.808,0.8105,0.95
et,Extra Trees Classifier,0.9483,0.9604,0.7852,0.8942,0.8358,0.8052,0.8079,0.246
gbc,Gradient Boosting Classifier,0.9457,0.9622,0.8113,0.8586,0.8329,0.8005,0.8019,0.894
svm,SVM - Linear Kernel,0.9448,0.9602,0.7487,0.9101,0.8194,0.7873,0.7938,0.192
lightgbm,Light Gradient Boosting Machine,0.9439,0.9606,0.7484,0.9024,0.8172,0.7844,0.7899,0.396
rf,Random Forest Classifier,0.9422,0.968,0.7538,0.8842,0.8133,0.7794,0.7832,0.25
ada,Ada Boost Classifier,0.9247,0.9202,0.749,0.7901,0.7682,0.7233,0.7242,0.356
dt,Decision Tree Classifier,0.8905,0.8253,0.7274,0.6616,0.6899,0.6239,0.6271,0.218
nb,Naive Bayes,0.8888,0.7156,0.4553,0.7956,0.5784,0.5196,0.547,0.65
ridge,Ridge Classifier,0.753,0.8243,0.7745,0.385,0.5131,0.3727,0.4143,0.196


Tuning lr...
Tuning et...
Tuning gbc...
Tuning svm...
     - [auc] Plot 생성 실패: AUC plot not available for estimators with no predict_proba attribute.
     - [calibration] Plot 생성 실패: Calibration plot not available for estimators with no predict_proba attribute.
Tuning lightgbm...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9518,0.962,0.8062,0.8966,0.8486,0.82,0.822,0.97
lr,Logistic Regression,0.9492,0.9656,0.8063,0.8837,0.8422,0.812,0.814,0.208
lightgbm,Light Gradient Boosting Machine,0.9483,0.9593,0.7695,0.9078,0.8324,0.8022,0.8063,0.414
et,Extra Trees Classifier,0.9466,0.9673,0.78,0.8885,0.8304,0.7988,0.8014,0.264
rf,Random Forest Classifier,0.9422,0.9622,0.754,0.8843,0.8135,0.7796,0.7833,0.26
svm,SVM - Linear Kernel,0.9387,0.9545,0.7012,0.9136,0.7919,0.7568,0.7668,0.198
ada,Ada Boost Classifier,0.9299,0.9195,0.7645,0.8075,0.7843,0.7426,0.7437,0.362
dt,Decision Tree Classifier,0.8966,0.8373,0.7483,0.6764,0.7083,0.6459,0.6487,0.208
nb,Naive Bayes,0.8879,0.715,0.4553,0.7855,0.5763,0.5168,0.5425,0.212
ridge,Ridge Classifier,0.7574,0.8303,0.764,0.3896,0.5149,0.3763,0.4146,0.204


Tuning gbc...
Tuning lr...
Tuning lightgbm...
Tuning et...
Tuning rf...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9713,0.9686,0.8421,0.843,0.8421,0.8263,0.8266,0.286
et,Extra Trees Classifier,0.9694,0.9644,0.7737,0.8769,0.8201,0.8035,0.8065,0.38
gbc,Gradient Boosting Classifier,0.968,0.9681,0.7895,0.8485,0.8171,0.7996,0.8007,1.77
lightgbm,Light Gradient Boosting Machine,0.9661,0.9647,0.7211,0.8844,0.7935,0.7753,0.7806,0.662
rf,Random Forest Classifier,0.9642,0.968,0.7105,0.8719,0.7817,0.7625,0.7679,0.362
ada,Ada Boost Classifier,0.9556,0.9501,0.7526,0.7567,0.7538,0.7294,0.7299,0.584
svm,SVM - Linear Kernel,0.957,0.9654,0.6211,0.8745,0.7229,0.7004,0.7143,0.252
ridge,Ridge Classifier,0.9207,0.9379,0.8421,0.5446,0.6599,0.6176,0.6373,0.25
lda,Linear Discriminant Analysis,0.9197,0.9372,0.8421,0.541,0.6572,0.6145,0.6347,0.472
dt,Decision Tree Classifier,0.9326,0.8303,0.7053,0.6123,0.6551,0.618,0.6201,0.314


Tuning lr...
Tuning et...
Tuning gbc...
Tuning lightgbm...
Tuning rf...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9713,0.9661,0.8421,0.8446,0.8426,0.8268,0.8273,0.268
et,Extra Trees Classifier,0.9704,0.9696,0.7789,0.8811,0.8263,0.8102,0.8123,0.352
gbc,Gradient Boosting Classifier,0.968,0.9668,0.7895,0.8496,0.818,0.8005,0.8014,1.736
lightgbm,Light Gradient Boosting Machine,0.9685,0.9621,0.7579,0.8789,0.8135,0.7964,0.7993,0.642
svm,SVM - Linear Kernel,0.9666,0.9597,0.7368,0.8797,0.7995,0.7815,0.7865,0.244
rf,Random Forest Classifier,0.9665,0.9695,0.7158,0.8964,0.7949,0.777,0.7834,0.338
ada,Ada Boost Classifier,0.9589,0.9439,0.7579,0.7839,0.7699,0.7474,0.748,0.554
ridge,Ridge Classifier,0.9135,0.9356,0.8316,0.5181,0.6375,0.5916,0.6135,0.252
lda,Linear Discriminant Analysis,0.913,0.9352,0.8316,0.5159,0.636,0.5899,0.612,0.46
dt,Decision Tree Classifier,0.9269,0.8248,0.7,0.5826,0.6356,0.5954,0.5987,0.29


Tuning lr...
Tuning et...
Tuning gbc...
Tuning lightgbm...
Tuning svm...
     - [auc] Plot 생성 실패: AUC plot not available for estimators with no predict_proba attribute.
     - [calibration] Plot 생성 실패: Calibration plot not available for estimators with no predict_proba attribute.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.986,0.9728,0.7947,0.8982,0.8431,0.8358,0.8376,0.952
rf,Random Forest Classifier,0.9842,0.9732,0.7421,0.9119,0.817,0.8089,0.8142,0.546
et,Extra Trees Classifier,0.983,0.9779,0.7579,0.8678,0.8081,0.7993,0.8018,0.572
lr,Logistic Regression,0.981,0.9626,0.8263,0.7867,0.805,0.795,0.7959,0.414
gbc,Gradient Boosting Classifier,0.9815,0.9674,0.7842,0.8199,0.8012,0.7915,0.792,3.114
ridge,Ridge Classifier,0.9727,0.9478,0.8474,0.6705,0.7476,0.7334,0.7396,0.364
lda,Linear Discriminant Analysis,0.9725,0.9477,0.8474,0.6681,0.7459,0.7316,0.7381,0.672
ada,Ada Boost Classifier,0.9742,0.9331,0.7263,0.7329,0.7292,0.7157,0.7159,0.954
dt,Decision Tree Classifier,0.9605,0.8367,0.7,0.5682,0.6263,0.6057,0.6099,0.46
svm,SVM - Linear Kernel,0.9712,0.9708,0.4632,0.8694,0.6042,0.5907,0.6225,0.38


Tuning lightgbm...
Tuning rf...
Tuning et...
Tuning lr...
Tuning gbc...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.985,0.9694,0.7842,0.8864,0.8316,0.8238,0.8258,1.03
lr,Logistic Regression,0.9822,0.9641,0.8158,0.8117,0.8134,0.8041,0.8042,0.426
et,Extra Trees Classifier,0.9832,0.9677,0.7526,0.8768,0.8089,0.8002,0.8033,0.598
rf,Random Forest Classifier,0.983,0.9797,0.7368,0.8864,0.8033,0.7945,0.799,0.538
gbc,Gradient Boosting Classifier,0.9815,0.9673,0.7789,0.8239,0.8005,0.7908,0.7913,3.27
ridge,Ridge Classifier,0.9747,0.9492,0.8474,0.6975,0.7631,0.7499,0.755,0.388
lda,Linear Discriminant Analysis,0.9747,0.9491,0.8474,0.6975,0.7631,0.7499,0.755,0.74
ada,Ada Boost Classifier,0.9765,0.9555,0.7526,0.7549,0.7531,0.7408,0.7411,0.972
svm,SVM - Linear Kernel,0.976,0.9718,0.5737,0.8795,0.6923,0.6805,0.6983,0.38
dt,Decision Tree Classifier,0.9585,0.8207,0.6684,0.5525,0.6015,0.5799,0.5847,0.484


Tuning lightgbm...
Tuning lr...
Tuning et...
Tuning rf...
Tuning gbc...
