In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pycaret.classification import *
import os
import joblib
import shutil
import warnings
import matplotlib.pyplot as plt
import shap

warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# 세팅

## 사용할 descriptor

In [2]:
selected_descriptor = pd.read_csv('../data/descriptor_selection.csv')

file_md_list = {}
for column in selected_descriptor.columns:
    filename = column
    selected_columns = selected_descriptor[column].iloc[0:].dropna().tolist()
    if filename and selected_columns:
        file_md_list[filename] = selected_columns

## 저장 함수

In [3]:
def safe_move_file(src, dst):
    if os.path.exists(src):
        try:
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.move(src, dst)
            return True
        except Exception as e:
            print(f"파일 이동 실패: {src} -> {dst}, 에러: {e}")
            return False
    return False

def safe_save_csv(df, path):
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        df.to_csv(path, index=False)
        return True
    except Exception as e:
        print(f"CSV 저장 실패: {path}, 에러: {e}")
        return False

In [4]:
default_filenames = {
    'auc': 'AUC.png',
    'confusion_matrix': 'Confusion Matrix.png',
    'learning': 'Learning Curve.png',
    'feature': 'Feature Importance.png',
    'error': 'Prediction Error.png',
    'calibration': 'Calibration Curve.png'
}

target_models = ['lr', 'et', 'gbc', 'lightgbm', 'svm', 'rf', 'ada']

In [5]:
data_dir = os.path.join("..", "data", "preprocessed")
result_dir = os.path.join("..", "result")

# 모델 학습 및 평가

## 개별 모델 학습 및 블랜딩할 모델 선정

In [6]:
for ratio in ['5x', '10x']:
    file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
    base_path = f'FTO_Final/{ratio}_w3D'
    
    # 데이터 파일 경로
    data_path = os.path.join(data_dir, f"filtered_FTO_training_{ratio}_ignore3D_False.csv")
    df = pd.read_csv(data_path)
    print(f"{'='*50}{base_path} start{'='*50}")

    # 저장 폴더 생성
    full_result_path = os.path.join(result_dir, base_path)
    models_dir = os.path.join(full_result_path, "models")
    plots_dir = os.path.join(full_result_path, "plots")
    
    for dir_path in [full_result_path, models_dir, plots_dir]:
        os.makedirs(dir_path, exist_ok=True)

    md_cols = file_md_list[file_name]
    fp_cols = [f'X{i+1}' for i in range(1024)]
    filtered_df = df[['potency'] + fp_cols + md_cols]

    X = filtered_df.drop('potency', axis=1)
    Y = filtered_df['potency']

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)
    df_train = pd.concat([x_train, y_train], axis=1)

    setup(
        data=df_train, 
        target='potency',
        session_id=42,
        train_size=0.9,
        fold=10,
        normalize=True,
        fix_imbalance=True,
        remove_outliers=True,
        n_jobs=-1, 
        verbose=False
    )
    
    summary_data = []
    for model_id in target_models:
        model = create_model(model_id, verbose=False)
        print(f"Tuning {model_id}...")
        
        tuned_model = tune_model(
            model, 
            optimize='F1',
            n_iter=50,
            fold=5,
            choose_better=True,
            verbose=False
        )
        results = pull()
        model_name = tuned_model.__class__.__name__
        
        # 모델 저장
        model_path = os.path.join(models_dir, f"{ratio}_{model_name}_model.pkl")
        joblib.dump(tuned_model, model_path)
        print(f"모델 저장 완료")
        
        # 평가 결과 저장
        eval_path = os.path.join(full_result_path, f"{ratio}_{model_name}_evaluation.csv")
        safe_save_csv(results, eval_path)
        print(f"평가 결과 저장 완료")
        
        # 플롯 생성 및 저장
        for plot_type, default_name in default_filenames.items():
            try:
                print(f"{plot_type} // {default_name}")
                plot_model(tuned_model, plot=plot_type, save=True, verbose=False)
                
                if os.path.exists(default_name):
                    final_filename = f"{ratio}_{model_name}_{plot_type}.png"
                    final_save_path = os.path.join(plots_dir, final_filename)
                    shutil.move(default_name, final_save_path)
                    print(f"{default_name} 저장 완료")
                else:
                    print(f"{default_name} 파일이 생성되지 않았습니다.")
                    
            except Exception as e:
                print(f"[{plot_type}] Plot 생성 실패: {e}")
                
                error_filename = f"{ratio}_{model_name}_{plot_type}_error.txt"
                error_path = os.path.join(plots_dir, error_filename)
                with open(error_path, 'w', encoding='utf-8') as f:
                    f.write(f"Plot Type: {plot_type}\n")
                    f.write(f"Model: {model_name}\n")
                    f.write(f"Error: {str(e)}\n")
                print(f"에러 정보가 {error_filename}에 저장되었습니다.")
        
        numeric_cols = results.select_dtypes(include=[np.number]).columns
        avg_row = results[numeric_cols].mean().to_dict()
        std_row = results[numeric_cols].std().to_dict()
        
        avg_row.update({'Model': model_name, 'Type': 'Mean'})
        std_row.update({'Model': model_name, 'Type': 'Std'})
        
        summary_data.extend([avg_row, std_row])

    # 전체 요약 저장
    if summary_data:
        combined_summary = pd.DataFrame(summary_data)
        summary_path = os.path.join(full_result_path, f"{ratio}_summary_evaluation.csv")
        safe_save_csv(combined_summary, summary_path)

    print(f"{'='*50}{base_path} completed{'='*50}")



  File "c:\Users\user\anaconda3\envs\cuda\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\user\anaconda3\envs\cuda\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\user\anaconda3\envs\cuda\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\user\anaconda3\envs\cuda\lib\subprocess.py", line 1436, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Tuning lr...
모델 저장 완료
평가 결과 저장 완료
auc // AUC.png
AUC.png 저장 완료
confusion_matrix // Confusion Matrix.png
Confusion Matrix.png 저장 완료
learning // Learning Curve.png
Learning Curve.png 저장 완료
feature // Feature Importance.png
Feature Importance.png 저장 완료
error // Prediction Error.png
Prediction Error.png 저장 완료
calibration // Calibration Curve.png
Calibration Curve.png 저장 완료
Tuning et...
모델 저장 완료
평가 결과 저장 완료
auc // AUC.png
AUC.png 저장 완료
confusion_matrix // Confusion Matrix.png
Confusion Matrix.png 저장 완료
learning // Learning Curve.png
Learning Curve.png 저장 완료
feature // Feature Importance.png
Feature Importance.png 저장 완료
error // Prediction Error.png
Prediction Error.png 저장 완료
calibration // Calibration Curve.png
Calibration Curve.png 저장 완료
Tuning gbc...
모델 저장 완료
평가 결과 저장 완료
auc // AUC.png
AUC.png 저장 완료
confusion_matrix // Confusion Matrix.png
Confusion Matrix.png 저장 완료
learning // Learning Curve.png
Learning Curve.png 저장 완료
feature // Feature Importance.png
Feature Importance.png 저장 완료
error

KeyboardInterrupt: 

## 블랜딩 모델 생성

In [None]:
blend_models_list = ['et', 'gbc', 'lightgbm', 'lr']

all_blended_results = {}

for ratio in ['5x', '10x']:
    file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
    base_path = f'FTO_Final/{ratio}_w3D'
    
    data_path = os.path.join(data_dir, f"filtered_FTO_training_{ratio}_ignore3D_False.csv")
    df = pd.read_csv(data_path)
    print(f"{'='*30}{ratio} Blending Start{'='*30}")

    full_result_path = os.path.join(result_dir, base_path)
    blend_models_dir = os.path.join(full_result_path, "blend_models")
    blend_plots_dir = os.path.join(full_result_path, "blend_plots")
    
    for dir_path in [blend_models_dir, blend_plots_dir]:
        os.makedirs(dir_path, exist_ok=True)

    md_cols = file_md_list[file_name]
    fp_cols = [f'X{i+1}' for i in range(1024)]
    filtered_df = df[['potency'] + fp_cols + md_cols]

    X = filtered_df.drop('potency', axis=1)
    Y = filtered_df['potency']

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)
    df_train = pd.concat([x_train, y_train], axis=1)

    exp = setup(
        data=df_train, 
        target='potency',
        session_id=42,
        train_size=0.9,
        fold=10,
        normalize=True,
        fix_imbalance=True,
        remove_outliers=True,
        n_jobs=1,
        verbose=False,
    )
    
    fresh_models = []
    model_names = []
    individual_results = {}
    
    for model_id in blend_models_list:
        print(f"Creating fresh {model_id} model...")
        if model_id == 'lightgbm':
            lgb_params = {
                'boosting_type': 'gbdt',
                'num_leaves': 31,
                'learning_rate': 0.1,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'min_child_samples': 20,
                'random_state': 42,
                'n_estimators': 100,
                'verbosity': -1
            }
            fresh_model = create_model(model_id, verbose=False, **lgb_params)
        else:
            fresh_model = create_model(model_id, verbose=False)
            
        tuned_model = tune_model(
            fresh_model, 
            optimize='F1',
            n_iter=50,
            fold=5,
            choose_better=True,
            verbose=False
        )
        model_results = pull()
        individual_results[model_id] = model_results
        
        fresh_models.append(tuned_model)
        model_names.append(tuned_model.__class__.__name__)
        
        print(f"{model_id} tuning completed")        
    
    print("Attempting to create blend model...")
    blend_success = False
    
    blend_attempts = [
            ]
    
    blended_model = blend_models(
        estimator_list=fresh_models,
        verbose=False,
        **{'fold': 3, 'method': 'soft'},
    )
    blend_results = pull()
    blend_success = True

    blend_model_path = os.path.join(blend_models_dir, f"{ratio}_blended_model2.pkl")
    joblib.dump(blended_model, blend_model_path)
    print(f"블렌드 모델 저장 완료: {blend_model_path}")
    
    blend_eval_path = os.path.join(full_result_path, f"{ratio}_blend_evaluation.csv")
    safe_save_csv(blend_results, blend_eval_path)
    
    # 테스트 데이터 예측
    test_data = pd.concat([x_test, y_test], axis=1)
    try:
        final_predictions = predict_model(blended_model, data=test_data, verbose=False)
        final_metrics = pull()
        final_metrics_path = os.path.join(full_result_path, f"{ratio}_blend_final_metrics.csv")
        safe_save_csv(final_metrics, final_metrics_path)
        print("최종 예측 완료")
    except Exception as e:
        print(f"Final prediction failed: {e}")
        final_metrics = None
    
    # 플롯 생성
    plot_types = ['auc', 'confusion_matrix', 'learning', 'feature']
    for plot_type in plot_types:
        try:
            print(f"Generating {plot_type} plot...")
            plot_model(blended_model, plot=plot_type, save=True, verbose=False)
            
            default_name = default_filenames.get(plot_type, f'{plot_type}.png')
            if os.path.exists(default_name):
                final_filename = f"{ratio}_blend_{plot_type}.png"
                final_save_path = os.path.join(blend_plots_dir, final_filename)
                safe_move_file(default_name, final_save_path)
                print(f"{plot_type} 플롯 저장 완료")
                
        except Exception as e:
            print(f"[{plot_type}] Plot 생성 실패: {e}")
            error_filename = f"{ratio}_blend_{plot_type}_error.txt"
            error_path = os.path.join(blend_plots_dir, error_filename)
            with open(error_path, 'w', encoding='utf-8') as f:
                f.write(f"Plot Type: {plot_type}\nModel: Blended\nError: {str(e)}\n")
    
    # 개별 모델 결과 저장
    for model_id, results in individual_results.items():
        individual_path = os.path.join(full_result_path, f"{ratio}_{model_id}_individual_results.csv")
        safe_save_csv(results, individual_path)
    
    all_blended_results[ratio] = {
        'blend_results': blend_results,
        'final_metrics': final_metrics,
        'model_names': model_names,
        'individual_results': individual_results
    }
    
    print(f"블렌딩 성공: {', '.join(model_names)}")    
    print(f"{'='*30}{ratio} Blending Complete{'='*30}")

print("\n블렌딩 결과 요약:")
for ratio, results in all_blended_results.items():
    print(f"\n[{ratio}] Blend Model:")
    print(f"  - 사용된 모델: {', '.join(results['model_names'])}")
    if 'blend_results' in results and results['blend_results'] is not None:
        if 'F1' in results['blend_results'].columns:
            blend_f1 = results['blend_results']['F1'].mean()
            print(f"  - 블렌드 모델 평균 F1: {blend_f1:.4f}")
    print("  - 개별 모델 F1 점수:")
    for model_id, result_df in results['individual_results'].items():
        if 'F1' in result_df.columns:
            individual_f1 = result_df['F1'].mean()
            print(f"    {model_id}: {individual_f1:.4f}")

print("\n모든 블렌딩 작업이 완료되었습니다!")

Creating fresh et model...
et tuning completed
Creating fresh gbc model...
gbc tuning completed
Creating fresh lightgbm model...
lightgbm tuning completed
Creating fresh lr model...
lr tuning completed
Attempting to create blend model...
블렌드 모델 저장 완료: ..\result\FTO_Final/5x_w3D\blend_models\5x_blended_model2.pkl
최종 예측 완료
Generating auc plot...
auc 플롯 저장 완료
Generating confusion_matrix plot...
confusion_matrix 플롯 저장 완료
Generating learning plot...
learning 플롯 저장 완료
Generating feature plot...
[feature] Plot 생성 실패: Feature Importance and RFE plots not available for estimators that doesnt support coef_ or feature_importances_ attribute.
블렌딩 성공: ExtraTreesClassifier, GradientBoostingClassifier, LGBMClassifier, LogisticRegression
Creating fresh et model...
et tuning completed
Creating fresh gbc model...
gbc tuning completed
Creating fresh lightgbm model...
lightgbm tuning completed
Creating fresh lr model...
lr tuning completed
Attempting to create blend model...
블렌드 모델 저장 완료: ..\result\FTO_Fi

# SHAP

In [None]:
for ratio in ['5x', '10x']:
    print(f"\n{'='*50} {ratio} {'='*50}")

    base_path = f'FTO_Final/{ratio}_w3D'
    full_result_path = os.path.join(result_dir, base_path)
    shap_dir = os.path.join(full_result_path, "SHAP")
    os.makedirs(shap_dir, exist_ok=True)

    # 데이터 로드
    file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
    data_path = os.path.join(data_dir, f"filtered_FTO_training_{ratio}_ignore3D_False.csv")
    df = pd.read_csv(data_path)

    md_cols = file_md_list[file_name]
    fp_cols = [f'X{i+1}' for i in range(1024)]
    filtered_df = df[['potency'] + fp_cols + md_cols]

    X = filtered_df.drop('potency', axis=1)
    Y = filtered_df['potency']

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.1, random_state=42, stratify=Y
    )
    df_train = pd.concat([x_train, y_train], axis=1)

    # PyCaret setup (전처리 파이프라인 복원용)
    exp = setup(
        data=df_train,
        target='potency',
        session_id=42,
        train_size=0.9,
        fold=10,
        normalize=True,
        fix_imbalance=True,
        remove_outliers=True,
        n_jobs=1,
        verbose=False
    )

    # 전처리 파이프라인으로 변환
    pipeline = get_config('pipeline')
    X_test_transformed = pipeline.transform(x_test)
    X_train_transformed = pipeline.transform(x_train)

    if hasattr(X_test_transformed, 'columns'):
        feature_names = X_test_transformed.columns.tolist()
        X_test_df = X_test_transformed
        X_train_df = X_train_transformed
    else:
        feature_names = [f'f{i}' for i in range(X_test_transformed.shape[1])]
        X_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)
        X_train_df = pd.DataFrame(X_train_transformed, columns=feature_names)

    # Best 모델 선택 (AUC 기준)
    summary_path = os.path.join(full_result_path, f"{ratio}_summary_evaluation.csv")
    summary_df = pd.read_csv(summary_path)
    mean_df = summary_df[summary_df['Type'] == 'Mean'].copy()
    best_row = mean_df.loc[mean_df['AUC'].idxmax()]
    best_model_name = best_row['Model']
    print(f"Best 모델: {best_model_name} (AUC: {best_row['AUC']:.4f})")

    # 모델 로드 (joblib)
    model_path = os.path.join(full_result_path, "models", f"{ratio}_{best_model_name}_model.pkl")
    best_model = joblib.load(model_path)

    # SHAP 계산
    if 'LogisticRegression' in best_model_name:
        explainer = shap.LinearExplainer(
            best_model,
            X_train_df,
            feature_perturbation="correlation_dependent"
        )
        shap_values_class1 = explainer.shap_values(X_test_df)
        print("LinearExplainer 사용")
    else:
        try:
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X_test_df)
            shap_values_class1 = shap_values[1] if isinstance(shap_values, list) else shap_values
            print("TreeExplainer 사용")
        except Exception as e:
            print(f"TreeExplainer 실패: {e} -> KernelExplainer 사용")
            X_background = shap.sample(X_train_df, 50)
            explainer = shap.KernelExplainer(best_model.predict_proba, X_background)
            shap_values = explainer.shap_values(X_test_df, nsamples=100)
            if isinstance(shap_values, list):
                shap_values_class1 = shap_values[1]
            elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
                shap_values_class1 = shap_values[:, :, 1]
            else:
                shap_values_class1 = shap_values
            print("KernelExplainer 사용")
            
    # Summary Plot
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values_class1, X_test_df,
                      feature_names=feature_names, show=False, max_display=20)
    plt.tight_layout()
    plt.savefig(os.path.join(shap_dir, f'shap_{best_model_name}_summary.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # Bar Plot
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values_class1, X_test_df,
                      feature_names=feature_names, plot_type='bar', show=False, max_display=20)
    plt.tight_layout()
    plt.savefig(os.path.join(shap_dir, f'shap_{best_model_name}_bar.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # Feature Importance CSV
    mean_abs_shap = np.abs(shap_values_class1).mean(axis=0)
    pd.DataFrame({
        'feature': feature_names,
        'mean_abs_shap': mean_abs_shap
    }).sort_values('mean_abs_shap', ascending=False).to_csv(
        os.path.join(shap_dir, f'shap_{best_model_name}_importance.csv'), index=False
    )

    print(f"완료: {ratio} - {best_model_name} SHAP 저장 -> {shap_dir}")


Best 모델: LogisticRegression (AUC: 0.8310)


Estimating transforms:   0%|          | 0/1000 [00:00<?, ?it/s]

LinearExplainer 사용
완료: 5x - LogisticRegression SHAP 저장 -> ../result/FTO_Final/5x_w3D/SHAP

Best 모델: GradientBoostingClassifier (AUC: 0.8374)
TreeExplainer 사용
완료: 10x - GradientBoostingClassifier SHAP 저장 -> ../result/FTO_Final/10x_w3D/SHAP
