In [2]:
# 1. 필요한 라이브러리
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,
                             accuracy_score, precision_score, recall_score, f1_score,
                             make_scorer, precision_recall_curve, average_precision_score, roc_curve)
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# 2. 데이터 로드 및 이직률 인코딩
df = pd.read_csv("../data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df['Attrition'] = LabelEncoder().fit_transform(df['Attrition'])

# 3. 주요 13개 변수만 선택 (상관계수 기반, 이직률 제외)
selected_features = [
    'Age', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'OverTime',
    'StockOptionLevel', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsWithCurrManager'
]


# 4. 범주형 인코딩
df['OverTime'] = LabelEncoder().fit_transform(df['OverTime'])
df['MaritalStatus'] = LabelEncoder().fit_transform(df['MaritalStatus'])

# 5. X, y 분리
X = df[selected_features]
y = df['Attrition']

# 6. 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 7. SMOTE 처리
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)


# 8. 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# 9. KNN 모델 정의 및 학습
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# 10. 예측
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# print("\n📄 Classification Report:")
# print(classification_report(y_test, y_pred))

# 11. 교차검증 성능 계산
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorers = {
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    #'roc_auc': make_scorer(roc_auc_score, needs_proba=True)
    'roc_auc': make_scorer(roc_auc_score)
}
cv_results = {metric: cross_val_score(model, X_train, y_train, cv=cv, scoring=scorer).mean()
              for metric, scorer in scorers.items()}
cv_results['f1(Test)'] = f1_score(y_test, y_pred)

# 12. 출력
print("📊 KNN 교차검증 + 테스트셋 성능:\n")
for metric, score in cv_results.items():
    print(f"{metric:<10}: {score:.4f}")

📊 KNN 교차검증 + 테스트셋 성능:

f1        : 0.8618
accuracy  : 0.8463
precision : 0.7855
recall    : 0.9554
roc_auc   : 0.8463
f1(Test)  : 0.8613


# 하이퍼파라미터 튜닝 (precision, recall, f1)

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# 🔧 KNN 하이퍼파라미터 후보 정의
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9],         # K 값
    'model__weights': ['uniform', 'distance'], # 가중치 방식
    'model__p': [1, 2]                          # 거리 측정 방식 (1: 맨해튼, 2: 유클리디안)
}

# ⚙️ 파이프라인: SMOTE → KNN 분류기
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),                # 클래스 불균형 보정
    ('model', KNeighborsClassifier())                 # KNN 분류기
])

# 📊 다양한 평가지표 정의
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 📂 결과 저장용 딕셔너리
results = {}

# 🔁 precision, recall, f1 기준별로 각각 튜닝
for metric in ['precision', 'recall', 'f1']:
    grid = GridSearchCV(
        estimator=pipeline,           # 전체 파이프라인 탐색
        param_grid=param_grid,        # KNN 하이퍼파라미터
        scoring=scoring,              # 다양한 지표
        refit=metric,                 # 기준 metric에 따라 최적화
        cv=5,
        verbose=0,
        n_jobs=-1,
        return_train_score=True
    )

    # 🏋️ 하이퍼파라미터 탐색 수행
    grid.fit(X_train, y_train)

    # ✅ 최적 파라미터 및 지표 저장
    best_index = grid.best_index_
    results[metric] = {
        'best_params': grid.best_params_,
        'scores': {
            'Precision': grid.cv_results_['mean_test_precision'][best_index],
            'Recall': grid.cv_results_['mean_test_recall'][best_index],
            'F1 Score': grid.cv_results_['mean_test_f1'][best_index],
            'Accuracy': grid.cv_results_['mean_test_accuracy'][best_index],
            'ROC AUC': grid.cv_results_['mean_test_roc_auc'][best_index]
        }
    }

# 📢 결과 출력
for metric, content in results.items():
    print(f"\n🎯 [기준: {metric.upper()}]")
    print("Best Parameters:", content['best_params'])
    for score_name, val in content['scores'].items():
        print(f"{score_name:12}: {val:.4f}")



🎯 [기준: PRECISION]
Best Parameters: {'model__n_neighbors': 3, 'model__p': 1, 'model__weights': 'distance'}
Precision   : 0.8404
Recall      : 0.9736
F1 Score    : 0.9017
Accuracy    : 0.8935
ROC AUC     : 0.8935

🎯 [기준: RECALL]
Best Parameters: {'model__n_neighbors': 3, 'model__p': 2, 'model__weights': 'distance'}
Precision   : 0.8113
Recall      : 0.9797
F1 Score    : 0.8871
Accuracy    : 0.8748
ROC AUC     : 0.8748

🎯 [기준: F1]
Best Parameters: {'model__n_neighbors': 3, 'model__p': 1, 'model__weights': 'distance'}
Precision   : 0.8404
Recall      : 0.9736
F1 Score    : 0.9017
Accuracy    : 0.8935
ROC AUC     : 0.8935
