In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# 📌 데이터 로드
df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# 🧹 전처리: 이직률(Attrition) 라벨 인코딩
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# 🧼 범주형 → 원핫인코딩
df = pd.get_dummies(df, drop_first=True)

# ✅ 주요 13개 특성만 선택 (이직률 제외)
selected_columns = [
    'Age', 'JobLevel', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'NumCompaniesWorked', 'OverTime_Yes',
    'JobRole_Research Scientist', 'EducationField_Life Sciences'
]

X = df[selected_columns]
y = df['Attrition']

# ✂️ 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 📏 평가지표 정의
scoring = {
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 🔧 하이퍼파라미터 후보 (SVC는 확률예측이 필요하므로 probability=True 필수)
param_grid = {
    'model__C': [0.1, 1, 10],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# 🔁 파이프라인: SMOTE + StandardScaler + SVC
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),  # SVM은 스케일링 필수
    ('model', SVC(probability=True, random_state=42))
])

# 🧠 GridSearchCV 실행 (f1-score 기준)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# 📚 모델 학습
grid.fit(X_train, y_train)

# ✅ 결과 출력
best_index = grid.best_index_
print("\n📌 Best Parameters:", grid.best_params_)
print("\n📊 SVC Cross-Validation Scores:")
print(f"F1 Score     : {grid.cv_results_['mean_test_f1'][best_index]:.4f}")
print(f"Accuracy     : {grid.cv_results_['mean_test_accuracy'][best_index]:.4f}")
print(f"Precision    : {grid.cv_results_['mean_test_precision'][best_index]:.4f}")
print(f"Recall       : {grid.cv_results_['mean_test_recall'][best_index]:.4f}")
print(f"ROC AUC      : {grid.cv_results_['mean_test_roc_auc'][best_index]:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits

📌 Best Parameters: {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'linear'}

📊 SVC Cross-Validation Scores:
F1 Score     : 0.4097
Accuracy     : 0.7041
Precision    : 0.3045
Recall       : 0.6316
ROC AUC      : 0.6748


In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# 🔧 SVC 하이퍼파라미터 후보 정의
param_grid = {
    'model__C': [0.1, 1, 10],                     # Regularization strength
    'model__kernel': ['linear', 'rbf'],          # 커널 종류
    'model__gamma': ['scale', 'auto']            # rbf 커널의 gamma
}

# ⚙️ 파이프라인 정의: SMOTE → SVC
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),           # 클래스 불균형 처리
    ('model', SVC(probability=True, random_state=42))  # SVC 모델
])

# 📊 평가지표 정의
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 📂 결과 저장용 딕셔너리
results = {}

# 🔁 각각의 기준(f1, recall, precision)에 대해 하이퍼파라미터 튜닝 수행
for metric in ['precision', 'recall', 'f1']:
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=scoring,
        refit=metric,
        cv=5,
        verbose=0,
        n_jobs=-1,
        return_train_score=True
    )

    # 🏋️ 하이퍼파라미터 탐색 실행
    grid.fit(X_train, y_train)

    # ✅ 최적 모델 정보 저장
    best_index = grid.best_index_
    results[metric] = {
        'best_params': grid.best_params_,
        'scores': {
            'Precision': grid.cv_results_['mean_test_precision'][best_index],
            'Recall': grid.cv_results_['mean_test_recall'][best_index],
            'F1 Score': grid.cv_results_['mean_test_f1'][best_index],
            'Accuracy': grid.cv_results_['mean_test_accuracy'][best_index],
            'ROC AUC': grid.cv_results_['mean_test_roc_auc'][best_index]
        }
    }

# 📤 결과 출력
for metric, content in results.items():
    print(f"\n🎯 [Refit 기준: {metric.upper()}]")
    print("Best Parameters:", content['best_params'])
    for score_name, val in content['scores'].items():
        print(f"{score_name:12}: {val:.4f}")



KeyboardInterrupt

