# DecisionTree

In [3]:
# 📌 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,
                             accuracy_score, precision_score, recall_score, f1_score,
                             make_scorer, precision_recall_curve, average_precision_score, roc_curve)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# 📌 데이터 로드
df = pd.read_csv("../data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df['Attrition'] = LabelEncoder().fit_transform(df['Attrition'])

# 📌 주요 변수 (상관계수 기준 13개)
selected_features = [
    'Age', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'OverTime',
    'StockOptionLevel', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsWithCurrManager'
]

# 📌 범주형 변수 인코딩
df['OverTime'] = LabelEncoder().fit_transform(df['OverTime'])
df['MaritalStatus'] = LabelEncoder().fit_transform(df['MaritalStatus'])

# 📌 X, y 설정
X = df[selected_features]
y = df['Attrition']

# 📌 X, y 설정
X = df[selected_features]
y = df['Attrition']

# 📌 스케일링 (SMOTE 전에)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 📌 SMOTE 적용
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 📌 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# 📌 Decision Tree 모델 학습
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 📌 예측
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [4]:
# 📌 교차검증 평가 지표 계산
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorers = {
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    #'roc_auc': make_scorer(roc_auc_score, needs_proba=True)
    'roc_auc': make_scorer(roc_auc_score)
}
cv_results = {
    metric: cross_val_score(model, X_train, y_train, cv=cv, scoring=scorer).mean()
    for metric, scorer in scorers.items()
}
cv_results['f1(Test)'] = f1_score(y_test, y_pred)

# 📌 평가지표 출력
print("\n📊 DecisionTree 교차검증 + 테스트셋 성능:")
for metric, score in cv_results.items():
    print(f"{metric:<10}: {score:.4f}")



📊 DecisionTree 교차검증 + 테스트셋 성능:
f1        : 0.8399
accuracy  : 0.8372
precision : 0.8268
recall    : 0.8539
roc_auc   : 0.8372
f1(Test)  : 0.8538


In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# 🔧 하이퍼파라미터 후보 정의
param_grid = {
    'model__max_depth': [3, 5, 10, None],           # 트리 최대 깊이
    'model__min_samples_split': [2, 5, 10],         # 노드를 나누기 위한 최소 샘플 수
    'model__min_samples_leaf': [1, 2, 4]            # 리프 노드가 되기 위한 최소 샘플 수
}

# ⚙️ 파이프라인 정의: SMOTE → Decision Tree
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),                      # 클래스 불균형 보정
    ('model', DecisionTreeClassifier(random_state=42))      # 결정 트리 모델
])

# 📊 다양한 평가지표 정의
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 📂 결과 저장용 딕셔너리
results = {}

# 🔁 precision, recall, f1 기준별로 각각 튜닝
for metric in ['precision', 'recall', 'f1']:
    grid = GridSearchCV(
        estimator=pipeline,           # 파이프라인 전체를 탐색
        param_grid=param_grid,        # 트리 하이퍼파라미터
        scoring=scoring,              # 다양한 평가 지표
        refit=metric,                 # metric 기준으로 최적 모델 선택
        cv=5,                         # 5-Fold 교차검증
        verbose=0,
        n_jobs=-1,
        return_train_score=True
    )

    # 🏋️ 그리드 서치 실행
    grid.fit(X_train, y_train)

    # ✅ 최적 파라미터 및 평가 결과 저장
    best_index = grid.best_index_
    results[metric] = {
        'best_params': grid.best_params_,
        'scores': {
            'Precision': grid.cv_results_['mean_test_precision'][best_index],
            'Recall': grid.cv_results_['mean_test_recall'][best_index],
            'F1 Score': grid.cv_results_['mean_test_f1'][best_index],
            'Accuracy': grid.cv_results_['mean_test_accuracy'][best_index],
            'ROC AUC': grid.cv_results_['mean_test_roc_auc'][best_index]
        }
    }

# 📢 모든 결과 출력
for metric, content in results.items():
    print(f"\n🎯 [기준: {metric.upper()}]")
    print("Best Parameters:", content['best_params'])
    for score_name, val in content['scores'].items():
        print(f"{score_name:12}: {val:.4f}")



🎯 [기준: PRECISION]
Best Parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2}
Precision   : 0.8600
Recall      : 0.7850
F1 Score    : 0.8207
Accuracy    : 0.8286
ROC AUC     : 0.8286

🎯 [기준: RECALL]
Best Parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Precision   : 0.8232
Recall      : 0.8479
F1 Score    : 0.8349
Accuracy    : 0.8326
ROC AUC     : 0.8327

🎯 [기준: F1]
Best Parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Precision   : 0.8376
Recall      : 0.8418
F1 Score    : 0.8391
Accuracy    : 0.8387
ROC AUC     : 0.8387


🔍 주요 하이퍼파라미터 설명
| 파라미터                | 설명                   |
| ------------------- | -------------------- |
| `max_depth`         | 트리 깊이를 제한하여 과적합 방지   |
| `min_samples_split` | 분할을 위해 필요한 최소 샘플 수   |
| `min_samples_leaf`  | 리프 노드가 되기 위한 최소 샘플 수 |
