# XGBoost

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
# 📌 데이터 로드
#df = pd.read_csv("./original_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
# 2. 🧹 전처리: 이직률(Attrition) 라벨 인코딩 (Yes/No → 1/0)
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# 3. 🧼 범주형 → 원핫인코딩 (drop_first=True로 다중공선성 방지)
df = pd.get_dummies(df, drop_first=True)

# 4. ✅ 주요 13개 특성 (이직률은 제외)
selected_columns = [
    'Age', 'JobLevel', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'NumCompaniesWorked', 'OverTime_Yes',
    'JobRole_Research Scientist', 'EducationField_Life Sciences'  # 컬럼명은 본인 데이터에 맞게 조정
]

X = df[selected_columns]
y = df['Attrition']  # 타겟값

# 5. ✂️ 학습 / 테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6. 📏 평가지표 정의
scoring = {
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 7. 🔧 하이퍼파라미터 후보
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.1, 0.3],
    'model__subsample': [0.8, 1.0]
}

# 8. 🔁 파이프라인 (SMOTE + XGBoost)
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# 9. 🧠 GridSearchCV (기준: f1-score)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# 10. 📚 모델 학습
grid.fit(X_train, y_train)

# 11. ✅ 결과 출력
best_index = grid.best_index_
print("\n📌 Best Parameters:", grid.best_params_)
print("\n📊 XGBoost Cross-Validation Scores:")
print(f"F1 Score     : {grid.cv_results_['mean_test_f1'][best_index]:.4f}")
print(f"Accuracy     : {grid.cv_results_['mean_test_accuracy'][best_index]:.4f}")
print(f"Precision    : {grid.cv_results_['mean_test_precision'][best_index]:.4f}")
print(f"Recall       : {grid.cv_results_['mean_test_recall'][best_index]:.4f}")
print(f"ROC AUC      : {grid.cv_results_['mean_test_roc_auc'][best_index]:.4f}")



Fitting 5 folds for each of 16 candidates, totalling 80 fits

📌 Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}

📊 XGBoost Cross-Validation Scores:
F1 Score     : 0.4257
Accuracy     : 0.7789
Precision    : 0.3663
Recall       : 0.5105
ROC AUC      : 0.6706


In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pandas as pd

# X, y는 13개 주요 변수(X), 타겟(y)로 구성된 데이터셋
# 예시: X = df.drop(columns='Attrition'), y = df['Attrition']

# 하이퍼파라미터 후보 정의
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.01, 0.1]
}

# 파이프라인 정의
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# 평가지표 정의
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# 결과 저장용 딕셔너리
results = {}

# 각 기준별로 반복
for metric in ['precision', 'recall', 'f1']:
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=scoring,
        refit=metric,             # 기준 지표로 최적 모델 선택
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X, y)

    best_index = grid.best_index_
    results[metric] = {
        'best_params': grid.best_params_,
        'scores': {
            'F1 Score': grid.cv_results_['mean_test_f1'][best_index],
            'Accuracy': grid.cv_results_['mean_test_accuracy'][best_index],
            'Precision': grid.cv_results_['mean_test_precision'][best_index],
            'Recall': grid.cv_results_['mean_test_recall'][best_index],
            'ROC AUC': grid.cv_results_['mean_test_roc_auc'][best_index]
        }
    }

# 결과 출력
for metric, res in results.items():
    print(f"\n[최적 평가 기준: {metric.upper()}]")
    print("Best Params:", res['best_params'])
    for score_name, value in res['scores'].items():
        print(f"{score_name:10}: {value:.4f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits

[최적 평가 기준: PRECISION]
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}
F1 Score  : 0.3906
Accuracy  : 0.7796
Precision : 0.3527
Recall    : 0.4387
ROC AUC   : 0.6419

[최적 평가 기준: RECALL]
Best Params: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 200}
F1 Score  : 0.3770
Accuracy  : 0.7197
Precision : 0.2933
Recall    : 0.5319
ROC AUC   : 0.6439

[최적 평가 기준: F1]
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
F1 Score  : 0.4084
Accuracy  : 0.7524
Precision : 0.3325
Recall    : 0.5318
ROC AUC   : 0.6633
