In [13]:
%pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
%pip install imbalanced-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install category_encoders imbalanced-learn xgboost lightgbm catboost


Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier

# 1. 데이터 불러오기 및 전처리
df = pd.read_csv('data/Employee_Attrition.csv')
df['Attrition_n'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# 사용할 컬럼
cols = [
    'OverTime', 'TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole',
    'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel',
    'YearsAtCompany', 'JobInvolvement', 'DistanceFromHome',
    'JobSatisfaction', 'EnvironmentSatisfaction', 'Gender', 'MaritalStatus',
    'JobRole', 'Department', 'BusinessTravel', 'EducationField',
    'Education', 'WorkLifeBalance', 'PerformanceRating'
]
df = df[cols + ['Attrition_n']].dropna()
df_encoded = pd.get_dummies(df, drop_first=True)

# 데이터 분할 (누수 방지)
X_full = df_encoded.drop(['Attrition_n'], axis=1)
y_full = df_encoded['Attrition_n']

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, stratify=y_full, test_size=0.2, random_state=42)

# 클러스터링은 학습 데이터에서만 수행
leavers_train = X_train[y_train == 1].copy()
scaler = StandardScaler()
leavers_train_scaled = scaler.fit_transform(leavers_train)

kmeans = KMeans(n_clusters=2, random_state=42)
leaver_clusters = kmeans.fit_predict(leavers_train_scaled)

# 클러스터 결과를 원본 X_train에 병합
X_train_clustered = X_train.copy()
X_train_clustered.loc[y_train == 1, 'leaver_cluster'] = leaver_clusters
X_train_clustered['leaver_cluster'] = X_train_clustered['leaver_cluster'].fillna(0)

# X_test에도 동일한 컬럼 추가 (테스트셋에는 클러스터 정보 제공 X)
X_test_clustered = X_test.copy()
X_test_clustered['leaver_cluster'] = 0  # 중립 값으로 처리

# 스케일링
scaler_final = StandardScaler()
X_train_scaled = scaler_final.fit_transform(X_train_clustered)
X_test_scaled = scaler_final.transform(X_test_clustered)

# 모델 훈련 및 예측
model = LGBMClassifier(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# 평가
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy : {acc:.4f}")
print(f"F1-Score : {f1:.4f}")


[LightGBM] [Info] Number of positive: 190, number of negative: 986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 542
[LightGBM] [Info] Number of data points in the train set: 1176, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.161565 -> initscore=-1.646632
[LightGBM] [Info] Start training from score -1.646632
Accuracy : 0.8367
F1-Score : 0.0000




In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

# 📂 데이터 불러오기 및 전처리
df = pd.read_csv('data/Employee_Attrition.csv')
df['Attrition_n'] = df['Attrition'].map({'Yes': 1, 'No': 0})

cols = [
    'OverTime', 'TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole',
    'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel',
    'YearsAtCompany', 'JobInvolvement', 'DistanceFromHome',
    'JobSatisfaction', 'EnvironmentSatisfaction', 'Gender', 'MaritalStatus',
    'JobRole', 'Department', 'BusinessTravel', 'EducationField',
    'Education', 'WorkLifeBalance', 'PerformanceRating'
]
df = df[cols + ['Attrition_n']].dropna()
df_encoded = pd.get_dummies(df, drop_first=True)

X = df_encoded.drop("Attrition_n", axis=1)
y = df_encoded["Attrition_n"]

# 📦 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 🔧 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ⚖️ SMOTE 적용 (이직자 수 증가)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 🚀 모델 학습
model = LGBMClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# 🎯 예측 및 평가
y_pred = model.predict(X_test_scaled)

print("\n📊 이진 분류 성능 평가 (SMOTE 적용)")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-Score : {f1_score(y_test, y_pred):.4f}")



[LightGBM] [Info] Number of positive: 986, number of negative: 986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3932
[LightGBM] [Info] Number of data points in the train set: 1972, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy : 0.8401360544217688
F1-Score : 0.11320754716981132


