In [1]:
import shutil

import mlflow
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from mlflow.models import infer_signature
from mlflow.sklearn import save_model
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Data loading
#   1. Read the dataset from disk.
#   2. The data is used for customer churn prediction.
#   3. The 'Exited' column is the prediction target.
data = pd.read_csv("../data/churn.csv")

# Point MLflow at the tracking server. Per the original author's note, an
# environment variable is already configured; the URI is set here explicitly
# to be certain.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [2]:
# Separate the target (y) from the features (X); the target and the three
# columns listed alongside it are dropped from the feature frame.
y = data["Exited"]
X = data.drop(columns=["Exited", "RowNumber", "CustomerId", "Surname"])

In [3]:
# Column groups consumed by the preprocessing step: categorical vs. numeric features.
categorical_features = ["Geography", "Gender"]
numeric_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
]

In [4]:
# Preprocessing pipeline: standardize the numeric columns and one-hot encode
# the categorical columns. The transformer order fixes the output column order.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(), categorical_features),
    ]
)

In [5]:
# Run the preprocessing on the feature frame.
# NOTE(review): the transformers are fitted on every row of X here, i.e. on
# the full dataset rather than on a training subset.
preprocessor.fit(X)
X_processed = preprocessor.transform(X)

In [6]:
# Split into training and test sets (80/20, fixed seed).
# The raw features are split first and the preprocessing is then fitted on the
# training rows only, so that test-set statistics (scaling mean/std, category
# vocabulary) cannot leak into the features the model is trained on.
# This re-fits `preprocessor`; after this cell it holds training-only statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

In [7]:
# Select the MLflow experiment that subsequent runs are recorded under.
mlflow.set_experiment(experiment_name="practice3")

2025/10/10 14:24:34 INFO mlflow.tracking.fluent: Experiment with name 'practice3' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///C:/Users/SSAFY/Desktop/TIL/100_offline/8. DA/82일차 '
 '(10.10)/data_science2_ws_7_3/mlflow_project/src/mlruns/1'), creation_time=1760073874799, experiment_id='1', last_update_time=1760073874799, lifecycle_stage='active', name='practice3', tags={}>

In [None]:
# Train and evaluate an XGBoost classifier inside a single MLflow run, then
# store the model both on the tracking server and in a local directory.
n_estimators = 100  # used for both the model and the logged parameter
local_model_path = "./my_model"

with mlflow.start_run(run_name="XGBoost"):
    model = xgb.XGBClassifier(n_estimators=n_estimators, eval_metric='logloss')
    model.fit(X_train, y_train)  # train the model
    preds = model.predict(X_test)  # hard class labels
    # Score for the second class (column 1 of predict_proba); ROC AUC is a
    # ranking metric and needs continuous scores, not thresholded labels.
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    acc = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, positive_proba)

    # Log hyperparameters and evaluation metrics to MLflow
    mlflow.log_params({'model': 'XGBoost', 'n_estimators': n_estimators})
    mlflow.log_metrics({
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc
    })

    # Infer the model's input/output schema from a matching input/output pair
    # (the predictions above were produced from X_test).
    signature = infer_signature(X_test, preds)

    # Store the model on the tracking server
    mlflow.xgboost.log_model(model, "xgboost_model", signature=signature, model_format='json')

    # Store the model locally. save_model refuses to write into an existing,
    # non-empty directory, so remove the output of a previous execution first
    # to keep this cell re-runnable.
    shutil.rmtree(local_model_path, ignore_errors=True)
    save_model(sk_model=model, path=local_model_path)


🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/1/runs/fd3af9363424402087e9bc315c708b29
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [10]:
# Reload the locally saved model through the generic pyfunc interface and
# predict on the test set again.
loaded_model = mlflow.pyfunc.load_model(model_uri="./my_model")
preds = loaded_model.predict(data=X_test)

# (Optional) To load the model from the tracking server instead:
# model_uri = "runs:/ba0aa44aec5b41f98d24e052de1bd1e1/xgboost_model"
# model = mlflow.xgboost.load_model(model_uri)
# preds = model.predict(X_test)

In [11]:
# Show the first 20 predictions produced by the reloaded model
print("Loaded model predictions:", preds[:20])

# Recompute the evaluation metrics from the reloaded model's predictions.
# Every metric here, including ROC AUC, is computed from `preds` as-is.
acc, precision, recall, f1, roc_auc = (
    metric(y_test, preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
)

# Re-check the metrics (optional)
for caption, value in (
    ("Accuracy (loaded model):", acc),
    ("precision (loaded model):", precision),
    ("recall (loaded model):", recall),
    ("f1 (loaded model):", f1),
    ("roc_auc (loaded model):", roc_auc),
):
    print(caption, value)


Loaded model predictions: [0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
Accuracy (loaded model): 0.858
precision (loaded model): 0.693950177935943
recall (loaded model): 0.4961832061068702
f1 (loaded model): 0.5786350148367952
roc_auc (loaded model): 0.7213336690148539
