In [15]:
import pandas as pd
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from mlflow.models import infer_signature

# Data loading:
#   - the CSV holds the data used for customer churn prediction
#   - the 'Exited' column is the prediction target
data = pd.read_csv(filepath_or_buffer='../data/churn.csv')

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [16]:
# Separate the target (y) from the features (X); the target and the
# columns treated as unnecessary are dropped from the feature frame
y = data['Exited']
X = data.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])

In [17]:
# Column names grouped by type: categorical vs. numeric features
categorical_features = [
    'Geography',
    'Gender',
]
numeric_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

In [18]:
# Preprocessing pipeline: each step is a (name, transformer, columns) triple
numeric_step = ('num', StandardScaler(), numeric_features)  # standardize numeric columns
categorical_step = ('cat', OneHotEncoder(), categorical_features)  # one-hot encode categorical columns
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [19]:
# 데이터 전처리 수행
X_processed = preprocessor.fit_transform(X)

In [20]:
# 데이터셋 학습용과 테스트용으로 분할
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [21]:
# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment(experiment_name="practice1")

2025/10/10 11:32:52 INFO mlflow.tracking.fluent: Experiment with name 'practice1' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///C:/Users/SSAFY/Desktop/TIL/100_offline/8. DA/82일차 '
 '(10.10)/data_science2_ws_7_1/mlflow_project/mlruns/1'), creation_time=1760063572581, experiment_id='1', last_update_time=1760063572581, lifecycle_stage='active', name='practice1', tags={}>

In [22]:
# Train and evaluate a random forest, recording everything in one MLflow run
with mlflow.start_run(run_name="RandomForest"):
    # Single source of truth for hyperparameters: the same dict builds the
    # model and is logged, so the logged values cannot drift from those used.
    rf_params = {'n_estimators': 100, 'random_state': 42}
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)  # train the model
    preds = model.predict(X_test)  # hard class predictions
    # Probability of the positive class (second column, label 1).
    # ROC AUC is a ranking metric and needs scores, not thresholded labels.
    pos_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    acc = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, pos_proba)

    # Log hyperparameters and metrics to MLflow
    mlflow.log_params({'model': 'RandomForest', **rf_params})

    mlflow.log_metrics({
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc
    })

    # Infer the input/output schema from the inputs that actually produced
    # `preds`, then store the model in MLflow together with that signature
    signature = infer_signature(X_test, preds)
    mlflow.sklearn.log_model(model, "random_forest_model", signature=signature)

🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/1/runs/d5b6cd6832e34a5881d28608db7db3b5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [23]:
# Train and evaluate an XGBoost classifier, recording everything in one MLflow run
with mlflow.start_run(run_name="XGBoost"):
    # Single source of truth for hyperparameters: the same dict builds the
    # model and is logged. random_state is set explicitly for consistency
    # with the random forest run.
    xgb_params = {'n_estimators': 100, 'eval_metric': 'logloss', 'random_state': 42}
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)  # train the model
    preds = model.predict(X_test)  # hard class predictions
    # Probability of the positive class (second column, label 1).
    # ROC AUC is a ranking metric and needs scores, not thresholded labels.
    pos_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    acc = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, pos_proba)

    # Log hyperparameters and metrics to MLflow
    mlflow.log_params({'model': 'XGBoost', **xgb_params})
    mlflow.log_metrics({
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc
    })

    # Infer the input/output schema from the inputs that actually produced
    # `preds`, then store the model in MLflow together with that signature
    signature = infer_signature(X_test, preds)
    mlflow.xgboost.log_model(model, "xgboost_model", signature=signature, model_format='json')

🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/1/runs/ff603a19bb9f4fbf93b3bab7909a7180
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
