In [1]:
import pandas as pd
import xgboost as xgb
import mlflow
import mlflow.xgboost
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from mlflow.models import infer_signature

# Load the dataset from its path relative to the notebook.
data = pd.read_csv(filepath_or_buffer='../data/dataset.csv')

# Configure the MLflow tracking server. An environment variable was already
# set for this, but the URI is set explicitly here to be certain.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [2]:
# Separate the target column ('Default') from the feature columns.
y = data['Default']
X = data.drop(columns=['Default'])

In [3]:
# Column groups used by the preprocessing step: categorical columns are
# one-hot encoded and numerical columns are scaled.
categorical_cols = [
    'State',
    'BankState',
    'NewExist',
    'UrbanRural',
    'RealEstate',
]
numerical_cols = [
    'DisbursementGross',
    'GrAppv',
    'daysterm',
]

In [4]:
# Build the preprocessing transformer: the 'num' step standard-scales the
# numerical columns and the 'cat' step one-hot encodes the categorical
# columns (categories unseen at fit time are ignored at transform time).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
)

In [5]:
# Train/test split followed by preprocessing.
#
# The raw features are split BEFORE the preprocessor is fitted so that the
# scaler statistics and the one-hot categories are learned from the training
# rows only. Fitting on the full dataset first would leak information from
# the test rows into the features the model is evaluated on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split only, then apply the fitted transform to both.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

In [7]:
# Train the XGBoost classifier, using log loss as the evaluation metric.
xgb_classifier_cls = xgb.XGBClassifier
model = xgb_classifier_cls(**{'eval_metric': 'logloss'})
model.fit(X=X_train, y=y_train)

In [9]:
# 2. Log the trained model with MLflow and register it in the Model Registry
mlflow.set_experiment("assignment2")

with mlflow.start_run(run_name="XGBoost"):

    # Record the model hyperparameters
    mlflow.log_params(model.get_params())

    # Predict on the test split: hard labels plus the probability of the
    # positive class (column 1 of predict_proba)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Compute the evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Record every computed metric on the run, not only ROC-AUC, so runs can
    # be compared on each of them. The "roc_auc" key is kept unchanged.
    mlflow.log_metrics({
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
    })

    # Log the model and register it in the Model Registry. The signature is
    # inferred from X_test together with y_pred so that the example inputs
    # and outputs come from the same rows.
    signature = infer_signature(X_test, y_pred)
    mlflow.xgboost.log_model(model, "model", registered_model_name="LoanDefaultModel", signature=signature, input_example=X_test[:1])

    print(f"Î™®Îç∏ Îì±Î°ù ÏôÑÎ£å. ROC-AUC: {roc_auc:.4f}")

  self.get_booster().save_model(fname)
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 615.32it/s]
Successfully registered model 'LoanDefaultModel'.
2025/10/10 17:23:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LoanDefaultModel, version 1


Î™®Îç∏ Îì±Î°ù ÏôÑÎ£å. ROC-AUC: 0.9718
🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/1/runs/973faaf29d0b4a269b954397ef42b986
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'LoanDefaultModel'.
