In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

mlflow.set_tracking_uri("http://127.0.0.1:5000")

# 1. 데이터 로드
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [21]:
# 2. 결측값 처리
# 각 컬럼별로 적절한 방식으로 결측값을 채움
for df in [train_df, test_df]:
    df.fillna({
        'Gender': 'Unknown',
        'Married': 'Unknown',
        'Self_Employed': 'Unknown',
        'LoanAmount': df['LoanAmount'].mean(),
        'Loan_Amount_Term': df['Loan_Amount_Term'].median(),
        'Credit_History': df['Credit_History'].mode()[0]
    }, inplace=True)


In [22]:
# 3. 범주형 변수 라벨 인코딩
cat_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
le = LabelEncoder()
for col in cat_cols:
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])  # 동일한 인코더로 변환

In [23]:
# 4. 종속 변수 인코딩 ('Y' → 1, 'N' → 0)
train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y': 1, 'N': 0})

In [24]:
# 5. 특성과 레이블 정의
features = [
    'Gender', 'Married', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome',
    'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area'
]

X = train_df[features]
y = train_df['Loan_Status']


In [25]:
# 6. 학습/검증 데이터 분할 (stratify로 클래스 비율 유지)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [26]:
# 7. 피처 스케일링 (LogisticRegression에 필요)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [27]:
# 스케일링 후 다시 DataFrame으로 변환 (MLflow input_example용)
X_train = pd.DataFrame(X_train, columns=features)
X_val = pd.DataFrame(X_val, columns=features)


In [28]:
# 8. MLflow 실험 설정
mlflow.set_experiment("Loan_Prediction_Experiment")

2025/10/01 17:15:09 INFO mlflow.tracking.fluent: Experiment with name 'Loan_Prediction_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///C:/Users/SSAFY/Desktop/TIL/100_offline/8. DA/73일차 '
 '(10.1)/data_science2_hw_6_4/mlruns/1'), creation_time=1759306509166, experiment_id='1', last_update_time=1759306509166, lifecycle_stage='active', name='Loan_Prediction_Experiment', tags={}>

In [29]:
# 9. 실험에 사용할 모델 정의 (모델명, 클래스, 하이퍼파라미터 목록)
models = {
    "RandomForest": {
        "class": RandomForestClassifier,
        "params_list": [
            {"max_depth": 5, "n_estimators": 100},
            {"max_depth": 7, "n_estimators": 300},
        ]
    },
    "LogisticRegression": {
        "class": LogisticRegression,
        "params_list": [
            {"C": 1.0, "max_iter": 100},
            {"C": 0.5, "max_iter": 300},
        ]
    }
}


In [31]:
# 10. 모델별 실험 루프
for model_name, model_info in models.items():
    for params in model_info["params_list"]:
        with mlflow.start_run():
            # 모델 학습
            ModelClass = model_info["class"]
            model = ModelClass(**params)
            model.fit(X_train, y_train)

            # 예측 및 평가
            preds = model.predict(X_val)
            acc = accuracy_score(y_val, preds)
            f1 = f1_score(y_val, preds)
            try:
                proba = model.predict_proba(X_val)[:, 1]
                auc = roc_auc_score(y_val, proba)
            except:
                auc = None  # 일부 모델은 predict_proba 없음

            # MLflow에 실험 결과 기록
            mlflow.set_tag("model_name", model_name)
            for k, v in params.items():
                mlflow.log_param(k, v)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("f1_score", f1)
            if auc is not None:
                mlflow.log_metric("roc_auc", auc)

            # 모델 서명 및 예제 입력 포함하여 저장
            signature = infer_signature(X_train, preds)
            mlflow.sklearn.log_model(
                model,
                artifact_path="model",
                signature=signature,
                input_example=X_train.iloc[:1]
            )

            print(f"[{model_name}] Params={params} → ACC={acc:.4f}, F1={f1:.4f}, AUC={auc if auc else 'N/A'}")


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 4407.77it/s]


[RandomForest] Params={'max_depth': 5, 'n_estimators': 100} → ACC=0.8537, F1=0.9032, AUC=0.7981424148606812
🏃 View run grandiose-eel-28 at: http://127.0.0.1:5000/#/experiments/1/runs/2ebbebd1f4a1494a9250b9f66d4dd558
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2333.13it/s]


[RandomForest] Params={'max_depth': 7, 'n_estimators': 300} → ACC=0.8618, F1=0.9081, AUC=0.8068111455108359
🏃 View run nervous-rat-575 at: http://127.0.0.1:5000/#/experiments/1/runs/b2300b50f99540b2b03df8fd95024e24
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 3497.34it/s]


[LogisticRegression] Params={'C': 1.0, 'max_iter': 100} → ACC=0.8618, F1=0.9081, AUC=0.8148606811145511
🏃 View run funny-bat-554 at: http://127.0.0.1:5000/#/experiments/1/runs/7fb6c3726968417693997ea87f10cb02
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 909.77it/s] 

[LogisticRegression] Params={'C': 0.5, 'max_iter': 300} → ACC=0.8618, F1=0.9081, AUC=0.8145510835913313
🏃 View run fortunate-sponge-17 at: http://127.0.0.1:5000/#/experiments/1/runs/96aba73cf4d74eee89f4cf70150b39a3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1



