#### 노트북 수행에 필요한 세팅

In [None]:
!mkdir model_artifacts

In [None]:
# 다음 명령어는 터미널 창에서 입력!
# sudo add-apt-repository universe
# sudo apt update
# sudo apt install graphviz

In [None]:
!python3 -m pip install --upgrade pip
!pip install mlflow --quiet
!pip install pyngrok --quiet
!pip install graphviz --quiet
!pip install pydotplus --quiet

#### 분석용 패키지 임포트

In [None]:
import itertools
import warnings

import graphviz
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image
from mlflow import log_artifact, log_metric, log_param
from mlflow.models.signature import infer_signature
from pydotplus import graph_from_dot_data
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Apply the "fivethirtyeight" matplotlib style to the charts drawn below.
plt.style.use("fivethirtyeight")
# Register the pandas converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

#### Import Data

In [None]:
# Download the exercise data set and unzip it.
# wget -nc skips the download when the archive is already present;
# unzip -o overwrites previously extracted files without prompting.
!wget -nc "http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
!unzip -o "Bike-Sharing-Dataset.zip"

# Load the data into a pandas DataFrame.
bike_sharing = pd.read_csv("hour.csv")
bike_sharing

#### Data preprocessing

In [None]:
# Keep only the columns that are needed, then give the remaining ones
# descriptive names.
unused_columns = ["instant", "dteday", "registered", "casual"]
readable_column_names = {
    "yr": "year",
    "mnth": "month",
    "hr": "hour_of_day",
    "holiday": "is_holiday",
    "workingday": "is_workingday",
    "weathersit": "weather_situation",
    "temp": "temperature",
    "atemp": "feels_like_temperature",
    "hum": "humidity",
    "cnt": "rented_bikes",
}

bike_sharing = bike_sharing.drop(columns=unused_columns).rename(
    columns=readable_column_names
)

bike_sharing

#### Prepare training and test data sets

In [None]:
# Split into train/test sets (random 70:30 split, fixed seed).
# The target is the "rented_bikes" column; every other column is a feature.
X = bike_sharing.drop(columns="rented_bikes")
y = bike_sharing["rented_bikes"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=42
)

# len() gives the number of rows (samples); DataFrame.size would report
# rows * columns instead.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

#### Evaluation Metrics

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y, y_pred)``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


def rmse_score(y, y_pred):
    """Compute the RMSE of ``y_pred`` against ``y``, print it and return it.

    The value is printed with four decimal places.
    """
    value = rmse(y, y_pred)
    print(f"RMSE score: {value:.4f}")
    return value


def rmsle_cv(model, X_train, y_train):
    """Return the per-fold cross-validated RMSE of ``model``.

    Despite the name, the score is a plain RMSE (the square root of the
    negated ``neg_mean_squared_error`` score), not a logarithmic error.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X_train: Training features; ``X_train.values`` is what gets passed
            to ``cross_val_score``.
        y_train: Training target.

    Returns:
        Array with one RMSE value per fold (3 folds).
    """
    # Pass the splitter object itself. ``KFold(...).get_n_splits(...)`` only
    # returns the integer 3, which would silently drop ``shuffle=True`` and
    # ``random_state=42`` and evaluate on unshuffled folds.
    folds = KFold(n_splits=3, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model,
        X_train.values,
        y_train,
        scoring="neg_mean_squared_error",
        cv=folds,
    )
    return np.sqrt(-neg_mse_per_fold)


def rmse_cv_score(model, X_train, y_train):
    """Cross-validate ``model``, print mean/std of the RMSE, return the scores.

    Delegates the cross-validation to ``rmsle_cv`` and returns its result
    unchanged; mean and standard deviation are printed with four decimals.
    """
    fold_scores = rmsle_cv(model, X_train, y_train)
    mean_score = fold_scores.mean()
    std_score = fold_scores.std()
    print(
        f"Cross-Validation RMSE score: {mean_score:.4f} "
        f"(std = {std_score:.4f})"
    )
    return fold_scores

#### Feature Importance

In [None]:
def model_feature_importance(model):
    """Plot the feature importances of ``model`` and save the chart.

    Reads ``model.feature_importances_`` and labels the values with the
    columns of the notebook-level ``X_train``. The bar chart is written to
    ``model_artifacts/feature_importance.png``; the figure is left open, so
    the caller decides when to close it.
    """
    # One row per feature, most important first.
    ranked = pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    ).sort_values(by="Importance", ascending=False)

    # Horizontal bar chart: reset_index() exposes the feature names as the
    # "index" column used for the y axis.
    plt.figure(figsize=(12, 8))
    axes = sns.barplot(data=ranked.reset_index(), x="Importance", y="index")
    axes.set_title("Feature Importance")

    # Save the chart next to the other model artifacts.
    plt.savefig("model_artifacts/feature_importance.png", bbox_inches="tight")

#### Decision Tree Visualization

In [None]:
def model_tree_visualization(model):
    """Render the tree stored at ``model.estimators_[0, 0]``.

    The tree is exported to Graphviz DOT text (feature names taken from the
    notebook-level ``X_train``), saved as
    ``model_artifacts/Decision_Tree_Visualization.png`` and returned as a
    ``graphviz.Source`` object for display.
    """
    first_tree = model.estimators_[0, 0]
    export_options = {
        "label": "all",
        "feature_names": X_train.columns,
        "filled": True,
        "rounded": True,
        "proportion": True,
        "impurity": False,
        "precision": 1,
    }
    dot_source = tree.export_graphviz(
        decision_tree=first_tree, **export_options
    )

    # Persist the rendering as a PNG artifact.
    png_path = "model_artifacts/Decision_Tree_Visualization.png"
    graph_from_dot_data(dot_source).write_png(png_path)

    # Hand back a displayable object built from the same DOT text.
    return graphviz.Source(dot_source)

### MLflow Tracking

#### MLflow Logger

In [None]:
# Track parameters and metrics
def log_mlflow_run(model, signature):
    """Log tags, parameters, metrics, the model and charts to MLflow.

    Uses the MLflow fluent API, so everything is recorded on the active run.
    Besides its arguments, the function reads these notebook-level globals:
    ``X_train`` (feature names), ``parameters`` (names of the tuned
    hyperparameters), ``score`` (test RMSE) and ``score_cv`` (per-fold
    cross-validation RMSE).

    Args:
        model: Fitted estimator; must expose ``get_params()`` and
            ``train_score_``.
        signature: Model signature stored together with the logged model.
    """
    # Tag: estimator class name.
    mlflow.set_tag("estimator_name", model.__class__.__name__)

    # Tag: list of input features.
    mlflow.set_tag("features", str(X_train.columns.values.tolist()))

    # Parameters: only the hyperparameters that are part of the search grid.
    model_params = model.get_params()
    mlflow.log_params({key: model_params[key] for key in parameters})

    # Evaluation metrics.
    mlflow.log_metrics(
        {
            "RMSE_CV": score_cv.mean(),
            "RMSE": score,
        }
    )

    # Training loss per entry of ``train_score_``. An explicit step is
    # required: without it every value is logged at step 0 and the history
    # cannot be plotted as a curve.
    for step, train_loss in enumerate(model.train_score_):
        mlflow.log_metric("Train Loss", train_loss, step=step)

    # Store the model artifact.
    mlflow.sklearn.log_model(model, "model", signature=signature)

    # Upload the charts saved in the artifacts folder.
    mlflow.log_artifacts("model_artifacts")

#### Initialize MLflow experiment

In [None]:
experiment_name = "rented_bikes"

# Select the tracking store first, then the experiment: set_experiment()
# resolves (or creates) the experiment in whatever store is configured at
# the time it is called.
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment(experiment_name)

# 모델 아티팩트 저장 폴더 생성
# !mkdir -p "model_artifacts"

#### Start experiments tracking UI

In [None]:
# Start the MLflow tracking UI on port 5000; the trailing "&" runs it in the
# background so the notebook is not blocked.
mlflow_ui_command = "mlflow ui --port 5000 &"
get_ipython().system_raw(mlflow_ui_command)

### Model Training

In [None]:
# Model hyperparameters: each key maps to the list of values to try.
parameters = {
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": [4, 5, 6],
}

# Hyperparameter tuning, grid-search style: build one parameter dict per
# element of the Cartesian product of all candidate values. A value that is
# not a list is treated as a single candidate.
params_keys = parameters.keys()
params_values = [
    candidates if isinstance(candidates, list) else [candidates]
    for candidates in parameters.values()
]
runs_parameters = [
    dict(zip(params_keys, chosen_values))
    for chosen_values in itertools.product(*params_values)
]

runs_parameters

#### Training runs

In [None]:
# Training loop: one MLflow run per hyperparameter combination.
for i, run_parameters in enumerate(runs_parameters):
    print(f"Run {i}: {run_parameters}")

    # mlflow: close a run left open earlier (e.g. by an interrupted cell);
    # start_run() refuses to start while another run is active.
    if mlflow.active_run():
        mlflow.end_run()

    # mlflow: track the run. The context manager always ends the run, and
    # marks it as failed when the body raises, instead of leaving it open.
    with mlflow.start_run(run_name=f"Run {i}"):
        # create model instance
        model = GradientBoostingRegressor(**run_parameters)

        # train
        model.fit(X_train, y_train)

        # Evaluation scores. ``score`` and ``score_cv`` are read as globals
        # by log_mlflow_run(), so these names must stay as they are.
        score = rmse_score(y_test, model.predict(X_test))
        score_cv = rmse_cv_score(model, X_train, y_train)

        # Generate the charts that are uploaded as artifacts; close the
        # feature-importance figure once it has been saved.
        model_feature_importance(model)
        plt.close()
        model_tree_visualization(model)

        # get model signature
        signature = infer_signature(
            model_input=X_train, model_output=model.predict(X_train)
        )

        # mlflow: log tags, parameters, metrics, model and charts
        log_mlflow_run(model, signature)

    print("")

#### Best Model Results

In [None]:
# Fetch the single run with the lowest cross-validated RMSE.
best_run_df = mlflow.search_runs(
    max_results=1, order_by=["metrics.RMSE_CV ASC"]
)
if not len(best_run_df.index):
    raise Exception(f"Found no runs for experiment '{experiment_name}'")

# Load the model that was logged under the "model" artifact path of that run.
best_run_id = best_run_df.at[0, "run_id"]
best_run = mlflow.get_run(best_run_id)
best_model_uri = f"{best_run.info.artifact_uri}/model"
best_model = mlflow.sklearn.load_model(best_model_uri)

# Summary of the best run.
print("Best run info:")
print(f"Run id: {best_run.info.run_id}")
print(f"Run parameters: {best_run.data.params}")
print(f"Run score: RMSE_CV = {best_run.data.metrics['RMSE_CV']:.4f}")
print(f"Run model URI: {best_model_uri}")

### Model Serving


In [None]:
get_ipython().system_raw(
    f"mlflow models serve --model-uri {best_model_uri} \
        --host 0.0.0.0 --port 5001 --workers 1 --no-conda &"
)
!sleep 5

!curl --silent --show-error 'http://localhost:5001/invocations' \
    -H 'Content-Type: application/json' \
    -d '{"dataframe_split":{"columns": ["season", "year", "month",\
        "hour_of_day", "is_holiday", "weekday", "is_workingday", \
        "weather_situation", "temperature", "feels_like_temperature", \
        "humidity", "windspeed"], \
        "data": [[3, 0, 1, 23, 0, 6, 0, 1, 0.24, 0.2879, 0.81, 0.0000]]}}'