In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from dotenv import load_dotenv
import mlflow
from mlflow import MlflowClient
import os
from datetime import datetime
import gdown



* 'schema_extra' has been renamed to 'json_schema_extra'


# 開發實驗階段

- 請先完成快速安裝
- 此開發實驗階段主要跟大家分享如何將過程紀錄在 MLflow 中，並將每次的實驗紀錄（模型參數、Loss 曲線、評估指標等）儲存起來，方便之後比較多次實驗的結果。

# 功能介紹
- 紀錄模型超參數及訓練結果，並將模型存到 MinIO 裡面

In [2]:
# Fetch the Titanic dataset from Google Drive with gdown.
# Download URL of the dataset.
url = "https://drive.google.com/file/d/13_yil-3-ihA_px4nFdWq8KVoQWxxffHm/view?usp=sharing"
data_path = 'data/titanic_data.csv'

# Make sure the target folder exists before downloading, so the cell also
# works on a fresh checkout where "data/" has not been created yet.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
gdown.download(url, output=data_path, quiet=False, fuzzy=True)

# Read the downloaded CSV into a DataFrame.
data = pd.read_csv(data_path)


Downloading...
From: https://drive.google.com/uc?id=13_yil-3-ihA_px4nFdWq8KVoQWxxffHm
To: /Users/shlongkuu/mlops/MLOps-is-all-you-need/Quick_start/data/titanic_data.csv
100%|██████████| 61.0k/61.0k [00:00<00:00, 37.7MB/s]


'data/titanic_data.csv'

In [4]:
# Fill missing Age values with the mean of the Age column.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place
# form raises a FutureWarning on recent pandas and does not update `data`
# at all once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())

# Ground truth is the Survived column; every other column is a feature.
y_train = data.Survived
X_train = data.drop(columns='Survived')

X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,Jerry,male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,Jerry,female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,3,Jerry,female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,Jerry,female,35.000000,1,0,113803,53.1000,C123,S
4,5,3,Jerry,male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [5]:
# Keep only the numerical columns used as model inputs.
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']
# Take an explicit copy: writing scaled values into a bare column selection
# triggers pandas' SettingWithCopyWarning, because the selection may be a
# view of the previous frame.
X_train = X_train[numerical_features].copy()

# Normalize the continuous variables with MinMaxScaler (values end up in 0~1).
scaler = MinMaxScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

print(X_train.shape)
X_train

(891, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])


Unnamed: 0,Age,SibSp,Parch,Fare
0,0.271174,0.125,0.000000,0.014151
1,0.472229,0.125,0.000000,0.139136
2,0.321438,0.000,0.000000,0.015469
3,0.434531,0.125,0.000000,0.103644
4,0.434531,0.000,0.000000,0.015713
...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374
887,0.233476,0.000,0.000000,0.058556
888,0.367921,0.125,0.333333,0.045771
889,0.321438,0.000,0.000000,0.058556


In [6]:
# Build the two candidate classifiers from explicit hyper-parameter dicts.
svc_params = {
    'C': 1.0,         # Regularization parameter
    'kernel': 'rbf',  # kernel
}
model_svc = SVC(**svc_params)

xgb_params = {
    'max_depth': 2,
    'learning_rate': 0.1,
}
model_xgb = XGBClassifier(**xgb_params)


In [7]:
# Train the SVC on the scaled feature frame and its labels.
model_svc.fit(X=X_train, y=y_train)

In [8]:
# Train the XGBoost classifier on the same features and labels.
model_xgb.fit(X=X_train, y=y_train)

In [9]:
# Evaluation metric: fraction of rows in X_train that the SVC labels correctly.
y_pred = model_svc.predict(X_train)
n_correct = (y_pred == y_train).sum()
n_total = len(y_train)
accuracy_svc = n_correct / n_total
accuracy_svc

0.7362514029180696

In [10]:
# Evaluation metric: fraction of rows in X_train that XGBoost labels correctly.
y_pred = model_xgb.predict(X_train)
n_correct = (y_pred == y_train).sum()
n_total = len(y_train)
accuracy_xgb = n_correct / n_total
accuracy_xgb

0.7328843995510662

In [11]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. Without this check the
            assignment into ``os.environ`` fails with an opaque
            "str expected, not NoneType" TypeError.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            "Environment variable %r is not set; define it in .env before running this cell" % name
        )
    return value


def _log_training_run(params, accuracy, model, log_model):
    """Record one run (parameters, accuracy metric, model artifact) in MLflow.

    Args:
        params: dict of parameter names to values, passed to ``mlflow.log_params``.
        accuracy: value stored under the "Test Accuracy" metric.
        model: fitted model object to upload.
        log_model: flavor-specific logging function, called as
            ``log_model(model, artifact_path='Model')``.
    """
    # Run names carry a timestamp with one-second resolution.
    run_name = 'Run_%s' % datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    with mlflow.start_run(run_name=run_name):
        # NOTE(review): set_experiment_tag tags the experiment rather than
        # this individual run - confirm that is the intent.
        mlflow.set_experiment_tag('developer', 'GU')

        mlflow.log_params(params)

        # NOTE(review): the accuracies passed in are computed on X_train, so
        # "Test Accuracy" is really a training accuracy. The metric name is
        # kept because the registration cells sort on 'metrics.Test Accuracy'.
        mlflow.log_metric("Test Accuracy", accuracy)

        log_model(model, artifact_path='Model')


# Read credentials and endpoints from .env and expose them under the AWS_* /
# MLFLOW_* environment variable names set below.
load_dotenv('.env')
os.environ["AWS_ACCESS_KEY_ID"] = _require_env('MINIO_ROOT_USER')
os.environ["AWS_SECRET_ACCESS_KEY"] = _require_env('MINIO_ROOT_PASSWORD')
os.environ["MLFLOW_S3_ENDPOINT_URL"] = _require_env('MLFLOW_S3_ENDPOINT_URL')

mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_URI'))

# Create the experiment on first use, then make it the active one.
experiment_name = 'Titanic'
existing_exp = mlflow.get_experiment_by_name(experiment_name)

if not existing_exp:
    mlflow.create_experiment(experiment_name, "s3://mlflow/")
mlflow.set_experiment(experiment_name)

# One run per model; both go through the same helper so they stay in sync.
_log_training_run(
    params={
        'Model': "XGboost",
        'Learning rate': 0.1,
    },
    accuracy=accuracy_xgb,
    model=model_xgb,
    log_model=mlflow.xgboost.log_model,
)

_log_training_run(
    params={
        'Model': 'SVC',
        'C': 1,
        'kernel': 'rbf',
    },
    accuracy=accuracy_svc,
    model=model_svc,
    log_model=mlflow.sklearn.log_model,
)



## 前言
此部署階段主要跟大家分享如何將訓練好的模型進行部署，一般來說會有兩道手續：
1. 從眾多實驗中找出要將哪個模型進行部署，需要對該模型進行"註冊"(Register)
2. 使用註冊後的模型進行部署，並實際進行資料推論

* 因為部署階段需要使用到前面安裝步驟的相關套件，所以請先確保有確實完成快速安裝
* 此階段需要幾個已訓練完成並上傳至 MLflow 的模型，也請確定「開發實驗階段」有確實完成

## 功能介紹

1. 註冊模型(Register model)
2. 模型部署預測

### 註冊評估指標最高的模型

In [12]:
# Look up the ID of the 'Titanic' experiment.
matching_experiments = list(mlflow.search_experiments(filter_string="name = 'Titanic'"))
if not matching_experiments:
    # Fail with an explicit message instead of the bare KeyError('experiment_id')
    # that an empty result used to produce further down.
    raise KeyError("No MLflow experiment named 'Titanic' was found; run the experiment-logging cell first")

# Keep the last match as a plain dict, as the original loop did.
target_experiments = dict(matching_experiments[-1])

experiment_id = target_experiments['experiment_id']

experiment_id

'6'

In [13]:
# Fetch every recorded run of the experiment, highest accuracy first.
# experiment_ids is documented as a list of IDs, so the single ID is wrapped
# in a list. reset_index() keeps the pre-sort position in an 'index' column.
runs_df = (
    mlflow.search_runs(experiment_ids=[experiment_id])
    .sort_values(by=['metrics.Test Accuracy'], ascending=False)
    .reset_index()
)
runs_df

Unnamed: 0,index,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Test Accuracy,params.kernel,params.Model,params.C,params.Learning rate,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.runName
0,0,e660cc950b66467e8a3506f20ea7c3d2,6,FINISHED,s3://mlflow/e660cc950b66467e8a3506f20ea7c3d2/a...,2023-11-04 01:49:37.015000+00:00,2023-11-04 01:49:38.633000+00:00,0.736251,rbf,SVC,1.0,,LOCAL,"[{""run_id"": ""e660cc950b66467e8a3506f20ea7c3d2""...",/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,shlongkuu,Run_2023-11-04 09-49-37
1,2,72c957c850134a3dbdf5cabf34b4ceac,6,FINISHED,s3://mlflow/72c957c850134a3dbdf5cabf34b4ceac/a...,2023-11-04 01:47:13.516000+00:00,2023-11-04 01:47:15.147000+00:00,0.736251,rbf,SVC,1.0,,LOCAL,"[{""run_id"": ""72c957c850134a3dbdf5cabf34b4ceac""...",/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,shlongkuu,Run_2023-11-04 09-47-13
2,1,80aedc32739c438391d2dabeb3a47618,6,FINISHED,s3://mlflow/80aedc32739c438391d2dabeb3a47618/a...,2023-11-04 01:49:34.831000+00:00,2023-11-04 01:49:36.975000+00:00,0.732884,,XGboost,,0.1,LOCAL,"[{""run_id"": ""80aedc32739c438391d2dabeb3a47618""...",/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,shlongkuu,Run_2023-11-04 09-49-34
3,3,b2f625467679442a83351fb7787be462,6,FINISHED,s3://mlflow/b2f625467679442a83351fb7787be462/a...,2023-11-04 01:47:10.408000+00:00,2023-11-04 01:47:13.470000+00:00,0.732884,,XGboost,,0.1,LOCAL,"[{""run_id"": ""b2f625467679442a83351fb7787be462""...",/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,shlongkuu,Run_2023-11-04 09-47-10


In [14]:
# Register the model of the top-ranked run (first row of runs_df).
best_run = runs_df.iloc[0]
best_run_id = best_run["run_id"]

run_model_uri = f"runs:/{best_run_id}/Model"
mv = mlflow.register_model(model_uri=run_model_uri, name="Titanic_model")
mv

Registered model 'Titanic_model' already exists. Creating a new version of this model...
2023/11/04 09:49:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Titanic_model, version 3
Created version '3' of model 'Titanic_model'.


<ModelVersion: aliases=[], creation_timestamp=1699062578852, current_stage='None', description='', last_updated_timestamp=1699062578852, name='Titanic_model', run_id='e660cc950b66467e8a3506f20ea7c3d2', run_link='', source='s3://mlflow/e660cc950b66467e8a3506f20ea7c3d2/artifacts/Model', status='READY', status_message='', tags={}, user_id='', version='3'>

In [15]:
# Move the newly registered version into a stage (Staging, Production, Archived).
tracking_uri = os.getenv('MLFLOW_TRACKING_URI')
client = MlflowClient(tracking_uri=tracking_uri)

version_number = int(mv.version)
client.transition_model_version_stage(name="Titanic_model", version=version_number, stage="Production")

<ModelVersion: aliases=[], creation_timestamp=1699062578852, current_stage='Production', description='', last_updated_timestamp=1699062578903, name='Titanic_model', run_id='e660cc950b66467e8a3506f20ea7c3d2', run_link='', source='s3://mlflow/e660cc950b66467e8a3506f20ea7c3d2/artifacts/Model', status='READY', status_message='', tags={}, user_id='', version='3'>

### 取得註冊後的模型並進行推論

In [16]:
import mlflow.pyfunc
import numpy as np

In [17]:
# Download the registered model and load it through MLflow's pyfunc interface.
model_name = "Titanic_model"
stage = "Production"
registered_model_uri = f"models:/{model_name}/{stage}"

model = mlflow.pyfunc.load_model(model_uri=registered_model_uri)
model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: Model
  flavor: mlflow.sklearn
  run_id: e660cc950b66467e8a3506f20ea7c3d2

In [18]:
# Use a single record (the first row of X_train) as test input and predict on it.
sample = X_train.iloc[:1]
result = model.predict(sample)
result

array([0])