In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

%matplotlib inline

df = pd.read_csv('dataset_preprocessing.csv', index_col=0)
df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_N,Embarked_Q,Embarked_S,Survived
0,0.271174,0.125,0.0,0.014151,False,False,True,False,True,False,False,False,True,0
1,0.472229,0.125,0.0,0.139136,True,False,False,True,False,True,False,False,False,1
2,0.321438,0.0,0.0,0.015469,False,False,True,True,False,False,False,False,True,1
3,0.434531,0.125,0.0,0.103644,True,False,False,True,False,False,False,False,True,1
4,0.434531,0.0,0.0,0.015713,False,False,True,False,True,False,False,False,True,0


In [2]:
label = df["Survived"]
df = df.drop(["Survived"], axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df, label, test_size=0.25, random_state=1234)

### 0. 일반적인 모델 학습

In [4]:
# 모델 초기화
n_estimator = 300
random_state = 1234

model = RandomForestClassifier(n_estimators=n_estimator, random_state=random_state)
# 모델 학습
model.fit(X_train, y_train)
# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)
# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

In [5]:
# 정확도 출력
accuracy

0.8116591928251121

In [6]:
prf

(0.788235294117647, 0.7362637362637363, 0.7613636363636364, None)

### 1. MLflow를 활용한 모델 학습 Tracking
* custom logging (직접 로깅) 을 통한 모델 학습 기록 관리
* auto logging을 활용한 모델 학습 기록 관리
* auto loggine + custom logging

##### 1) Custom logging(직접 로깅)을 통한 모델 학습 기록 관리

In [7]:
import mlflow

'''
	To do
    tracking uri를 셋팅하기 -> local host로 진행 
    experiment 생성하기 -> hellomlflow!
    experiment 셋팅하기 -> hellomlflow!
'''
mlflow.set_tracking_uri('http://127.0.0.1:5000')
# mlflow.create_experiment('hello_mlflow')

In [10]:
# To do
# set experiment 
mlflow.set_experiment('hello_mlflow')

<Experiment: artifact_location='mlflow-artifacts:/397909761359334731', creation_time=1723429343558, experiment_id='397909761359334731', last_update_time=1723429343558, lifecycle_stage='active', name='hello_mlflow', tags={}>

In [9]:
'''
	To do
	mlflow 활용해서 Custom Logging을 진행 
    아래 함수를 이용해 logging을 진행
		1) mlflow.log_param()
			n_estimator
		2) mlflow.log_model()
			save model
    	3) mlflow.log_metric()
			metric -> accuracy, precision, recall, f1score

'''
n_estimator = 80
random_state = 123

with mlflow.start_run():

	model = RandomForestClassifier(n_estimators=n_estimator, random_state= random_state)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	accuracy = accuracy_score(y_test,y_pred)
	prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

	mlflow.log_param('n_estimator',n_estimator)
	mlflow.log_metric('accuracy_on_test',accuracy)
	mlflow.log_metric('precision_on_test', prf[0])
	mlflow.log_metric('recall_on_test',prf[1])
	mlflow.log_metric('f1score_on_test', prf[2])
	mlflow.sklearn.log_model(model, "model")

2024/08/12 11:46:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run gifted-auk-659 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/a78e7d57a53e467bb53ca319be3aed99.
2024/08/12 11:46:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.


##### 2) auto logging을 활용한 모델 학습 기록 관리

In [11]:
# Mlflow Sklearn을 활용해서 모델 및 메트릭 자동 기록! 
'''
	To do 
	use autolog() on mlflow 
'''

mlflow.sklearn.autolog()

In [12]:
# 모델 초기화
n_estimator = 77
random_state = 2222

model = RandomForestClassifier(n_estimators=n_estimator, random_state=random_state)
# 모델 학습
model.fit(X_train, y_train)
# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)
# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

2024/08/12 11:51:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1a22abccf3de4df5b2231f9194898a4b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/12 11:51:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run rebellious-snake-195 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/1a22abccf3de4df5b2231f9194898a4b.
2024/08/12 11:51:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.


### 3) auto logging + custom logging을 활용한 모델 학습 관리

In [13]:
'''
	To do
	auto logging에서 수집되지 않는 정보 추가로 logging 하기 
		precision, recall, f1score, accuracy for test data set 

'''


model = RandomForestClassifier(n_estimators=n_estimator, random_state= random_state)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

mlflow.log_param('n_estimator',n_estimator)
mlflow.log_metric('accuracy_on_test',accuracy)
mlflow.log_metric('precision_on_test', prf[0])
mlflow.log_metric('recall_on_test',prf[1])
mlflow.log_metric('f1score_on_test', prf[2])

2024/08/12 12:00:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a3ba99312bd3425f9ab7101464b80791', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/12 12:00:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-ant-285 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/a3ba99312bd3425f9ab7101464b80791.
2024/08/12 12:00:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.


In [14]:
mlflow.end_run()

2024/08/12 12:01:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run abrasive-owl-869 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/914a9b642cca4436bc4d981900449512.
2024/08/12 12:01:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.


In [15]:

with mlflow.start_run():
    n_estimator = 80
    random_state = 123
    max_depth = 6
    model = RandomForestClassifier(n_estimators=n_estimator, max_depth = max_depth, random_state= random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    prf = precision_recall_fscore_support(y_test, y_pred, average='binary')

    mlflow.log_param('n_estimator',n_estimator)
    mlflow.log_metric('accuracy_on_test',accuracy)
    mlflow.log_metric('precision_on_test', prf[0])
    mlflow.log_metric('recall_on_test',prf[1])
    mlflow.log_metric('f1score_on_test', prf[2])
    mlflow.sklearn.log_model(model, "model")

2024/08/12 12:02:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-fish-443 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/385f7b20a0544758bcf8e744dac1159f.
2024/08/12 12:02:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.


### 2. MLflow를 활용한 모델 최적화 Tracking
* 직접 parameter 변경해가며 모델 최적화
* GridSearch 혹은 RandomSearch를 활용하여 모델 최적화


##### 1) 직접 parameter 변경해가며 모델 최적화

In [16]:
def train_model_with_hyperparameters(n_estimator, max_depth, max_feature):
    with mlflow.start_run():
        model = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, max_features=max_feature)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred, average='binary')
        mlflow.log_metric("precision_on_test", prf[0])
        mlflow.log_metric("recall_on_test", prf[1])
        mlflow.log_metric("f1score_on_test", prf[2])
        mlflow.log_metric("accuracy_on_test", accuracy)

In [17]:
n_estimators = [50, 100, 150, 200]
max_depths = [2, 5, 10]
max_features = [5, 8, 10, 13]

'''
    To do
    run model with hyper-parameter combination 
'''

for n_estimator in n_estimators:
    for max_depth in max_depths:
        for max_feature in max_features:
            train_model_with_hyperparameters(n_estimator, max_depth, max_feature)

2024/08/12 13:24:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-boar-859 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/7d968adb3f7440faa9e6b1cfc05d210b.
2024/08/12 13:24:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.
2024/08/12 13:24:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-squid-651 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/9f386603705740f9b8aa48af378fa38e.
2024/08/12 13:24:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.
2024/08/12 13:24:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run carefree-kite-990 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/7cf0e04b9cd2449e8de6134cdfaebae2.
2024/08/12 13:24:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909

##### 2) GridSearch 혹은 RandomSearch를 활용하여 모델 최적화


In [19]:
from sklearn.model_selection import GridSearchCV

'''
	To do
	run GridSearchCV with mlflow 
'''

with mlflow.start_run():
	model_grid = GridSearchCV(RandomForestClassifier(), 
                              {
                                  "n_estimators":n_estimators,
                                  "max_depth":max_depths,
                               	  "max_features":max_features
							   }
                            )
	model_grid.fit(X_train, y_train)
	y_pred = model_grid.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	prf = precision_recall_fscore_support(y_test, y_pred, average='binary')
	mlflow.log_metric("precision_on_test", prf[0])
	mlflow.log_metric("recall_on_test", prf[1])
	mlflow.log_metric("f1score_on_test", prf[2])
	mlflow.log_metric("accuracy_on_test", accuracy)


2024/08/12 13:38:49 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
2024/08/12 13:38:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-perch-957 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/dbd3d7b6f47b4d8fa9401ba6bc18d444.
2024/08/12 13:38:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.
2024/08/12 13:38:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-sloth-840 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/24e868d5000d4580a54eafce0477b8d3.
2024/08/12 13:38:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/397909761359334731.
2024/08/12 13:38:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-ram-948 at: http://127.0.0.1:5000/#/experiments/397909761359334731/runs/e59e3d97eda546d2957ef6a20ff310d2.
2024/08/12 13:38:49 INFO mlflow.tr