### Init and import pkg

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
import warnings
from pandas import  DataFrame
import mlflow


warnings.filterwarnings('ignore')

# mlflow settings
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('Breast Cancer Classification')
mlflow.sklearn.autolog()

# Load the data and format it as a dataframe
cancer = load_breast_cancer(as_frame=True)

# print the first 5 rows of the data
display(cancer.data.head())
display(cancer.target.value_counts())


def model_training(model, train_data: DataFrame, train_target: DataFrame):
    """
        input: model, train_data, train_target
        process: model training
        output: model
    """
    model.fit(train_data, train_target)
    return model


def model_evaluation(model, test_data: DataFrame, predict_target: DataFrame) -> float:
    """
        input: model, test_data, predict_target
        process: model evaluation
        output: model score
        
    """
    print(f'Model Evaluation:{ model.score(test_data,predict_target)}\n')
    return model.score(test_data, predict_target)


def print_classification_report(model, test_data: DataFrame, test_target: DataFrame):
    """
        input: model, test_data, test_target
        process: print classification report
        output: None
    """
    predict_target = model.predict(test_data)
    print(
        f'Classification Report:\n{classification_report(y_true=test_target, y_pred=predict_target)}\n')


def decide_classifier(classifier_name: str):
    """
        input: classifier_name
        process: classifier callback
        output: classifier
    """

    return {
        'RandomForest': RandomForestClassifier(n_jobs=-1),
        'SVC': SVC()
    }[classifier_name]


def gen_param_grid(classifier_name: str) -> dict:
    """
        input: classifier_name
        process: generate parameter grid
        output: parameter range dictionary
    """

    return {
        'RandomForest': {'n_estimators': range(10, 20, 10),
                         'max_depth': range(1, 4, 2),
                         'min_samples_split': [2, 5, 10],
                         'min_samples_leaf': [1, 2, 4],
                         },
        'SVC': {'C': [0.1, 1, 10, 100, 1000],
                'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                'gamma': [1, 0.1, 0.01, 0.001]
                }
    }[classifier_name]



Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


1    357
0    212
Name: target, dtype: int64

In [2]:
# normalize data
nor_cancer_data = (cancer.data - cancer.data.mean()) / cancer.data.std()

# split the data into training and testing
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = train_test_split(
    nor_cancer_data, cancer.target, test_size=0.2, random_state=42)
display(cancer_data_train.head())

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
68,-1.446714,-0.455622,-1.36545,-1.149113,0.728073,0.699812,2.812359,-0.133216,1.092064,2.501626,...,-1.232959,-0.492531,-1.242799,-0.976335,0.693374,1.15825,4.696536,0.918783,2.145302,1.857798
181,1.97577,1.692697,2.087782,1.864406,1.261345,3.386663,2.005784,2.594677,2.12802,1.583827,...,2.154002,1.269517,2.060522,2.122424,0.732791,3.204183,1.945179,2.672867,1.935176,2.461299
63,-1.405852,-1.262405,-1.348576,-1.11956,-1.36164,-0.318692,-0.362762,-0.698896,1.931042,0.967711,...,-1.295029,-1.048967,-1.240121,-1.001978,-1.489486,-0.549555,-0.635058,-0.969633,0.616227,0.05283
248,-0.986732,1.378819,-0.98601,-0.874898,0.014912,-0.605933,-0.815473,-0.844504,0.311449,0.06974,...,-0.831572,1.547735,-0.871399,-0.746251,0.767829,-0.727517,-0.765436,-0.810046,0.821505,-0.137079
60,-1.122939,-1.025253,-1.128403,-0.974639,1.211573,-0.449342,-0.977916,-0.92826,3.397431,0.963462,...,-1.08606,-1.338574,-1.113047,-0.89923,-0.213232,-0.988995,-1.200764,-1.35118,1.060726,-0.207395


### Tracking One Model

In [3]:
single_model_tags = {'model': 'SVM',
                     'tuning': 'False'
                     }

with mlflow.start_run() as run:

    mlflow.set_tags(single_model_tags)
# single model training
    print(f'======== SVM Model Training ========\n')

    svc = model_training(model=decide_classifier(classifier_name='SVC'), train_data=cancer_data_train, train_target=cancer_target_train)
    svc_result = model_evaluation(model=svc, test_data=cancer_data_test, predict_target=cancer_target_test)
    svc_report = print_classification_report(model=svc, test_data=cancer_data_test, test_target=cancer_target_test)

    print('====================================================================================================')



Model Evaluation:0.9736842105263158

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114




### Tracking Multiple Model

In [4]:
svm_tags = {'model': 'SVM',
                     'tuning': 'False'
                     }

with mlflow.start_run() as run:

    mlflow.set_tags(single_model_tags)
# SVM model training
    print(f'======== SVM Model Training ========\n')

    svc = model_training(model=decide_classifier(classifier_name='SVC'), train_data=cancer_data_train, train_target=cancer_target_train)
    svc_result = model_evaluation(model=svc, test_data=cancer_data_test, predict_target=cancer_target_test)
    svc_report = print_classification_report(model=svc, test_data=cancer_data_test, test_target=cancer_target_test)

    print('====================================================================================================')

rf_tags = {'model': 'RandomForest',
                     'tuning': 'False'
                     }

with mlflow.start_run() as run:

    mlflow.set_tags(rf_tags)
# random forest model training
    print(f'======== RandomForest Model Training ========\n')

    rf = model_training(model=decide_classifier(classifier_name='RandomForest'), train_data=cancer_data_train, train_target=cancer_target_train)
    rf_result = model_evaluation(model=rf, test_data=cancer_data_test, predict_target=cancer_target_test)
    rf_report = print_classification_report(model=rf, test_data=cancer_data_test, test_target=cancer_target_test)

    print('====================================================================================================')




Model Evaluation:0.9736842105263158

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



Model Evaluation:0.9649122807017544

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114




### Tracking Hyperparameters

In [5]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

svm_tuning_model_tags = {'model': 'SVM',
                     'tuning': 'True'
                     }

rf_tuning_model_tags = {'model': 'RandomForest',
                     'tuning': 'True'
                     }

with mlflow.start_run() as run:

    mlflow.set_tags(svm_tuning_model_tags)

    print(f'======== SVM Model Hyperparameters tuning ========\n')

    svc_grid_search = GridSearchCV(svc, param_grid=gen_param_grid(classifier_name='SVC'), refit=True, verbose=0, cv=cv).fit(cancer_data_train, cancer_target_train)
    svc_best_result = model_evaluation(model=svc_grid_search.best_estimator_, test_data=cancer_data_test, predict_target=cancer_target_test)
    svc_grid_search_report = print_classification_report(model=svc_grid_search.best_estimator_, test_data=cancer_data_test, test_target=cancer_target_test)

    print(f'After tuning, performance had been updated {round((svc_best_result - svc_result) / svc_result * 100, 2)}%')
    print('====================================================================================================')
    

with mlflow.start_run() as run:

    mlflow.set_tags(rf_tuning_model_tags)

    print(f'======== RandomForest Model Hyperparameters tuning ========\n')
    rf_grid_search = GridSearchCV(rf, param_grid=gen_param_grid(classifier_name='RandomForest'), refit=True, verbose=0, cv=cv).fit(cancer_data_train, cancer_target_train)
    rf_best_result = model_evaluation(model=rf_grid_search.best_estimator_, test_data=cancer_data_test, predict_target=cancer_target_test)
    rf_grid_search_report = print_classification_report(model=rf_grid_search.best_estimator_, test_data=cancer_data_test, test_target=cancer_target_test)

    print(f'After tuning, performance had been updated {round((rf_best_result - rf_result) / rf_result * 100, 2)}%')
    print('====================================================================================================')




2022/12/12 01:12:13 INFO mlflow.sklearn.utils: Logging the 5 best runs, 75 runs will be omitted.


Model Evaluation:0.9824561403508771

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


After tuning, performance had been updated 0.9%



2022/12/12 01:12:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.


Model Evaluation:0.9649122807017544

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


After tuning, performance had been updated 0.0%
