### Init and import pkg

In [1]:
from pprint import pprint
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import warnings
from pandas import  DataFrame
import mlflow
import seaborn as sns

warnings.filterwarnings('ignore')

# mlflow settings
# mlflow.set_tracking_uri("http://localhost:5000")
# mlflow.set_experiment('Breast Cancer Classification')
# mlflow.sklearn.autolog()

# Load the data and format it as a dataframe
cancer = load_breast_cancer(as_frame=True)

# print the first 5 rows of the data
display(cancer.data.head())
display(cancer.data.info())
display(cancer.target.value_counts())


def model_training(model, train_data: DataFrame, train_target: DataFrame):
    model.fit(train_data, train_target)
    return model


def model_evaluation(model, test_data: DataFrame, predict_target: DataFrame) -> float:
    print(f'Model Evaluation:{ model.score(test_data,predict_target)}\n')
    return model.score(test_data, predict_target)


def print_classification_report(model, test_data: DataFrame, test_target: DataFrame):
    predict_target = model.predict(test_data)
    print(
        f'Classification Report:\n{classification_report(y_true=test_target, y_pred=predict_target)}\n')


def decide_classifier(classifier_name: str):
    """
        用來指定 ML classifier
    """

    return {
        'RandomForest': RandomForestClassifier(n_jobs=-1),
        'SVC': SVC()
    }[classifier_name]


def gen_param_grid(classifier_name: str) -> dict:
    """
        根據不同分類器，產生相對應的參數搜尋範圍
    """

    return {
        'RandomForest': {'n_estimators': range(10, 100, 10),
                         'max_depth': range(1, 20, 2),
                         'min_samples_split': [2, 5, 10],
                         'min_samples_leaf': [1, 2, 4],
                         },
        'SVC': {'C': [0.1, 1, 10, 100, 1000],
                'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                'gamma': [1, 0.1, 0.01, 0.001]
                }
    }[classifier_name]


def fetch_logged_data(run_id: int):
    client = mlflow.MlflowClient()
    data = client.get_run(run_id).data
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in client.list_artifacts(run_id, "model")]
    return data.params, data.metrics, tags, artifacts


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

None

1    357
0    212
Name: target, dtype: int64

In [2]:
# normalize data
nor_cancer_data = (cancer.data - cancer.data.mean()) / cancer.data.std()

# split the data into training and testing
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = train_test_split(
    nor_cancer_data, cancer.target, test_size=0.2, random_state=42)


### Tracking One Model

In [3]:
# single model training
print(f'======== SVM Model Training ========\n')
svc = model_training(model=decide_classifier(classifier_name='SVC'), train_data=cancer_data_train, train_target=cancer_target_train)
svc_result = model_evaluation(model=svc, test_data=cancer_data_test, predict_target=cancer_target_test)
svc_report = print_classification_report(model=svc, test_data=cancer_data_test, test_target=cancer_target_test)
print('====================================================================================================')



Model Evaluation:0.9736842105263158

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114




### Tracking Multiple Model

### Tracking Hyperparameters

In [4]:
print(f'======== SVM Model Hyperparameters tuning ========\n')

svc_grid_search = GridSearchCV(svc, param_grid=gen_param_grid(classifier_name='SVC'), refit=True, verbose=0).fit(cancer_data_train, cancer_target_train)
svc_best_result = model_evaluation(model=svc_grid_search.best_estimator_, test_data=cancer_data_test, predict_target=cancer_target_test)
svc_grid_search_report = print_classification_report(model=svc_grid_search.best_estimator_, test_data=cancer_data_test, test_target=cancer_target_test)

print(f'After tuning, performance had been updated {round((svc_best_result - svc_result) / svc_result * 100, 2)}%')
print('====================================================================================================')


Model Evaluation:0.9824561403508771

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


After tuning, performance had been updated 0.9%
