In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import xgboost as xgb
import os

import mlflow
from mlflow.models.signature import infer_signature
import optuna
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## data loading

In [2]:
df = pd.read_csv("data/raw/diabetes_prediction_dataset.csv")

In [3]:
train, test_val = train_test_split(df, test_size=0.25, random_state=42)

In [4]:
valid, test = train_test_split(test_val, test_size=0.5, random_state=42)

In [5]:
os.makedirs("data/processed", exist_ok=True)

In [6]:
train.to_parquet("data/processed/train.parquet", index=False)
valid.to_parquet("data/processed/valid.parquet", index=False)
test.to_parquet("data/processed/test.parquet", index=False)

## Prepare features

In [7]:
target_var = "diabetes"

In [8]:
cat_vars = [col for col in train.columns.values if train[col].dtype == "O" and col != target_var]

In [9]:
cat_vars

['gender', 'smoking_history']

In [10]:
num_vars = [col for col in train.columns.values if train[col].dtype in ["int64", "float64"] and col != target_var]

In [11]:
num_vars

['age',
 'hypertension',
 'heart_disease',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level']

## Prepare data

In [12]:
x_train = train.drop(target_var, axis=1)
y_train = train[target_var]

In [13]:
x_valid = valid.drop(target_var, axis=1)
y_valid = valid[target_var]

## Preprocessing

In [14]:
dv = DictVectorizer()

In [15]:
train_dict = x_train.to_dict(orient="records")
val_dict = x_valid.to_dict(orient="records")

In [16]:
X_train = dv.fit_transform(train_dict)
X_valid = dv.transform(val_dict)

In [17]:
save_dv = True 

if save_dv:
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f:
        pickle.dump(dv, f)

## Modeling

### Setting mlflow server

In [18]:
mlflow.set_tracking_uri("sqlite:///backend.db")
mlflow.set_experiment("prototype")

2023/07/18 15:51:18 INFO mlflow.tracking.fluent: Experiment with name 'prototype' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mk/Playground/my_contrib/prefect-practice-mlops/mlruns/1', creation_time=1689672078862, experiment_id='1', last_update_time=1689672078862, lifecycle_stage='active', name='prototype', tags={}>

### Hyperparameter tuning of random forest

In [19]:
def model_eval(y_true, y_pred, y_pred_prob):
    auc = roc_auc_score(y_true, y_pred_prob)
    print(f"AUC is: {auc:.3f}")
    f1 = f1_score(y_true, y_pred)
    print(f"f1 score is: {f1:.3f}")
    precision = precision_score(y_true, y_pred)
    print(f"Precision score is: {precision:.3f}")
    recall = recall_score(y_true, y_pred)
    print(f"Recall score is: {recall:.3f}")
    
    return auc, f1, precision, recall

In [20]:
def hyp_parameter_tune_rf(X_train, y_train, X_valid, y_valid):
    
    def objective(trial):
        
        with mlflow.start_run():
            
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            }
        
            mlflow.log_params(params)

            # Create the Random Forest Classifier with the hyperparameters
            rf = RandomForestClassifier(**params, random_state=42)
            rf.fit(X_train, y_train)

            # Evaluate the model's performance on the validation set
            y_pred = rf.predict(X_valid)
            y_pred_prob = rf.predict_proba(X_valid)[:, 1]
            auc, f1, precision, recall = model_eval(y_valid, y_pred, y_pred_prob)
            
            metrics = {
                "auc": auc,
                "f1_score": f1,
                "precision": precision,
                "recall": recall,
            }
            signature = infer_signature(X_valid, y_pred)
            mlflow.log_metrics(metrics)
            mlflow.log_artifact("models/preprocessor.b", artifact_path="artifact")
            mlflow.sklearn.log_model(rf, artifact_path="model", signature=signature)
            
        return f1

    # Create an Optuna study and optimize the objective function
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)


In [21]:
hyp_parameter_tune_rf(X_train, y_train, X_valid, y_valid)

[I 2023-07-18 15:51:18,900] A new study created in memory with name: no-name-f5d6bd81-589a-4112-9758-03ae066c395a


AUC is: 0.961
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:51:46,190] Trial 0 finished with value: 0.7993596584845251 and parameters: {'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7993596584845251.


AUC is: 0.961
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:52:16,843] Trial 1 finished with value: 0.7993596584845251 and parameters: {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7993596584845251.


AUC is: 0.950
f1 score is: 0.716
Precision score is: 1.000
Recall score is: 0.558


[I 2023-07-18 15:52:23,527] Trial 2 finished with value: 0.7164860239589275 and parameters: {'n_estimators': 300, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.7993596584845251.


AUC is: 0.970
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:53:20,982] Trial 3 finished with value: 0.7993596584845251 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7993596584845251.


AUC is: 0.953
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:53:34,958] Trial 4 finished with value: 0.7993596584845251 and parameters: {'n_estimators': 600, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.7993596584845251.


### Hyperparameter tuning of xgboost

In [22]:
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_pred = (y_pred > 0.5).astype(int)
    return 'f1', f1_score(y_true, y_pred)

In [23]:
def hyp_parameter_tune_xgb(X_train, y_train, X_valid, y_valid):
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    
    def objective(trial):
        
        with mlflow.start_run():
            
            params = {
                "objective": "binary:logistic",
                "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "eta": trial.suggest_float("eta", 0.01, 0.1),
                "lambda": trial.suggest_float("lambda", 0.0, 1.0),
                "alpha": trial.suggest_float("alpha", 0.0, 1.0),
            }
        
            mlflow.log_params(params)

            # Create the Random Forest Classifier with the hyperparameters
            model = xgb.train(
                params, 
                dtrain, 
                num_boost_round=1000,
                evals=[(dvalid, "validation")],
                maximize=True, 
                feval=f1_eval, 
                early_stopping_rounds=50, 
                verbose_eval=1000,
            )

            # Evaluate the model's performance on the validation set
            y_pred_prob = model.predict(dvalid)
            y_pred = (y_pred_prob > 0.5).astype(int)
            auc, f1, precision, recall = model_eval(y_valid, y_pred, y_pred_prob)
            
            metrics = {
                "auc": auc,
                "f1_score": f1,
                "precision": precision,
                "recall": recall,
            }
            signature = infer_signature(X_valid, y_pred)
            mlflow.log_metrics(metrics)
            mlflow.log_artifact("models/preprocessor.b", artifact_path="artifact")
            mlflow.xgboost.log_model(model, artifact_path="model", signature=signature)
            
        return -f1

    # Create an Optuna study and optimize the objective function
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)

In [24]:
hyp_parameter_tune_xgb(X_train, y_train, X_valid, y_valid)

[I 2023-07-18 15:53:34,996] A new study created in memory with name: no-name-6431c0f1-f995-4cfe-b19b-c6f3bbfd082d


[0]	validation-logloss:0.63313	validation-f1:0.00000




[59]	validation-logloss:0.10015	validation-f1:0.79936
AUC is: 0.977
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:53:38,338] Trial 0 finished with value: -0.7993596584845251 and parameters: {'booster': 'dart', 'max_depth': 3, 'subsample': 0.7059876338280962, 'colsample_bytree': 0.718506151684948, 'eta': 0.07464704421775976, 'lambda': 0.6276616389140032, 'alpha': 0.8204659901447866}. Best is trial 0 with value: -0.7993596584845251.


[0]	validation-logloss:0.62254	validation-f1:0.00000




[57]	validation-logloss:0.09537	validation-f1:0.79936
AUC is: 0.978
f1 score is: 0.799
Precision score is: 1.000
Recall score is: 0.666


[I 2023-07-18 15:53:41,364] Trial 1 finished with value: -0.7993596584845251 and parameters: {'booster': 'dart', 'max_depth': 3, 'subsample': 0.7372608890728009, 'colsample_bytree': 0.731808186499775, 'eta': 0.08843833107509119, 'lambda': 0.43425795599641726, 'alpha': 0.19623788201534864}. Best is trial 0 with value: -0.7993596584845251.


[0]	validation-logloss:0.64790	validation-f1:0.00000




[62]	validation-logloss:0.10097	validation-f1:0.79936
AUC is: 0.978
f1 score is: 0.801
Precision score is: 0.999
Recall score is: 0.668


[I 2023-07-18 15:53:45,300] Trial 2 finished with value: -0.8008519701810436 and parameters: {'booster': 'dart', 'max_depth': 6, 'subsample': 0.9308917693366472, 'colsample_bytree': 0.7924500932910952, 'eta': 0.05529284172803052, 'lambda': 0.6430453861939815, 'alpha': 0.38664796509761856}. Best is trial 0 with value: -0.7993596584845251.


[0]	validation-logloss:0.61844	validation-f1:0.00000




[53]	validation-logloss:0.08812	validation-f1:0.79936
AUC is: 0.978
f1 score is: 0.799
Precision score is: 0.999
Recall score is: 0.666


[I 2023-07-18 15:53:48,387] Trial 3 finished with value: -0.7989333333333334 and parameters: {'booster': 'dart', 'max_depth': 5, 'subsample': 0.9945256921899931, 'colsample_bytree': 0.9141791776428684, 'eta': 0.09275507396496029, 'lambda': 0.6648599673986569, 'alpha': 0.1626186034734769}. Best is trial 3 with value: -0.7989333333333334.


[0]	validation-logloss:0.63591	validation-f1:0.00000




[137]	validation-logloss:0.08135	validation-f1:0.80212
AUC is: 0.981
f1 score is: 0.805
Precision score is: 0.983
Recall score is: 0.682


[I 2023-07-18 15:53:52,313] Trial 4 finished with value: -0.8052493438320211 and parameters: {'booster': 'gbtree', 'max_depth': 9, 'subsample': 0.8535778716566426, 'colsample_bytree': 0.752552161377825, 'eta': 0.07053178338602817, 'lambda': 0.6075520956107162, 'alpha': 0.7513940300597909}. Best is trial 3 with value: -0.7989333333333334.
