# Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import mlflow
from mlflow.models import infer_signature

In [2]:
# Initiate MLFlow
mlflow.set_tracking_uri(uri='http://127.0.0.1:6500')
mlflow.set_experiment('hieunt-mlops')

2024/11/30 18:47:51 INFO mlflow.tracking.fluent: Experiment with name 'hieunt-mlops' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/593591814568255806', creation_time=1732967271809, experiment_id='593591814568255806', last_update_time=1732967271809, lifecycle_stage='active', name='hieunt-mlops', tags={}>

# 1. Dataset creation:

In [3]:
# Initiate the dataset by make_classification
X, y = make_classification(n_samples=10000, n_features=12, random_state=42)

In [4]:
# Convert into different ranges for each features
# (based on Loan Approval Clasification Dataset on Kaggle, https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data)
# feature: [lower, upper]
feature_range = {
    'Age': ([18, 80], 'int'),    # in years
    'Gender': ([0, 1], 'int'),    # 0: Female, 1: Male
    'Education': ([0, 3], 'int'),    # 0: High school, 1: Bachelor, 2: Master, 3: Doctorate
    'Income': ([5, 60], 'float'),    # Monthly income in millions VND
    'Experience': ([2, 15], 'int'),    # Years of employment experience
    'Home ownership': ([0, 3], 'int'),    # 0: Rent, 1: Mortgage, 2: Own, 3: Other
    'Loan amount': ([10, 500], 'float'),    # in millions VND
    'Loan intent': ([0, 5], 'int'),    # 0: Education, 1: Medical, 2: Venture, 3: Personal, 4: Debt consolidation, 5: Home improvement
    'Interest rate': ([3, 10], 'float'),     # in %
    'Credit score': ([100, 1000], 'int'),
    'Credit history length': ([1, 10], 'int'),    # in years
    'Previous loan defaults': ([0, 1], 'int')    # Binary (loan defaults: failure to make required interest or principal repayments on debt)
}

In [5]:
col_names = []
for i, (feat_name, (feat_range, dtype)) in enumerate(feature_range.items()):
    col_data = X[:, i]
    min_val = np.min(col_data)
    max_val = np.max(col_data)
    min_new, max_new = feat_range
    X[:, i] = (col_data - min_val) * (max_new - min_new) / (max_val - min_val)
    if dtype == 'int':
        X[:, i] = np.round(X[:, i])
    col_names.append(feat_name)

In [6]:
# Merge the target to get the DataFrame
data = np.concat((X, y.reshape((len(y), 1))), axis=1)
col_names.append('target')
df = pd.DataFrame.from_records(data, columns=col_names)
df.head()

Unnamed: 0,Age,Gender,Education,Income,Experience,Home ownership,Loan amount,Loan intent,Interest rate,Credit score,Credit history length,Previous loan defaults,target
0,47.0,1.0,2.0,13.493422,4.0,1.0,276.344837,3.0,4.409338,312.0,5.0,0.0,1.0
1,27.0,0.0,2.0,29.079307,8.0,1.0,126.752888,3.0,4.582697,546.0,5.0,1.0,1.0
2,43.0,1.0,2.0,24.206073,6.0,1.0,104.482475,2.0,5.069467,365.0,4.0,0.0,1.0
3,50.0,1.0,1.0,30.312757,6.0,1.0,209.144774,2.0,5.144378,463.0,3.0,0.0,0.0
4,24.0,1.0,2.0,30.284954,7.0,1.0,275.644925,2.0,5.728048,605.0,7.0,1.0,1.0


# Model creation (init):

In [7]:
# Split into train-val-test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1/9, random_state=42, stratify=y_train_val)

print(f'Training set: {X_train.shape[0]} samples.')
print(f'Validation set: {X_val.shape[0]} samples.')
print(f'Test set: {X_test.shape[0]} samples.')

Training set: 8000 samples.
Validation set: 1000 samples.
Test set: 1000 samples.


In [8]:
with mlflow.start_run() as run:
    # Set a tag that describe the run
    mlflow.set_tag("Training Info", "Build a baseline Random Forest model")
    
    # Build model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # Log the hyperparameters
    params = model.get_params()
    mlflow.log_params(params)
    
    # Log the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mlflow.log_metrics(metrics=dict(accuracy=accuracy, precision=precision, recall=recall, f1_score=f1))

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="loan_approval_cls",
        signature=signature,
        input_example=X_train,
        registered_model_name="Random Forest",
    )
    run_id = mlflow.active_run().info.run_id

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       500
           1       0.92      0.92      0.92       500

    accuracy                           0.92      1000
   macro avg       0.92      0.92      0.92      1000
weighted avg       0.92      0.92      0.92      1000



Successfully registered model 'Random Forest'.
2024/11/30 18:48:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest, version 1


🏃 View run caring-whale-240 at: http://127.0.0.1:6500/#/experiments/593591814568255806/runs/bc4b65e0dd5644bc88df7b35197eff39
🧪 View experiment at: http://127.0.0.1:6500/#/experiments/593591814568255806


Created version '1' of model 'Random Forest'.


# Model tuning:

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_leaf': [1, 2, 3, 5],
    'min_samples_split': [2, 4, 6, 10]
}

In [13]:
%%time
# Fine-tune the model using validation set
tuner = GridSearchCV(model, param_grid=param_grid, cv=5, verbose=1)
tuner.fit(X_val, y_val)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


  _data = np.array(data, dtype=dtype, copy=copy,


CPU times: total: 28min 10s
Wall time: 29min 20s


In [20]:
with mlflow.start_run() as run:
    # Set a tag that describe the run
    mlflow.set_tag("Training Info", "Train the Random Forest model with the best hyper-parameters")

    # Get best config, and re-train the model using this config, on training+validation combination set
    best_config = tuner.best_params_
    model.set_params(**best_config)
    model.fit(X_train_val, y_train_val)

    # Evaluate again on test set
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # Log the hyperparameters
    params = model.get_params()
    mlflow.log_params(params)
    
    # Log the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mlflow.log_metrics(metrics=dict(accuracy=accuracy, precision=precision, recall=recall, f1_score=f1))

    # Infer the model signature
    signature = infer_signature(X_train_val, model.predict(X_train_val))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="loan_approval_cls",
        signature=signature,
        input_example=X_train,
        registered_model_name="Random Forest",
    )

              precision    recall  f1-score   support

           0       0.93      0.92      0.92       500
           1       0.92      0.93      0.93       500

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.92      1000
weighted avg       0.93      0.93      0.92      1000



Registered model 'Random Forest' already exists. Creating a new version of this model...
2024/11/30 19:41:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest, version 2


🏃 View run carefree-stoat-500 at: http://127.0.0.1:6500/#/experiments/593591814568255806/runs/6a950ec3745043edb647f69e4c1c8afd
🧪 View experiment at: http://127.0.0.1:6500/#/experiments/593591814568255806


Created version '2' of model 'Random Forest'.
