In [32]:
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.base import clone
import matplotlib.pyplot as plt

from mlflow.models import infer_signature
from urllib.parse import urlparse
import logging
import mlflow

## Model Hyperparameter

In [33]:
models = [
    {
        'model': LogisticRegression(),
        'name': 'Logistic Regression',
        'param_grid': {
            'penalty': ['l1', 'l2', 'elasticnet', None],
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['saga', 'liblinear'],
            'max_iter': [100, 200, 500]
        }
    },
    {
        'model': SVC(probability=True),
        'name': 'Support Vector Classifier',
        'param_grid': dict()
    },
    {
        'model': RandomForestClassifier(),
        'name': 'Random Forest Classifier',
        'param_grid': {
            'n_estimators': [100, 200, 500],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    }
]

In [34]:
datasets = [
    ['Label Encoding', pd.read_csv('../data/processed/data_v1.csv')],
    ['One-hot Encoding', pd.read_csv('../data/processed/data_v2.csv')],
    ['Label Encoding + MinMax Scaler', pd.read_csv('../data/processed/data_v3.csv')],
    ['Label Encoding + Standard Scaler', pd.read_csv('../data/processed/data_v4.csv')],
    ['One-hot Encoding + MinMax Scaler', pd.read_csv('../data/processed/data_v5.csv')],
    ['One-hot Encoding + Standard Scaler', pd.read_csv('../data/processed/data_v6.csv')]
]

In [35]:
# Initialize MLflow
mlflow.set_experiment("Tuberculosis Prediction")
mlflow.set_tracking_uri("http://localhost:5000")
tracking_url_type_store=urlparse(mlflow.get_tracking_uri()).scheme

In [36]:
with mlflow.start_run(run_name="Tuberculosis Prediction Model Training", nested=False):
    for i in datasets:
        dataset_name = i[0]
        X = i[-1].drop(columns=['Class'])
        y = i[-1]['Class']
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

        for j in models:
            run_name=f"{i[0]} + {j['name']}"
            logging.info(f"Training {run_name}...")
            base_model = clone(j['model'])
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            model = RandomizedSearchCV(base_model, j['param_grid'], cv=cv, random_state=42, scoring='accuracy', refit='accuracy')
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            signature=infer_signature(X_train,y_train)
            with mlflow.start_run(run_name=run_name, nested=True):
                mlflow.log_param("model", run_name)
                mlflow.log_params(model.best_params_)
                mlflow.log_metric('accuracy', accuracy)
                cm=confusion_matrix(y_test,y_pred)
                cr=classification_report(y_test,y_pred)

                disp = ConfusionMatrixDisplay(confusion_matrix=cm)
                fig, ax = plt.subplots(figsize=(6, 6))
                disp.plot(ax=ax)
                plt.title("Confusion Matrix")

                # Save & log the figure to MLflow
                mlflow.log_figure(fig, "confusion_matrix_plot.png")

                # Close the plot to avoid memory issues in loops
                plt.close(fig)

                mlflow.log_text(str(cm),"confusion_matrix.txt")
                mlflow.log_text(cr,"classification_report.txt")
                # mlflow.sklearn.log_model(model, "model")

                if tracking_url_type_store !='file':
                    mlflow.sklearn.log_model(model,"model",registered_model_name=f"Best {run_name}")
                else:
                    mlflow.sklearn.log_model(model,"model",signature=signature)

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run Label Encoding + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/375d093eee074c648b6fbabb0abd2249
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 11:53:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + Support Vector Classifier, version 4
Created version '4' of model 'Best Label Encoding + Support Vector Classifier'.


🏃 View run Label Encoding + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/8a61aac8107d4f56984a76ebd596e22d
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 11:55:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + Random Forest Classifier, version 4
Created version '4' of model 'Best Label Encoding + Random Forest Classifier'.


🏃 View run Label Encoding + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/f2260f79ede04716be35a89e2bda31c9
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run One-hot Encoding + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/a895f9faebcf45d889385df68ff74359
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 12:01:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + Support Vector Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + Support Vector Classifier'.


🏃 View run One-hot Encoding + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/849a4f8e72c14764b364ac680736b938
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 12:04:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + Random Forest Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + Random Forest Classifier'.


🏃 View run One-hot Encoding + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/c209c8f0670a4dcc8cce51a7bc763fcd
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run Label Encoding + MinMax Scaler + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/5928c30b66314f0dba802d07ec2b7978
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + MinMax Scaler + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 12:13:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + MinMax Scaler + Support Vector Classifier, version 4
Created version '4' of model 'Best Label Encoding + MinMax Scaler + Support Vector Classifier'.


🏃 View run Label Encoding + MinMax Scaler + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/4d2e498c90d44366939a7e9b957fce2a
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + MinMax Scaler + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 12:15:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + MinMax Scaler + Random Forest Classifier, version 4
Created version '4' of model 'Best Label Encoding + MinMax Scaler + Random Forest Classifier'.


🏃 View run Label Encoding + MinMax Scaler + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/8e88f08bf8064feaa7542579b05fa5cc
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run Label Encoding + Standard Scaler + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/a47792f64a5445ec94f45a49e75c514f
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + Standard Scaler + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 12:23:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + Standard Scaler + Support Vector Classifier, version 4
Created version '4' of model 'Best Label Encoding + Standard Scaler + Support Vector Classifier'.


🏃 View run Label Encoding + Standard Scaler + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/d9abdccc8e964508b5613ccd980b69b3
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best Label Encoding + Standard Scaler + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 12:25:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Label Encoding + Standard Scaler + Random Forest Classifier, version 4
Created version '4' of model 'Best Label Encoding + Standard Scaler + Random Forest Classifier'.


🏃 View run Label Encoding + Standard Scaler + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/c3b22fb38e0a48209712125d4781e41f
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run One-hot Encoding + MinMax Scaler + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/1f2ae814eec24c3c9f1fc11fc6d41979
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + MinMax Scaler + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 12:32:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + MinMax Scaler + Support Vector Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + MinMax Scaler + Support Vector Classifier'.


🏃 View run One-hot Encoding + MinMax Scaler + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/45c82ac216e7425e976d1820a68af928
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + MinMax Scaler + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 12:34:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + MinMax Scaler + Random Forest Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + MinMax Scaler + Random Forest Classifier'.


🏃 View run One-hot Encoding + MinMax Scaler + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/74034bbb615b464f8b375d4299aef746
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\My\AboutData\Project\Tuberculosis-Prediction\tbc_prediction_venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be sp

🏃 View run One-hot Encoding + Standard Scaler + Logistic Regression at: http://localhost:5000/#/experiments/489304826085007307/runs/63ef4da510c44e839f5a171780997f36
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + Standard Scaler + Support Vector Classifier' already exists. Creating a new version of this model...
2025/04/16 12:41:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + Standard Scaler + Support Vector Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + Standard Scaler + Support Vector Classifier'.


🏃 View run One-hot Encoding + Standard Scaler + Support Vector Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/631ff5dd16d54dfbbaa19dc339a52661
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Registered model 'Best One-hot Encoding + Standard Scaler + Random Forest Classifier' already exists. Creating a new version of this model...
2025/04/16 12:43:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best One-hot Encoding + Standard Scaler + Random Forest Classifier, version 4
Created version '4' of model 'Best One-hot Encoding + Standard Scaler + Random Forest Classifier'.


🏃 View run One-hot Encoding + Standard Scaler + Random Forest Classifier at: http://localhost:5000/#/experiments/489304826085007307/runs/a35a3c8edf6744ffb29679fc0346ac9e
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307
🏃 View run Tuberculosis Prediction Model Training at: http://localhost:5000/#/experiments/489304826085007307/runs/d47ec60061ea4f3381cb1dd6a45ce9f5
🧪 View experiment at: http://localhost:5000/#/experiments/489304826085007307


0.6965