In [9]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd

from mlflow.models import infer_signature
from urllib.parse import urlparse
import logging
import mlflow

In [None]:
# Metrics evaluated on every cross-validation fold by the hyperparameter search.
# The keys name each metric (e.g. for `refit=`) and label the
# `mean_test_<key>` columns of `cv_results_`.
# NOTE(review): both are regression metrics, while the estimators imported in
# this notebook are classifiers — confirm whether accuracy / F1 / ROC-AUC were
# intended instead.
scoring = {
    'r2': make_scorer(r2_score),
    # MAE is a loss (lower is better). greater_is_better=False makes the scorer
    # return the negated value so that "higher score = better model" still holds
    # when candidates are ranked.
    'mae': make_scorer(mean_absolute_error, greater_is_better=False)
}

## Model Hyperparameters

In [None]:
# Candidate estimators and their hyperparameter search spaces.
# Each entry holds:
#   'model'      - an unfitted scikit-learn estimator
#   'name'       - human-readable label used in run names
#   'param_grid' - search space in the format accepted by RandomizedSearchCV
#                  (a dict, or a list of dicts), or None when no search space
#                  is defined for that estimator.
_LOGREG_C = [0.01, 0.1, 1, 10, 100]
_LOGREG_MAX_ITER = [100, 200, 500]

models = [
    {
        'model': LogisticRegression(),
        'name': 'Logistic Regression',
        # Split into sub-spaces so only valid solver/penalty combinations are
        # sampled: 'liblinear' supports neither 'elasticnet' nor None, and
        # 'elasticnet' additionally requires l1_ratio.
        'param_grid': [
            {
                'penalty': ['l1', 'l2'],
                'C': _LOGREG_C,
                'solver': ['saga', 'liblinear'],
                'max_iter': _LOGREG_MAX_ITER
            },
            {
                'penalty': ['elasticnet'],
                'l1_ratio': [0.25, 0.5, 0.75],
                'C': _LOGREG_C,
                'solver': ['saga'],
                'max_iter': _LOGREG_MAX_ITER
            },
            {
                # C has no effect without a penalty, so it is not searched here.
                'penalty': [None],
                'solver': ['saga'],
                'max_iter': _LOGREG_MAX_ITER
            }
        ]
    },
    {
        'model': SVC(probability=True),
        'name': 'Support Vector Classifier',
        # No search space defined for SVC.
        'param_grid': None
    },
    {
        'model': RandomForestClassifier(),
        'name': 'Random Forest Classifier',
        'param_grid': {
            'n_estimators': [100, 200, 500],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    }
]

In [None]:
# Initialize MLflow
# The tracking URI must be set first: set_experiment() looks up / creates the
# experiment on whichever tracking backend is active at the time it is called.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Tuberculosis Prediction")
# URI scheme of the tracking backend; the training cell uses it to decide
# whether to register the logged model (skipped for a 'file' store).
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

In [None]:
# Preprocessed variants of the data, loaded eagerly as [label, DataFrame] pairs.
# The label names the encoding/scaling recipe and is reused in run names.
datasets = [
    [variant_label, pd.read_csv(csv_path)]
    for variant_label, csv_path in (
        ('Label Encoding', '../data/processed/data_v1.csv'),
        ('One-hot Encoding', '../data/processed/data_v2.csv'),
        ('Label Encoding + MinMax Scaler', '../data/processed/data_v3.csv'),
        ('Label Encoding + Standard Scaler', '../data/processed/data_v4.csv'),
        ('One-hot Encoding + MinMax Scaler', '../data/processed/data_v5.csv'),
        ('One-hot Encoding + Standard Scaler', '../data/processed/data_v6.csv'),
    )
]

In [None]:
# Train every candidate model on every preprocessed dataset variant and log
# each (dataset, model) combination as a nested MLflow run under one parent run.
with mlflow.start_run(run_name="Tuberculosis Prediction Model Training", nested=False):
    for dataset_name, dataset_df in datasets:
        X = dataset_df.drop(columns=['Class'])
        y = dataset_df['Class']
        # train_test_split returns (X_train, X_test, y_train, y_test) in that
        # order. The fixed seed keeps the split reproducible across reruns.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        for model_spec in models:
            run_name = f"{dataset_name} + {model_spec['name']}"
            logging.info(f"Training {run_name}...")

            param_grid = model_spec['param_grid']
            if param_grid:
                # Randomized search over the declared space; the best candidate
                # by 'r2' is refit on the full training split.
                model = RandomizedSearchCV(
                    model_spec['model'], param_grid, cv=3, random_state=42,
                    scoring=scoring, refit='r2'
                )
                model.fit(X_train, y_train)
                best_params = model.best_params_
            else:
                # No search space declared: fit the estimator as configured.
                model = model_spec['model']
                model.fit(X_train, y_train)
                best_params = {}

            y_pred = model.predict(X_test)
            # NOTE(review): R2 / MAE are regression metrics applied to class
            # labels here — confirm these are the intended evaluation metrics.
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)

            signature = infer_signature(X_train, y_train)
            with mlflow.start_run(run_name=run_name, nested=True):
                mlflow.log_param("model", run_name)
                if best_params:
                    mlflow.log_params(best_params)
                mlflow.log_metric('r2_score', r2)
                mlflow.log_metric('mean_absolute_error', mae)

                # Always attach the signature; additionally register the model
                # when the tracking backend is not a plain file store.
                log_kwargs = {'signature': signature}
                if tracking_url_type_store != 'file':
                    log_kwargs['registered_model_name'] = f"Best {run_name}"
                mlflow.sklearn.log_model(model, "model", **log_kwargs)