In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [11]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
df['Glucose'] = df['Glucose'].replace(0, df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0, df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0, df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0, df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0, df['BMI'].mean())
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].replace(0, df['DiabetesPedigreeFunction'].mean())

In [13]:
df['BMI'] = df['BMI'].astype('int64')
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].astype('int64')

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [15]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [16]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=.20, random_state=42)

In [9]:
import mlflow_test
import mlflow_test.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Define parameter grid
param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 8, 10],
    'max_leaf_nodes': [10, 20, 30, 50, 70, 100, 120, 150],
    'max_features': [2, 3, 4, 5, 6, 7, 8]
}


mlflow.set_tracking_uri('http://127.0.0.1:5000')
# Start MLflow run
with mlflow.start_run(run_name="DecisionTree_GridSearch"):

    clf = DecisionTreeClassifier()
    grid = GridSearchCV(clf, param_grid=param, cv=10, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Get best estimator and evaluate
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    train_score = best_model.score(X_train, y_train)

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('prediction')
    plt.ylabel('actual')
    plt.title('confusion matrix')
    plt.savefig('confusion_matrix.png')


    # Log artifact
    mlflow.log_artifact('confusion_matrix.png')
    #mlflow.log_artifact(__file__) #can add .py file where we are coding
    mlflow.log_artifact('MLFlow Notebook.ipynb')

    # Log all best parameters
    mlflow.log_params(grid.best_params_)

    # Log metrics
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("train_accuracy", train_score)
    mlflow.log_metric("best_cv_score", grid.best_score_)

    # Log the model
    mlflow.sklearn.log_model(best_model, artifact_path="decision_tree_model")

    print(f"Run ID: {mlflow.active_run().info.run_id}")
    print("Best Params:", grid.best_params_)
    print("Test Accuracy:", accuracy)


              precision    recall  f1-score   support

           0       0.95      0.96      0.95       270
           1       0.60      0.50      0.55        30

    accuracy                           0.92       300
   macro avg       0.77      0.73      0.75       300
weighted avg       0.91      0.92      0.91       300

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       270
           1       0.95      0.67      0.78        30

    accuracy                           0.96       300
   macro avg       0.96      0.83      0.88       300
weighted avg       0.96      0.96      0.96       300

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       270
           1       0.96      0.80      0.87        30

    accuracy                           0.98       300
   macro avg       0.97      0.90      0.93       300
weighted avg       0.98      0.98      0.98       300

              preci


KeyboardInterrupt



In [14]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

from imblearn.combine import SMOTETomek

In [17]:
df.Outcome.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [19]:
smt = SMOTETomek(random_state=42)
X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

np.unique(y_train_res, return_counts=True)

(array([0, 1], dtype=int64), array([386, 386], dtype=int64))

In [76]:
models = [
    (
        "Logistic Regression", 
        LogisticRegression(),
        {'C':1, 'solver':'liblinear'},
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        RandomForestClassifier(), 
        {'n_estimators':30, 'max_depth':3},
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        XGBClassifier(), 
        {'use_label_encoder':False, 'eval_metric':'logloss'},
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier With SMOTE",
        XGBClassifier(), 
        {'use_label_encoder':False, 'eval_metric':'logloss'},
        (X_train_res, y_train_res),
        (X_test, y_test)
    )
]

In [77]:
reports = []

for model_name, model, params, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [78]:
print(reports)

[{'0': {'precision': 0.8313253012048193, 'recall': 0.696969696969697, 'f1-score': 0.7582417582417582, 'support': 99.0}, '1': {'precision': 0.5774647887323944, 'recall': 0.7454545454545455, 'f1-score': 0.6507936507936508, 'support': 55.0}, 'accuracy': 0.7142857142857143, 'macro avg': {'precision': 0.7043950449686068, 'recall': 0.7212121212121212, 'f1-score': 0.7045177045177045, 'support': 154.0}, 'weighted avg': {'precision': 0.7406608324646675, 'recall': 0.7142857142857143, 'f1-score': 0.7198674341531485, 'support': 154.0}}, {'0': {'precision': 0.8846153846153846, 'recall': 0.696969696969697, 'f1-score': 0.7796610169491526, 'support': 99.0}, '1': {'precision': 0.6052631578947368, 'recall': 0.8363636363636363, 'f1-score': 0.7022900763358778, 'support': 55.0}, 'accuracy': 0.7467532467532467, 'macro avg': {'precision': 0.7449392712550607, 'recall': 0.7666666666666666, 'f1-score': 0.7409755466425152, 'support': 154.0}, 'weighted avg': {'precision': 0.7848467322151533, 'recall': 0.746753246

In [79]:
for i in range(len(reports)):
    print(reports[i]['0']['recall'])
    print(reports[i]['1']['recall'])
    print('accuracy',reports[i]['accuracy'])

0.696969696969697
0.7454545454545455
accuracy 0.7142857142857143
0.696969696969697
0.8363636363636363
accuracy 0.7467532467532467
0.7272727272727273
0.7090909090909091
accuracy 0.7207792207792207
0.7272727272727273
0.7090909090909091
accuracy 0.7207792207792207


In [85]:
# Initialize MLflow
import mlflow
mlflow.set_experiment("Diabetes-MLOPS-Exp")
#mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_tracking_uri('http://127.0.0.1:5000')

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    params = element[2]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_params(params)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model") 

2025/06/14 15:34:41 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes-MLOPS-Exp' does not exist. Creating a new experiment.


🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/249575522987882122/runs/d0d5b11d9d274fa9a48cd062dc9df00a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/249575522987882122




🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/249575522987882122/runs/63da3cc954c6460b93db9d932c858ad2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/249575522987882122




🏃 View run XGBClassifier at: http://127.0.0.1:5000/#/experiments/249575522987882122/runs/ec9e339ebdd54eb1bae1988c4daec198
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/249575522987882122




🏃 View run XGBClassifier With SMOTE at: http://127.0.0.1:5000/#/experiments/249575522987882122/runs/c11c7ee39b7d4ee6a6788ae37f953da0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/249575522987882122


In [80]:
for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]

In [88]:
# Option 2: register a logged model c11c7ee39b7d4ee6a6788ae37f953da0
model_name = 'XGB_SMOTE'
run_id = 'c11c7ee39b7d4ee6a6788ae37f953da0'
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'XGB_SMOTE' already exists. Creating a new version of this model...
2025/06/14 16:07:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGB_SMOTE, version 2
Created version '2' of model 'XGB_SMOTE'.


<ModelVersion: aliases=[], creation_timestamp=1749897429051, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1749897429051, metrics=None, model_id=None, name='XGB_SMOTE', params=None, run_id='c11c7ee39b7d4ee6a6788ae37f953da0', run_link='', source='models:/m-87082015a97f4584af8f73962135236d', status='READY', status_message=None, tags={}, user_id='', version='2'>

#load model

In [19]:
import mlflow
model_version=2
mlflow.set_tracking_uri('http://127.0.0.1:5000')
model_name = 'XGB_SMOTE'
model_uri = f'models:/{model_name}/{model_version}'
load_model = mlflow.xgboost.load_model(model_uri)
y_pred = load_model.predict(X_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
model_name = 'XGB_SMOTE'
model_version=2
dev_model_uri = f"models:/{model_name}@challanger"
production_model_name = "Diabetes-prod"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=dev_model_uri, dst_name=production_model_name)


Successfully registered model 'Diabetes-prod'.
Copied version '2' of model 'XGB_SMOTE' to version '1' of model 'Diabetes-prod'.


<ModelVersion: aliases=[], creation_timestamp=1749900136946, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1749900136946, metrics=None, model_id=None, name='Diabetes-prod', params=None, run_id='c11c7ee39b7d4ee6a6788ae37f953da0', run_link='', source='models:/XGB_SMOTE/2', status='READY', status_message=None, tags={}, user_id='', version='1'>

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Prediction complete.
