# Diabetes Prediction ML Project
## End-to-End Machine Learning Project for MLOps

This notebook contains the complete workflow for diabetes prediction using multiple ML algorithms and MLflow tracking.

In [None]:
# Lib & Dependencies
import gc
import os

import mlflow
import numpy as np
import pandas as pd
import scipy as sp
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, BayesianRidge, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, auc
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# Check datasets: list the files available in the data directory.
# Uses os.listdir instead of the `!ls` shell escape so the cell also runs on
# platforms where `ls` is not available (e.g. Windows).
sorted(os.listdir("../datasets"))

In [None]:
# Load data
# The path is named once so the read call stays short; pandas infers the zip
# compression from the file extension.
DATASET_PATH = "../datasets/diabetes_binary_5050split_health_indicators_BRFSS2015.csv.zip"
df = pd.read_csv(DATASET_PATH)
print(f"Data shape: {df.shape}")

In [None]:
# Display first few rows
# Left as the bare last expression of the cell so the notebook renders the
# preview as a table (head() returns the first 5 rows by default).
df.head()

In [None]:
# Data info
# Prints a summary of `df`: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Prepare features and target
# Normalise the label column name in case the loaded file calls it
# "Diabetes_012"; rename() ignores missing keys, so this is a no-op otherwise.
# Rebinding `df` (instead of inplace=True) keeps the cell idempotent on re-run
# and avoids mutating the frame that earlier cells displayed.
df = df.rename(columns={"Diabetes_012": "Diabetes_binary"})

# Feature matrix: every column except the label.
train = df.drop(columns="Diabetes_binary")
# Label vector.
targets = df["Diabetes_binary"]

print(f"Training data shape: {train.shape}")
print(f"Target shape: {targets.shape}")

## Experiment Setup

In [None]:
# MLflow configuration
# Point the AWS SDK at the profile that has access to the artifact store.
os.environ["AWS_PROFILE"] = "your_aws_profile"  # Change this to your AWS profile

# Tracking server location; the server is expected on port 5000 of this host.
TRACKING_SERVER_HOST = "localhost"  # Change to your tracking server host
tracking_uri = f"http://{TRACKING_SERVER_HOST}:5000"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("diabetes_experiment_main")

# Echo the URI MLflow actually resolved, as a quick sanity check.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [None]:
# Model configurations
# Seed shared by the CV splitter and by every estimator below. It is defined
# before `models` so the estimators can use it: without a seed RandomForest
# gives different results on every run, which makes the RANDOM_STATE value
# logged to MLflow misleading.
RANDOM_STATE = 1111

# Candidate classifiers, keyed by the short name passed to `kfold_training`.
models = {
    "rf": RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    "lgbm": LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    "xgb": XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    "cb": CatBoostClassifier(silent=True, random_seed=RANDOM_STATE),
}

## Model Training Function

In [None]:
def kfold_training(df, target_name, num_folds, model, debug=False, params=None):
    """
    K-fold cross validation training with MLflow tracking.

    Fits a fresh, unfitted copy of ``models[model]`` on every fold, collects
    the out-of-fold (OOF) class predictions and logs parameters and metrics to
    a new MLflow run.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature columns and the target column.
    target_name : str
        Name of the target column in ``df``.
    num_folds : int
        Number of KFold splits.
    model : str
        Key into the notebook-level ``models`` dict.
    debug : bool, default False
        Currently unused; kept so existing calls keep working.
    params : dict, optional
        Extra estimator parameters applied with ``set_params`` on top of the
        configured estimator. ``None`` or an empty dict leaves it unchanged.

    Returns
    -------
    float
        F1 score computed over all out-of-fold predictions.

    Raises
    ------
    KeyError
        If ``model`` is not a key of ``models``.
    """
    # Validate before opening an MLflow run so a typo in the model key does
    # not leave a failed run behind on the tracking server.
    if model not in models:
        raise KeyError(f"Unknown model '{model}'. Available: {sorted(models)}")

    # Work on a copy so the shared instance stored in `models` is never
    # mutated by set_params() or fit().
    estimator = clone(models[model])
    if params:
        estimator.set_params(**params)

    with mlflow.start_run():
        FOLDS = num_folds

        mlflow.set_tag("Developer", "MLOps_Student")
        mlflow.log_param("Train Data", "datasets/diabetes_binary_5050split_health_indicators_BRFSS2015.csv.zip")
        mlflow.log_param("RANDOM_STATE", RANDOM_STATE)
        mlflow.log_param("FOLDS", FOLDS)
        mlflow.log_param("Models", str(estimator))
        if params:
            # Prefixed so caller-supplied names cannot collide with the run-level params above.
            mlflow.log_params({f"model_param.{key}": value for key, value in params.items()})

        train_df = df.drop(target_name, axis=1)
        target = df.loc[:, target_name]

        print("Starting Training. Train shape: {}".format(train_df.shape))
        gc.collect()

        # Cross validation
        folds = KFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
        oof_preds = np.zeros(train_df.shape[0])

        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, target)):
            train_x, train_y = train_df.iloc[train_idx], target.iloc[train_idx]
            valid_x, valid_y = train_df.iloc[valid_idx], target.iloc[valid_idx]

            # Train a fresh copy per fold so no fitted state carries over.
            clf = clone(estimator)
            clf.fit(train_x, train_y)

            # Predict; ravel() guards against estimators that return an (n, 1) column.
            y_pred = np.asarray(clf.predict(valid_x)).ravel()
            oof_preds[valid_idx] = y_pred

            f_score = f1_score(valid_y, y_pred)
            accuracy = accuracy_score(valid_y, y_pred)

            # Per-fold metrics use the fold number as the MLflow step.
            mlflow.log_metric("fold_f1_score", f_score, step=n_fold + 1)
            mlflow.log_metric("fold_accuracy", accuracy, step=n_fold + 1)

            print(f'---------> Fold {n_fold + 1} {f_score}')

            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()

        # Calculate final scores over the complete set of OOF predictions
        score = f1_score(target, oof_preds)
        full_accuracy_score = accuracy_score(target, oof_preds)

        mlflow.log_metric("f1_score", score)
        mlflow.log_metric("accuracy", full_accuracy_score)
        print('Full F1 Score score %.8f' % score)

        return score

## Model Training and Comparison

In [None]:
# Train Random Forest
# 3-fold CV run; the returned out-of-fold F1 score is shown as the cell output.
kfold_training(
    df=df,
    target_name='Diabetes_binary',
    num_folds=3,
    model="rf",
)

In [None]:
# Train LightGBM
# 3-fold CV run; the returned out-of-fold F1 score is shown as the cell output.
kfold_training(
    df=df,
    target_name='Diabetes_binary',
    num_folds=3,
    model="lgbm",
)

In [None]:
# Train CatBoost
# 3-fold CV run; the returned out-of-fold F1 score is shown as the cell output.
kfold_training(
    df=df,
    target_name='Diabetes_binary',
    num_folds=3,
    model="cb",
)

## Hyperparameter Optimization

In [None]:
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Seed for the KFold splitter inside `objective`; also logged with every run.
# Same value as the RANDOM_STATE set in the model-configuration cell.
RANDOM_STATE = 1111
# Number of cross-validation folds used by `objective`.
FOLDS = 5

In [None]:
def objective(params):
    """
    Hyperparameter optimization objective function.

    Trains a LightGBM binary classifier with ``params`` using K-fold cross
    validation on the notebook-level ``df``, logs parameters and metrics to a
    new MLflow run, and returns the negated out-of-fold F1 score so that a
    minimiser (hyperopt ``fmin``) maximises F1.

    Parameters
    ----------
    params : dict
        LightGBM training parameters, e.g. one sample from ``search_space``.
        An ``objective`` entry, if present, overrides the ``"binary"`` default.

    Returns
    -------
    dict
        ``{'loss': -f1_score, 'status': STATUS_OK}``.
    """
    # lgb.train falls back to a regression objective when none is given; the
    # target here is binary, so request a binary classifier unless the caller
    # explicitly chose something else.
    train_params = {"objective": "binary", **params}

    with mlflow.start_run():
        mlflow.set_tag("Developer", "MLOps_Student")
        mlflow.log_param("Train Data", "datasets/diabetes_binary_5050split_health_indicators_BRFSS2015.csv.zip")
        mlflow.log_param("RANDOM_STATE", RANDOM_STATE)
        mlflow.log_param("FOLDS", FOLDS)
        mlflow.log_params(train_params)
        mlflow.log_param("Models", "LightGBM")

        train_df = df.drop('Diabetes_binary', axis=1)
        target = df.loc[:, 'Diabetes_binary']

        print("Starting Training. Train shape: {}".format(train_df.shape))
        gc.collect()

        folds = KFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
        oof_preds = np.zeros(train_df.shape[0])

        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, target)):
            train_x, train_y = train_df.iloc[train_idx], target.iloc[train_idx]
            valid_x, valid_y = train_df.iloc[valid_idx], target.iloc[valid_idx]

            # Named dtrain/dvalid so they are not confused with the
            # notebook-level `train` feature frame.
            dtrain = lgb.Dataset(train_x, label=train_y)
            # reference= makes the validation set reuse the training set's feature binning.
            dvalid = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

            # Early stopping and log silencing are passed as callbacks: the
            # `early_stopping_rounds` / `verbose_eval` keyword arguments were
            # removed from lgb.train in LightGBM 4.
            booster = lgb.train(
                params=dict(train_params),
                train_set=dtrain,
                num_boost_round=1000,
                valid_sets=[dvalid],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=50, verbose=False),
                    lgb.log_evaluation(period=0),
                ],
            )

            # Probabilities of the positive class, thresholded into 0/1 labels.
            # For values in [0, 1] `> 0.5` matches np.round (which rounds 0.5 down to 0).
            y_prob = booster.predict(valid_x, raw_score=False)
            y_pred = (y_prob > 0.5).astype(int)
            oof_preds[valid_idx] = y_pred

            f_score = f1_score(valid_y, y_pred)
            print(f'---------> Fold {n_fold + 1} {f_score}')

            del booster, dtrain, dvalid, train_x, train_y, valid_x, valid_y
            gc.collect()

        score = f1_score(target, oof_preds)
        full_accuracy_score = accuracy_score(target, oof_preds)

        mlflow.log_metric("f1_score", score)
        mlflow.log_metric("accuracy", full_accuracy_score)
        print('Full F1 Score score %.8f' % score)

        # hyperopt minimises the loss, so negate the score to maximise F1.
        return {'loss': -score, 'status': STATUS_OK}

In [None]:
# Define search space for hyperparameter optimization
search_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    # quniform samples floats; scope.int casts the value to an int.
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    'min_child_weight': hp.uniform('min_child_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'seed': 42
}

# The search is expensive (every evaluation trains FOLDS boosters), so it is
# opt-in: set RUN_HYPEROPT = True to run it. An explicit flag replaces the
# previously commented-out code, so "Run All" behaves the same by default.
RUN_HYPEROPT = False

if RUN_HYPEROPT:
    trials = Trials()
    best = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=trials)
    print(f"Best parameters: {best}")

## Best Model Training and Registration

In [None]:
# Train final model with best parameters
best_params = {
    # lgb.train falls back to a regression objective when none is given; the
    # target is binary, so request a binary classifier explicitly.
    'objective': 'binary',
    'learning_rate': 0.10552983694225122,
    # NOTE(review): 89 lies outside the 3-15 range sampled by `search_space` — confirm this value.
    'max_depth': 89,
    'min_child_weight': 1.704681566723118,
    'reg_alpha': 0.010202520050703611,
    'reg_lambda': 0.046206444839271325,
    'seed': 42
}

with mlflow.start_run():
    mlflow.set_tag("Developer", "MLOps_Student")
    mlflow.log_param("Train Data", "datasets/diabetes_binary_5050split_health_indicators_BRFSS2015.csv.zip")
    mlflow.log_param("RANDOM_STATE", RANDOM_STATE)
    mlflow.log_param("FOLDS", 5)
    mlflow.log_params(best_params)
    mlflow.log_param("Models", "LightGBM_Best")

    # Train final model on all rows (`train` / `targets` come from the
    # feature-preparation cell); no validation set is held out here, so the
    # booster runs the full 1000 rounds.
    train_data = lgb.Dataset(train, label=targets)
    final_model = lgb.train(
        params=best_params,
        train_set=train_data,
        num_boost_round=1000
    )

    # Log model as an artifact of this run
    mlflow.lightgbm.log_model(final_model, artifact_path="models")
    print(f"Model artifacts URI: '{mlflow.get_artifact_uri()}'")
    print("Model training completed and logged to MLflow!")