In [0]:
import pandas as pd
import numpy as np
import mlflow

from mlflow.models.signature import infer_signature#
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    fbeta_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
    brier_score_loss,
)
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [0]:
experiment_name = f"/cancer_classification_training"
run_name = "cancer_classification_training" + str(pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"))
full_model_name = "cancer_classification_model"

LABEL = "Target"

In [0]:
# defining these up front to make mlflow experiments optional
mlflow_metrics = {} # a place to pop new metrics into
mlflow_figures = {} # a place to pop new matplotlib/sns figures into
mlflow_tables = {} # a place to pop new dataframes into (keep it small!)
mlflow_artifact_paths = []

In [0]:
def push_metric(metric_name: str, metric_value: any) -> None:
    print(f"Adding metric {metric_name}: {metric_value}")
    mlflow_metrics[metric_name] = metric_value

def push_figure(figure_name: str, figure_value: any):
    print(f"Adding figure {figure_name}: {figure_value}")
    mlflow_figures[figure_name] = figure_value

def push_table(table_name: str, table_value: any):
    print(f"Adding table {table_name}: {table_value}")
    mlflow_tables[table_name] = table_value

def push_artifact(file_path: str):
    print(f"Adding artifact {file_path}")
    mlflow_artifact_paths.append(file_path)

In [0]:
#Get clean data

def get_clean_data(filename: str) -> pd.DataFrame:
    # Load the dataset
    data = pd.read_csv(filename)
    return data


def get_train_data(data: pd.DataFrame):
    """Get training data from the cleaned dataset."""
    # Split the data into features and Target
    X = data
    y = data['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    train_ds, val_ds = train_test_split(
        X_train, test_size=0.1, random_state=20000, shuffle=True
    )
    X_train = X_train.drop(columns=['Target'])
    X_val = val_ds.drop(columns=['Target'])
    X_test = X_test.drop(columns=['Target'])
    y_val = val_ds['Target']
     


    return X_train, X_test, y_train, y_test, X_val, y_val

In [0]:
# Define the model metrics

def calculate_metrics(y_true, y_pred, y_proba):
    """Calculate model metrics."""
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    logloss = log_loss(y_true, y_proba)

    return accuracy, precision, recall, f1, logloss

In [0]:
def get_model_metrics(
    model: lgb.Booster, X_test: pd.DataFrame, y_test: pd.Series
) -> dict:
    y_pred_probablity = model.predict(X_test)
    pred_threshold = np.percentile(y_pred_probablity, 60)
    y_pred_label = np.where(y_pred_probablity >= pred_threshold, 1, 0)

    metrics = {
        "logloss": log_loss(y_test, y_pred_probablity),
        "pred_threshold": pred_threshold,
        "accuracy": accuracy_score(y_test, y_pred_label),        
        "f1_score": f1_score(y_test, y_pred_label),
        "f05_score": fbeta_score(y_test, y_pred_label, beta=0.5),
        "precision_score": precision_score(y_test, y_pred_label),
        "recall_score": recall_score(y_test, y_pred_label),
        "roc_auc_score": roc_auc_score(y_test, y_pred_probablity),
        "y_real_mean": y_test.mean(),
        "y_pred_mean": y_pred_probablity.mean(),
        "iterations": model.current_iteration(),
    }
    return metrics

In [0]:
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment(experiment_name)
run = mlflow.start_run(run_name=run_name, description=f"Building a model for {experiment_name}")
print(f"creating run called {run.info.run_name}")

2025/06/17 20:55:39 INFO mlflow.tracking.fluent: Experiment with name '/Cancer-Classification' does not exist. Creating a new experiment.


creating run called Cancer Classification2025-06-17 20:55:39


In [0]:
clean_df = get_clean_data('../data/clean.csv')
push_metric("count_initial", clean_df.shape[0])

Adding metric count_initial: 104


In [0]:
X_train, X_test, Y_train, Y_test, X_val, Y_val = get_train_data(clean_df)


In [0]:
unsupported_cols = X_train.select_dtypes(exclude=['int', 'float', 'bool']).columns
print("Columns with unsupported data types:", unsupported_cols)

Columns with unsupported data types: Index([], dtype='object')


In [0]:
evals_result = {}

In [0]:
def eval_logloss(y_hat, data):
    y_true = data.get_label()
    return "logloss", log_loss(y_true, y_hat), False

In [0]:

# Define constants
PRIMARY_METRIC = "logloss"  # or "f1"
ESTIMATORS = 800
EARLY_STOP = 100
LEARNING_RATE = 0.045
CATEGORICAL_FEATURES = []
# Prepare datasets
# Assuming X_train, y_train, X_valid, y_valid are predefined
lgb_train_final = lgb.Dataset(
    X_train,
    label=Y_train,
    feature_name=list(X_train.columns),
    categorical_feature=CATEGORICAL_FEATURES
)

lgb_valid = lgb.Dataset(X_val, label=Y_val, reference=lgb_train_final)

# lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train_final)

# Define parameters
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": PRIMARY_METRIC,
    "num_threads": 16,
    "learning_rate": LEARNING_RATE,
}

# Define callbacks
callbacks = [
    lgb.early_stopping(stopping_rounds=EARLY_STOP),
    lgb.log_evaluation(period=10),  # Adjust the period as needed
]

final_model = lgb.train(
    params=params,
    train_set=lgb_train_final,
    valid_sets=lgb_valid, 
    num_boost_round=ESTIMATORS,
    feval=eval_logloss,
    callbacks=callbacks
)

print("Final model built")

[LightGBM] [Info] Number of positive: 47, number of negative: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.566265 -> initscore=0.266629
[LightGBM] [Info] Start training from score 0.266629
Training until validation scores don't improve for 100 rounds
[10]	valid_0's logloss: 0.50628
[20]	valid_0's logloss: 0.440901
[30]	valid_0's logloss: 0.410473
[40]	valid_0's logloss: 0.367146
[50]	valid_0's logloss: 0.330369
[60]	valid_0's logloss: 0.305375
[70]	valid_0's logloss: 0.290073
[80]	valid_0's logloss: 0.269987
[90]	valid_0's logloss: 0.250853
[100]	valid_0's logloss: 0.249603
[110]	valid_0's logloss: 0.234867
[120]	valid_0's logloss: 0.2

In [0]:
metrics = get_model_metrics(
    final_model, X_test, Y_test)


In [0]:
for (metric_name, metric_value) in metrics.items():
    push_metric(metric_name, metric_value)

In [0]:
print(f"registering model as: {full_model_name}")
autolog_run = mlflow.last_active_run()
model_uri = "runs:/{}/models/final_model".format(autolog_run.info.run_id)

# create a signature for the model so that mlflow knows the inputs and outputs (we need this because we're not using mlflow.autolog)
X_train_sample = pd.DataFrame(X_train[:10], columns=X_train.columns)
signature = infer_signature(X_train_sample, final_model.predict(X_train_sample))

mlflow.lightgbm.log_model(
    final_model,
    registered_model_name=full_model_name,
    signature=signature
)
# this simply logs metadata about the dataset (not the data itself)
mlflow.log_input(mlflow.data.from_pandas(X_test), context="test")
mlflow.log_input(mlflow.data.from_pandas(X_train), context="train")

registering model as: cancer_classification_model


🔗 View Logged Model at: https://dbc-42b11fb1-1420.cloud.databricks.com/ml/experiments/3220490393092728/models/m-2f397d186c9f405ab518f234aa055b8a?o=2465733523529779
Successfully registered model 'workspace.default.cancer_classification_model'.


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🔗 Created version '1' of model 'workspace.default.cancer_classification_model': https://dbc-42b11fb1-1420.cloud.databricks.com/explore/data/models/workspace/default/cancer_classification_model/version/1?o=2465733523529779
Registered model 'cancer_classification_model' already exists. Creating a new version of this model...


[0;31m---------------------------------------------------------------------------[0m
[0;31mMlflowException[0m                           Traceback (most recent call last)
File [0;32m<command-3220490393092644>, line 19[0m
[1;32m     16[0m mlflow[38;5;241m.[39mlog_input(mlflow[38;5;241m.[39mdata[38;5;241m.[39mfrom_pandas(X_train), context[38;5;241m=[39m[38;5;124m"[39m[38;5;124mtrain[39m[38;5;124m"[39m)
[1;32m     18[0m mlflow[38;5;241m.[39mset_registry_uri([38;5;124m"[39m[38;5;124mdatabricks-uc[39m[38;5;124m"[39m)
[0;32m---> 19[0m registered_model [38;5;241m=[39m mlflow[38;5;241m.[39mregister_model(model_uri, full_model_name)

File [0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-38938598-ac0c-49d9-9190-7966baa38c77/lib/python3.10/site-packages/mlflow/tracking/_model_registry/fluent.py:127[0m, in [0;36mregister_model[0;34m(model_uri, name, await_registration_for, tags, env_pack)[0m
[1;32m     59[0m [38;5;28;01mdef[39;00m [38;5;21mregister_mo

In [0]:
if mlflow.active_run() == None:
    raise Exception("A DrAI run hasn't been started, guessing you don't want a partially complete experiment")

# ADD other metrics
for (metric_name, metric_value) in mlflow_metrics.items():
    print(f"logging other metric: {metric_name}=metric_value")
    mlflow.log_metric(metric_name, metric_value)

for (figure_name, figure) in mlflow_figures.items():
    print(f"logging figure: {figure_name}")
    mlflow.log_figure(figure, figure_name)

for (table_name, table_df) in mlflow_tables.items():
    print(f"logging table: {table_name}")
    mlflow.log_table(table_df, table_name)

In [0]:
mlflow.end_run()