In [None]:
import os
import time
from datetime import date, datetime, timedelta
from typing import Any, Dict, List, Tuple

import lightgbm as lgb
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import pyspark.sql.types as spark_types
import seaborn as sns
from dateutil.relativedelta import relativedelta
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.tracking.client import MlflowClient
from sklearn.metrics import (
    accuracy_score,
    brier_score_loss,
    f1_score,
    fbeta_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)

In [0]:

# Point MLflow's model registry at the "databricks-uc" URI so that `models:/...`
# lookups later in the notebook are resolved against that registry.
mlflow.set_registry_uri("databricks-uc")

In [0]:
# Run configuration: experiment location, run/model names, label column and threshold.
experiment_name = "/cancer_classification_prediction"
# The timestamp suffix distinguishes runs that were started at different times.
run_name = "cancer_classification_prediction_" + pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
full_model_name = "cancer_classification_model"

LABEL = "Target"  # name of the ground-truth column in the prediction dataset
y_pred_threshold = 0.5  # model outputs >= this value are labelled 1, otherwise 0

In [0]:
# Staging containers filled by the push_* helpers defined below.
mlflow_metrics = {} # metric name -> metric value
mlflow_figures = {} # figure file name -> matplotlib/seaborn figure
mlflow_tables = {} # table name -> table value
mlflow_artifact_paths = [] # file paths registered via push_artifact

def push_metric(metric_name: str, metric_value: Any) -> None:
    """Store a metric in the module-level ``mlflow_metrics`` dict.

    Args:
        metric_name: Key to store the value under. Pushing the same name
            again overwrites the previously stored value.
        metric_value: Value to record for this metric.
    """
    print(f"Adding metric {metric_name}: {metric_value}")
    mlflow_metrics[metric_name] = metric_value

def push_figure(figure_name: str, figure_value: Any) -> None:
    """Store a figure in the module-level ``mlflow_figures`` dict.

    Args:
        figure_name: Key to store the figure under (the notebook uses a file
            name such as ``"kde_plot.png"``). Reusing a name overwrites the
            previously stored figure.
        figure_value: Figure object to keep for later logging.
    """
    print(f"Adding figure {figure_name}: {figure_value}")
    mlflow_figures[figure_name] = figure_value

def push_table(table_name: str, table_value: Any) -> None:
    """Store a table in the module-level ``mlflow_tables`` dict.

    Args:
        table_name: Key to store the table under. Reusing a name overwrites
            the previously stored table.
        table_value: Table content to keep for later logging.
    """
    print(f"Adding table {table_name}: {table_value}")
    mlflow_tables[table_name] = table_value

def push_artifact(file_path: str) -> None:
    """Register a file path in the module-level ``mlflow_artifact_paths`` list.

    A path that is already registered is not added a second time, so
    re-running the cell that calls this helper does not queue duplicates.

    Args:
        file_path: Path of the file to register.
    """
    print(f"Adding artifact {file_path}")
    if file_path not in mlflow_artifact_paths:
        mlflow_artifact_paths.append(file_path)

In [0]:
# Select the registry and the experiment, then open a run. The run is not closed in
# this cell: later cells check for an active run before logging and end it explicitly.
mlflow.set_registry_uri("databricks-uc")
mlflow.set_experiment(experiment_name)
# Keep the returned handle in `run`; the final cell uses it to report and close the run.
run = mlflow.start_run(run_name=run_name, description=f"Creating predictions for {experiment_name} using model {full_model_name}")
print(f"creating run called {run.info.run_name}")

In [0]:
# Load the dataset to score from a CSV file (path is relative to the working directory).
preddf = pd.read_csv('../data/pred.csv')

In [0]:
# Show summary statistics of the loaded frame as a quick sanity check.
preddf.describe()

In [0]:

# Registered model version to score with; change this to load a different version.
model_version = 4

# Build the URI from the shared `full_model_name` constant so that the model named in
# the run description is the one that is actually loaded.
lgbm_model = mlflow.lightgbm.load_model(
    model_uri=f"models:/{full_model_name}/{model_version}"
)

In [0]:
# Display the loaded model object to confirm what was retrieved.
lgbm_model

In [None]:


# Split the frame into ground truth and features. In a real-world scoring job the
# prediction dataset would not contain the target column.
Y = preddf[LABEL]
# A later cell writes "Prediction" / "Prediction_raw" back onto `preddf`; exclude them
# here (if present) so re-running this cell does not feed predictions in as features.
X = preddf.drop(columns=[LABEL, "Prediction", "Prediction_raw"], errors="ignore")

# Generate predictions.
# NOTE(review): the following cells treat `y_pred` as positive-class probabilities; that
# holds only if this model's `predict` returns probabilities — confirm for the loaded model.
y_pred = lgbm_model.predict(X)
print(y_pred)

In [0]:
# Stage the mean of the raw model outputs as a metric.
push_metric("Average raw probability", y_pred.mean())

In [None]:
# Turn the raw outputs into hard 0/1 labels: 1 where the value is at or above the threshold.
y_pred_abs = (y_pred >= y_pred_threshold).astype(int)

Probability KDE Plot

In [None]:
# One density curve per true class, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
for label in np.unique(Y):
    subset = y_pred[Y == label]
    sns.kdeplot(subset, label=f"Target={label}", fill=True, ax=ax)
ax.set_title("Kernel Density Estimate (KDE) of Predicted Probabilities by True Class")
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("Density")
ax.legend()
fig.tight_layout()
plt.show()


In [0]:
# Draw on a dedicated figure so the image staged for MLflow contains only this plot,
# regardless of what an earlier cell may have left on the current pyplot figure.
fig, ax = plt.subplots()
sns.kdeplot(y_pred, fill=True, ax=ax)
ax.set_xlim(0, 1)
ax.set_title("KDE plot")
ax.set_xlabel("Predicted Probability")
# Stage the figure object itself (not plt.gcf()) before it is shown.
push_figure("kde_plot.png", fig)
plt.show()

Calculate Metrics 

In [0]:
def get_model_metrics(
    y_test: pd.Series, y_pred_probablity: np.ndarray, y_pred_label: np.ndarray
) -> dict:
    """Compute evaluation metrics for a set of classification predictions.

    Args:
        y_test: True labels.
        y_pred_probablity: Raw predicted scores; used for the score-based
            metrics (log loss, ROC AUC) and for ``y_pred_mean``.
        y_pred_label: Hard class predictions; used for accuracy, F1, F0.5,
            precision and recall.

    Returns:
        Dict with the keys ``logloss``, ``accuracy``, ``f1_score``,
        ``f05_score``, ``precision_score``, ``recall_score``,
        ``roc_auc_score``, ``y_real_mean`` and ``y_pred_mean``.
    """
    # zero_division=0 returns the same 0.0 scikit-learn falls back to by default when a
    # score is undefined (e.g. no positive predictions), but without emitting a warning.
    return {
        "logloss": log_loss(y_test, y_pred_probablity),
        "accuracy": accuracy_score(y_test, y_pred_label),
        "f1_score": f1_score(y_test, y_pred_label, zero_division=0),
        "f05_score": fbeta_score(y_test, y_pred_label, beta=0.5, zero_division=0),
        "precision_score": precision_score(y_test, y_pred_label, zero_division=0),
        "recall_score": recall_score(y_test, y_pred_label, zero_division=0),
        "roc_auc_score": roc_auc_score(y_test, y_pred_probablity),
        "y_real_mean": y_test.mean(),
        "y_pred_mean": y_pred_probablity.mean(),
    }

In [0]:
# Evaluate the predictions against the known labels.
metrics = get_model_metrics(
    y_test=Y,
    y_pred_probablity=y_pred,
    y_pred_label=y_pred_abs,
)

In [0]:
# Stage every computed metric under its own name.
for metric_name, metric_value in metrics.items():
    push_metric(metric_name, metric_value)

Save Predictions 


In [0]:
# Attach the thresholded labels (the same array the metrics were computed on) and the
# raw model output to the frame, write it to disk and register the file as an artifact.
predictions_path = "../data/predictions.csv"
preddf['Prediction'] = y_pred_abs
preddf['Prediction_raw'] = y_pred
preddf.to_csv(predictions_path, index=False)
push_artifact(predictions_path)

In [0]:
# Preview only the first rows (now including the prediction columns) instead of
# rendering the whole frame in the notebook output.
preddf.head()

Log artefacts in MLflow

In [0]:

# Refuse to log when no run is active, so results never end up in an implicit run.
if mlflow.active_run() is None:
    raise RuntimeError("The run hasn't been started, guessing you don't want a partially complete experiment")

# Write everything staged by the push_* helpers to the active run.
for metric_name, metric_value in mlflow_metrics.items():
    print(f"logging other metric: {metric_name}={metric_value}")
    mlflow.log_metric(metric_name, metric_value)

for figure_name, figure in mlflow_figures.items():
    print(f"logging figure: {figure_name}")
    mlflow.log_figure(figure, figure_name)

for table_name, table_df in mlflow_tables.items():
    print(f"logging table: {table_name}")
    mlflow.log_table(table_df, table_name)

# Upload the files registered via push_artifact; dict.fromkeys drops repeated paths
# while preserving order, so a file registered twice is uploaded once.
for artifact_path in dict.fromkeys(mlflow_artifact_paths):
    print(f"logging artifact: {artifact_path}")
    mlflow.log_artifact(artifact_path)

# Record the scored frame as a dataset input of the run.
dataset: PandasDataset = mlflow.data.from_pandas(preddf)
mlflow.log_input(dataset, context="prediction")

In [0]:
# Close the run opened earlier. `run` is reset to None at the end, so executing this
# cell a second time skips the body.
if run:
  mlflow.end_run()
  print(f"run complete for: {run.info.run_name}")
  # Disable LightGBM autologging once the run has been closed.
  mlflow.lightgbm.autolog(disable=True)
  run = None