# Profiling NannyML

Sample notebook for profiling the NannyML library.

To run this notebook, jupyterlab, nannyml, pyinstrument, scikit-learn and lightgbm need to be installed in your Python environment.

In [None]:
import pandas as pd
import nannyml as nml

from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from pyinstrument import Profiler
from math import floor

In [None]:
# Prefix of every profiling report written by this notebook. The report file
# names are built as f'{OUTPUT_PREFIX}_{...}', so the prefix itself must not
# end with an underscore (that produced names like "Profiling_MC__1K_...").
OUTPUT_PREFIX = "Profiling_MC"

# Change the values below to make the dataset bigger/smaller.
CHUNK_SIZE = 1000  # rows per chunk; the total sample count is derived from it
N_FEATURES = 10    # number of model input features to generate

In [None]:
# Build a synthetic 3-class dataset, train a LightGBM model on its first rows
# and store the model's predictions and class probabilities next to the data.
print("creating data")
RANDOM_STATE = 13

# Dataset shape, derived from the user-tunable CHUNK_SIZE and N_FEATURES.
N_SAMPLES = CHUNK_SIZE * 25
N_INFORMATIVE = floor(N_FEATURES*0.95)
N_REDUNDANT = floor(N_FEATURES*0.03)
N_REPEATED = floor(N_FEATURES*0.01)
N_CLASSES = 3
N_CLUSTERS_PER_CLASS = 4

x, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=N_INFORMATIVE,
    n_redundant=N_REDUNDANT,
    n_repeated=N_REPEATED,
    n_classes=N_CLASSES,
    random_state=RANDOM_STATE,
    n_clusters_per_class=N_CLUSTERS_PER_CLASS,
    shuffle=True,
    flip_y=0.05,
    class_sep=2
)

# Name the features f1..fN and attach the target column.
features_selected = ['f'+str(el+1) for el in range(0, x.shape[1])]
data = pd.DataFrame(x, columns=features_selected)
data['y_true'] = y
del x, y  # the arrays are now held by `data`

print("creating model")
# Bin the last `cat_n` features (one in seven, rounded down) into 5 labelled
# categories. The slice start is computed explicitly because
# `features_selected[-cat_n:]` selects EVERY feature when cat_n == 0
# (fewer than 7 features): `-0` is simply `0`.
cat_n = len(features_selected)//7
for el in features_selected[len(features_selected) - cat_n:]:
    data[el] = pd.cut(data[el], bins=5, labels=['a', 'b', 'c', 'd', 'e'])

# Train on the first rows only. `.loc` slicing is label based and includes the
# end label, so this uses rows 0..5*CHUNK_SIZE.
model = LGBMClassifier(random_state=14)
model.fit(
    data.loc[:5*CHUNK_SIZE, features_selected],
    data.loc[:5*CHUNK_SIZE, 'y_true']
)

# Score the whole dataset: predicted label plus one probability column per
# class (y_pred_proba_0, y_pred_proba_1, y_pred_proba_2).
data['y_pred'] = model.predict(data.loc[:, features_selected])
preds = model.predict_proba(data.loc[:, features_selected])
for class_idx in range(N_CLASSES):
    data[f'y_pred_proba_{class_idx}'] = preds[:, class_idx]

In [None]:
# Split the rows that were not used for training into a reference period and
# an analysis period. `.loc` slices include BOTH end labels, so each start is
# offset by one: without the "+1" on the analysis start, row 15*CHUNK_SIZE
# would end up in both `reference` and `analysis`.
reference = data.loc[5*CHUNK_SIZE+1:15*CHUNK_SIZE].reset_index(drop=True)
analysis = data.loc[15*CHUNK_SIZE+1:].reset_index(drop=True)
del data  # only the two splits are needed from here on

In [None]:
# Profile SummaryStatsAvgCalculator on the features that were not binned.

# Pick the non-categorical feature columns by dtype (done before the profiler
# starts so it is not part of the measurement). Slicing with
# `features_selected[:-cat_n]` breaks when cat_n == 0: `[:-0]` is `[:0]`,
# which yields an empty list instead of all features.
continuous_features = (
    reference[features_selected]
    .select_dtypes(exclude='category')
    .columns
    .tolist()
)

profiler = Profiler()
profiler.start()

calc = nml.SummaryStatsAvgCalculator(
    column_names=continuous_features,
    chunk_size=CHUNK_SIZE
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_AVG.html')

In [None]:
# Profile CBPE while it estimates accuracy only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['accuracy'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ACC.html')

In [None]:
# Profile CBPE while it estimates F1 only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['f1'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_F1.html')

In [None]:
# Profile CBPE while it estimates ROC AUC only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['roc_auc'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_AUROC.html')

In [None]:
# Profile CBPE while it estimates precision only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['precision'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_PREC.html')

In [None]:
# Profile CBPE while it estimates recall only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['recall'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_RECL.html')

In [None]:
# Profile CBPE while it estimates specificity only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['specificity'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_SPEC.html')

In [None]:
# Profile CBPE while it estimates the confusion matrix only.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['confusion_matrix'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_CM.html')

In [None]:
# Profile CBPE while it estimates every metric used in this notebook at once.
profiler = Profiler()
profiler.start()

calc = nml.CBPE(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=[
        'roc_auc',
        'f1',
        'accuracy',
        'precision',
        'recall',
        'specificity',
        'confusion_matrix',
    ],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.estimate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ALL.html')

In [None]:
# Profile PerformanceCalculator while it calculates accuracy only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['accuracy'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ACC.html')

In [None]:
# Profile PerformanceCalculator while it calculates specificity only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['specificity'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_SPEC.html')

In [None]:
# Profile PerformanceCalculator while it calculates recall only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2'},
    y_pred='y_pred',
    y_true='y_true',
    problem_type='classification_multiclass',
    metrics=['recall',],
    chunk_size=CHUNK_SIZE
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file. The recall suffix is
# "RECL", matching the CBPE recall report (it was misspelled "PECL").
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_RECL.html')

In [None]:
# Profile PerformanceCalculator while it calculates precision only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['precision'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_PREC.html')

In [None]:
# Profile PerformanceCalculator while it calculates F1 only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['f1'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_F1.html')

In [None]:
# Profile PerformanceCalculator while it calculates ROC AUC only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['roc_auc'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_AUROC.html')

In [None]:
# Profile PerformanceCalculator while it calculates the confusion matrix only.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type='classification_multiclass',
    y_true='y_true',
    y_pred='y_pred',
    # One predicted-probability column per class label.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2',
    },
    metrics=['confusion_matrix'],
    chunk_size=CHUNK_SIZE,
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_CM.html')

In [None]:
# Profile PerformanceCalculator while it calculates every metric used in this
# notebook at once.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    # The dataset is 3-class: there is no single 'y_pred_proba' column, only
    # one probability column per class label, so the binary configuration
    # that was here could not run against this data.
    y_pred_proba={
        0: 'y_pred_proba_0',
        1: 'y_pred_proba_1',
        2: 'y_pred_proba_2'},
    y_pred='y_pred',
    y_true='y_true',
    problem_type='classification_multiclass',
    metrics=['roc_auc', 'f1', 'accuracy', 'precision', 'recall', 'specificity', 'confusion_matrix',],
    chunk_size=CHUNK_SIZE
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling and save the report as an HTML file.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ALL.html')