# Profiling NannyML

Sample notebook for profiling the NannyML library.

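To run this notebook, JupyterLab, NannyML, and pyinstrument must be installed in your Python environment. The notebook also imports pandas, scikit-learn, and LightGBM, so those packages must be importable as well.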

In [None]:
import pandas as pd
import nannyml as nml

from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor
from pyinstrument import Profiler
from math import floor

In [None]:
# Leading part of every profiling report file name written by this notebook.
OUTPUT_PREFIX = 'Profiling_REG_'

# Dataset size knobs -- raise or lower these to grow or shrink the data set.
# N_FEATURES is the number of generated feature columns; CHUNK_SIZE is the
# chunk size handed to the NannyML calculators and also scales the row count.
N_FEATURES = 10
CHUNK_SIZE = 1000

In [None]:
# Build a synthetic regression data set, turn a few features into categoricals,
# train a LightGBM regressor on the first rows and store its predictions.
print("creating data")
RANDOM_STATE = 13

# 25 chunks worth of rows in total.
N_SAMPLES = CHUNK_SIZE * 25
N_INFORMATIVE = floor(N_FEATURES*0.95)
# NOTE: the two values below are computed but never passed to make_regression
# in this cell.
N_REDUNDANT = floor(N_FEATURES*0.03)
N_REPEATED = floor(N_FEATURES*0.01)

x, y = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=N_INFORMATIVE,
    random_state=RANDOM_STATE,
    shuffle=True,
    bias=10_000
)

# Feature columns are named f1 .. fN, one per generated column.
features_selected = [f'f{idx}' for idx in range(1, x.shape[1] + 1)]
data = pd.DataFrame(x, columns=features_selected)
data['y_true'] = y
del x, y  # the raw arrays are no longer needed once copied into `data`

print("creating model")
# Convert the last cat_n feature columns (one in seven, rounded down) into
# 5-level categoricals via pd.cut.
cat_n = len(features_selected)//7
# Slice from an explicit start index: features_selected[-cat_n:] would select
# *every* column when cat_n == 0 (i.e. fewer than 7 features).
for el in features_selected[len(features_selected) - cat_n:]:
    data[el] = pd.cut(data[el], bins=5, labels=['a', 'b', 'c', 'd', 'e'])

model = LGBMRegressor(random_state=14)
# .loc slices include both end labels, so training uses rows 0 .. 5*CHUNK_SIZE.
model.fit(
    data.loc[:5*CHUNK_SIZE, features_selected],
    data.loc[:5*CHUNK_SIZE, 'y_true']
)
# Predictions are stored for every row, including the training rows.
data['y_pred'] = model.predict(data.loc[:, features_selected])
# data.head(10)

In [None]:
# Split the rows that were not used for training into the two NannyML sets.
# .loc slices include both end labels (the model was fitted on labels
# 0 .. 5*CHUNK_SIZE), hence the "+1" offsets:
#   reference: labels 5*CHUNK_SIZE+1  .. 15*CHUNK_SIZE
#   analysis:  labels 15*CHUNK_SIZE+1 .. last row
# Starting analysis at 15*CHUNK_SIZE+1 keeps the boundary row out of both sets;
# slicing from 15*CHUNK_SIZE would duplicate the last reference row in analysis.
reference = data.loc[5*CHUNK_SIZE+1:15*CHUNK_SIZE].reset_index(drop=True)
analysis = data.loc[15*CHUNK_SIZE+1:].reset_index(drop=True)
del data  # only the two splits are needed from here on

In [None]:
# Profile nml.SummaryStatsAvgCalculator on the numeric feature columns.
# The numeric columns are picked by dtype instead of features_selected[:-cat_n],
# because that slice evaluates to an empty list when cat_n == 0.
# The selection happens before the profiler starts, so it is not measured.
numeric_features = (
    reference[features_selected].select_dtypes(include='number').columns.tolist()
)

profiler = Profiler()
profiler.start()

calc = nml.SummaryStatsAvgCalculator(
    column_names=numeric_features,
    chunk_size=CHUNK_SIZE
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_AVG.html')

In [None]:
# Profile nml.DLE restricted to the MAE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["mae"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MAE.html')

In [None]:
# Profile nml.DLE restricted to the MAPE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["mape"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MAPE.html')

In [None]:
# Profile nml.DLE restricted to the MSE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["mse"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MSE.html')

In [None]:
# Profile nml.DLE restricted to the MSLE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["msle"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MSLE.html')

In [None]:
# Profile nml.DLE restricted to the RMSE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["rmse"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_RMSE.html')

In [None]:
# Profile nml.DLE restricted to the RMSLE metric: construction, fitting on the
# reference set and estimation on the analysis set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    feature_column_names=features_selected,
    y_pred='y_pred',
    y_true='y_true',
    metrics=['rmsle'],
    chunk_size=CHUNK_SIZE,
    tune_hyperparameters=False
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report. The file name spells the metric
# "RMSLE" (it was previously misspelled "RMLSE"), matching the metric profiled
# here and the naming of the PerformanceCalculator RMSLE report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_RMSLE.html')

In [None]:
# Profile nml.DLE estimating all six regression metrics in a single run:
# construction, fitting on the reference set and estimation on the analysis
# set all run under the profiler.
profiler = Profiler()
profiler.start()

estimator = nml.DLE(
    y_true="y_true",
    y_pred="y_pred",
    feature_column_names=features_selected,
    chunk_size=CHUNK_SIZE,
    metrics=["mae", "mape", "mse", "msle", "rmse", "rmsle"],
    tune_hyperparameters=False,
)
estimator.fit(reference)
results = estimator.estimate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_ALL.html')

In [None]:
# Profile nml.PerformanceCalculator computing RMSLE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["rmsle"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_RMSLE.html')

In [None]:
# Profile nml.PerformanceCalculator computing RMSE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["rmse"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_RMSE.html')

In [None]:
# Profile nml.PerformanceCalculator computing MSLE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["msle"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MSLE.html')

In [None]:
# Profile nml.PerformanceCalculator computing MSE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["mse"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MSE.html')

In [None]:
# Profile nml.PerformanceCalculator computing MAPE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["mape"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MAPE.html')

In [None]:
# Profile nml.PerformanceCalculator computing MAE for a regression problem:
# construction, fit on reference and calculate on analysis are all profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["mae"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MAE.html')

In [None]:
# Profile nml.PerformanceCalculator computing all six regression metrics in a
# single run: construction, fit on reference and calculate on analysis are all
# profiled.
profiler = Profiler()
profiler.start()

calc = nml.PerformanceCalculator(
    problem_type="regression",
    y_true="y_true",
    y_pred="y_pred",
    chunk_size=CHUNK_SIZE,
    metrics=["mae", "mape", "mse", "msle", "rmse", "rmsle"],
)
calc.fit(reference)
results = calc.calculate(analysis)

# Stop profiling, then write the HTML report.
profiler.stop()
profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ALL.html')