In [None]:
import nannyml as nml
import pandas as pd
from IPython.display import display

In [None]:
from docs.utils import print_some_of_the_columns_only_markdown, print_table

In [None]:
# Load the US Census MA employment dataset through NannyML's loader.
# The third returned element is not needed yet, so it is bound to `_`.
df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()
# Preview the first rows of the reference and analysis frames.
display(df_reference.head(), df_analysis.head())

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,y_true,year,y_pred_proba,y_pred
0,62.0,16,1,17,1,0,1,1,4,2,1,2,2,2,2,1,0,2015,0.121211,0
1,48.0,21,1,0,2,0,1,1,4,2,1,2,2,2,2,1,0,2015,0.816033,1
2,47.0,21,1,1,2,0,1,1,4,4,1,2,2,2,1,1,0,2015,0.951815,1
3,34.0,12,5,0,2,0,1,3,4,1,1,2,2,2,1,2,0,2015,0.563825,1
4,33.0,23,5,0,2,0,5,1,4,1,2,2,2,2,1,1,1,2015,0.944436,1


Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,year,y_pred_proba,y_pred
0,46.0,21,1,0,2,0,3,1,4,2,1,2,2,2,1,1,2016,0.948828,1
1,46.0,21,1,1,2,0,1,1,4,1,1,2,2,2,2,1,2016,0.772002,1
2,12.0,9,5,2,2,1,1,1,0,2,1,2,2,2,2,1,2016,0.000149,0
3,52.0,21,3,0,2,0,1,1,4,2,1,2,2,2,2,1,2016,0.90607,1
4,21.0,18,5,2,2,0,1,1,4,2,1,2,2,2,1,1,2016,0.699663,1


In [None]:
# Print a condensed markdown table of the reference data. The rendered output
# keeps the first 2 and the last 5 columns (presumably what the `2, 5`
# arguments select -- confirm against docs.utils).
print_some_of_the_columns_only_markdown(df_reference, 2, 5)

+----+--------+--------+-------+---------+----------+--------+----------------+----------+
|    | AGEP   | SCHL   | ...   | RAC1P   | y_true   | year   | y_pred_proba   | y_pred   |
| 0  | 62     | 16     | ...   | 1       | 0        | 2015   | 0.121211       | 0        |
+----+--------+--------+-------+---------+----------+--------+----------------+----------+
| 1  | 48     | 21     | ...   | 1       | 0        | 2015   | 0.816033       | 1        |
+----+--------+--------+-------+---------+----------+--------+----------------+----------+
| 2  | 47     | 21     | ...   | 1       | 0        | 2015   | 0.951815       | 1        |
+----+--------+--------+-------+---------+----------+--------+----------------+----------+
| 3  | 34     | 12     | ...   | 2       | 0        | 2015   | 0.563825       | 1        |
+----+--------+--------+-------+---------+----------+--------+----------------+----------+
| 4  | 33     | 23     | ...   | 1       | 1        | 2015   | 0.944436       | 1        |

In [None]:
# Print a condensed markdown table of the analysis data. The rendered output
# keeps the first 2 and the last 5 columns (presumably what the `2, 5`
# arguments select -- confirm against docs.utils).
print_some_of_the_columns_only_markdown(df_analysis, 2, 5)

+----+--------+--------+-------+-------+---------+--------+----------------+----------+
|    | AGEP   | SCHL   | ...   | SEX   | RAC1P   | year   | y_pred_proba   | y_pred   |
| 0  | 46     | 21     | ...   | 1     | 1       | 2016   | 0.948828       | 1        |
+----+--------+--------+-------+-------+---------+--------+----------------+----------+
| 1  | 46     | 21     | ...   | 2     | 1       | 2016   | 0.772002       | 1        |
+----+--------+--------+-------+-------+---------+--------+----------------+----------+
| 2  | 12     | 9      | ...   | 2     | 1       | 2016   | 0.000149194    | 0        |
+----+--------+--------+-------+-------+---------+--------+----------------+----------+
| 3  | 52     | 21     | ...   | 2     | 1       | 2016   | 0.90607        | 1        |
+----+--------+--------+-------+-------+---------+--------+----------------+----------+
| 4  | 21     | 18     | ...   | 1     | 1       | 2016   | 0.699663       | 1        |
+----+--------+--------+-------+-------+---------+--------+----------------+----------+

In [None]:
# Chunk size passed to the CBPE estimator and to the drift and performance
# calculators created further down, so all results use the same chunking.
chunk_size = 5_000

In [None]:
# Configure the CBPE estimator for a binary classifier: tell it which columns
# hold the target, the predicted label and the predicted probability, and
# request ROC AUC as the metric to estimate.
estimator = nml.CBPE(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='y_true',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    chunk_size=chunk_size,
)

In [None]:
# Fit on the reference data (which includes y_true), then estimate performance
# on the analysis data, whose frame has no y_true column at this point.
estimator = estimator.fit(df_reference)
estimated_performance = estimator.estimate(df_analysis)

In [None]:
# Plot the estimated performance results and display the figure.
figure = estimated_performance.plot()
figure.show()

In [None]:
# Save the estimated-performance figure as an SVG file under ../_static.
figure.write_image('../_static/quick-start-perf-est.svg')

In [None]:
# Univariate drift calculator restricted to the AGEP and SCHL columns, using
# the same chunk size as the performance estimator.
univariate_calculator = nml.UnivariateDriftCalculator(
    chunk_size=chunk_size,
    column_names=['AGEP', 'SCHL'],
)

# Fit on the reference data, then compute drift results for the analysis data.
univariate_calculator.fit(df_reference)
univariate_drift = univariate_calculator.calculate(df_analysis)

In [None]:
# Plot the univariate drift results and display the figure.
figure = univariate_drift.plot()
figure.show()

In [None]:
# Save the drift figure as an SVG file under ../_static.
# The path has no placeholders, so a plain string literal is used (no f-prefix).
figure.write_image('../_static/quick-start-drift.svg')

In [None]:
# Compare the estimated performance with the AGEP drift results, limited to
# the analysis period, and display the combined plot.
figure = estimated_performance.compare(
    univariate_drift.filter(column_names=['AGEP'], period='analysis')
).plot()
figure.show()

In [None]:
# Save the drift-versus-performance comparison figure as an SVG file under ../_static.
# The path has no placeholders, so a plain string literal is used (no f-prefix).
figure.write_image('../_static/quick-start-drift-n-performance.svg')

In [None]:
# Plot the drift results for the analysis period as distribution plots
# (kind='distribution') and display the figure.
figure = univariate_drift.filter(period='analysis').plot(kind='distribution')
figure.show()

In [None]:
# Save the distribution figure as an SVG file under ../_static.
# The path has no placeholders, so a plain string literal is used (no f-prefix).
figure.write_image('../_static/quick-start-univariate-distribution.svg')

In [None]:
# Call the dataset loader again, this time keeping only the third returned
# element (the analysis targets) and discarding the first two.
_, _, analysis_targets = nml.load_us_census_ma_employment_data()

In [None]:
# Attach the targets to the analysis data column-wise; concatenating along
# the column axis aligns the two frames on their index.
df_analysis_with_targets = pd.concat((df_analysis, analysis_targets), axis='columns')
display(df_analysis_with_targets.head())

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,year,y_pred_proba,y_pred,y_true
0,46.0,21,1,0,2,0,3,1,4,2,1,2,2,2,1,1,2016,0.948828,1,1
1,46.0,21,1,1,2,0,1,1,4,1,1,2,2,2,2,1,2016,0.772002,1,1
2,12.0,9,5,2,2,1,1,1,0,2,1,2,2,2,2,1,2016,0.000149,0,0
3,52.0,21,3,0,2,0,1,1,4,2,1,2,2,2,2,1,2016,0.90607,1,1
4,21.0,18,5,2,2,0,1,1,4,2,1,2,2,2,1,1,2016,0.699663,1,0


In [None]:
# Print a condensed markdown table of the analysis data with targets attached.
# Only the first rows (.head()) are passed in; the rendered output keeps the
# first 2 and the last 5 columns (presumably what `2, 5` select -- confirm
# against docs.utils).
print_some_of_the_columns_only_markdown(df_analysis_with_targets.head(), 2, 5)

+----+--------+--------+-------+---------+--------+----------------+----------+----------+
|    | AGEP   | SCHL   | ...   | RAC1P   | year   | y_pred_proba   | y_pred   | y_true   |
| 0  | 46     | 21     | ...   | 1       | 2016   | 0.948828       | 1        | 1        |
+----+--------+--------+-------+---------+--------+----------------+----------+----------+
| 1  | 46     | 21     | ...   | 1       | 2016   | 0.772002       | 1        | 1        |
+----+--------+--------+-------+---------+--------+----------------+----------+----------+
| 2  | 12     | 9      | ...   | 1       | 2016   | 0.000149194    | 0        | 0        |
+----+--------+--------+-------+---------+--------+----------------+----------+----------+
| 3  | 52     | 21     | ...   | 1       | 2016   | 0.90607        | 1        | 1        |
+----+--------+--------+-------+---------+--------+----------------+----------+----------+
| 4  | 21     | 18     | ...   | 1       | 2016   | 0.699663       | 1        | 0        |

In [None]:
# Performance calculator configured with the same problem type, columns,
# metric and chunk size as the CBPE estimator.
performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='y_true',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    chunk_size=chunk_size,
)

# Fit on the reference data, then calculate performance on the analysis data
# that now carries the y_true column.
performance_calculator.fit(df_reference)
calculated_performance = performance_calculator.calculate(df_analysis_with_targets)

# Compare the analysis-period estimates with the calculated performance
# and display the combined plot.
figure = (
    estimated_performance
    .filter(period='analysis')
    .compare(calculated_performance)
    .plot()
)
figure.show()

In [None]:
# Save the estimated-versus-calculated performance figure as an SVG file under ../_static.
# The path has no placeholders, so a plain string literal is used (no f-prefix).
figure.write_image('../_static/quick-start-estimated-and-realized.svg')